├── Data
    ├── Gene.gtf
    ├── TE.bed
    ├── test.bam
    └── test.exclusive.idx
├── LICENSE
├── README.md
├── bin
    ├── scTE
    ├── scTEATAC
    ├── scTEATAC_build
    └── scTE_build
├── docs
    └── scTE.png
├── example
    ├── Figure3
    │   ├── 0.cluster_scripts
    │   │   ├── scte
    │   │   │   ├── do_batch.sh
    │   │   │   └── scte.sh
    │   │   └── starsolo
    │   │   │   ├── do_batch.sh
    │   │   │   └── starsolo.sh
    │   ├── 1.pack.py
    │   ├── 2.norm_and_learn.py
    │   ├── 3.diffexp.py
    │   ├── 4.plots-allgenes.py
    │   ├── 4.plots-alltes.py
    │   ├── 4.plots-specific-tes.py
    │   ├── 5.marker_genes-leiden-0.2.py
    │   ├── 5.marker_genes-small-grp_cut.py
    │   ├── 5.marker_genes-small.py
    │   ├── 5.marker_genes.py
    │   └── TE_genes_id.mm10.txt.gz
    ├── Figure4.ipynb
    └── Figure6.ipynb
├── scTE
    ├── __init__.py
    ├── annotation.py
    ├── base.py
    ├── miniglbase
    │   ├── README.md
    │   ├── __init__.py
    │   ├── base_genelist.py
    │   ├── config.py
    │   ├── genelist.py
    │   ├── location.py
    │   └── utils.py
    └── scatacseq.py
├── setup.py
└── test.sh


/Data/Gene.gtf:
--------------------------------------------------------------------------------
  1 | ##description: evidence-based annotation of the mouse genome (GRCm38), version M21 (Ensembl 96)
  2 | ##provider: GENCODE
  3 | ##contact: gencode-help@ebi.ac.uk
  4 | ##format: gtf
  5 | ##date: 2019-03-27
  6 | chr1	HAVANA	gene	3073253	3074322	.	+	.	gene_id "ENSMUSG00000102693.1"; gene_type "TEC"; gene_name "4933401J01Rik"; level 2; havana_gene "OTTMUSG00000049935.1";
  7 | chr1	HAVANA	transcript	3073253	3074322	.	+	.	gene_id "ENSMUSG00000102693.1"; transcript_id "ENSMUST00000193812.1"; gene_type "TEC"; gene_name "4933401J01Rik"; transcript_type "TEC"; transcript_name "4933401J01Rik-201"; level 2; transcript_support_level "NA"; tag "basic"; havana_gene "OTTMUSG00000049935.1"; havana_transcript "OTTMUST00000127109.1";
  8 | chr1	HAVANA	exon	3073253	3074322	.	+	.	gene_id "ENSMUSG00000102693.1"; transcript_id "ENSMUST00000193812.1"; gene_type "TEC"; gene_name "4933401J01Rik"; transcript_type "TEC"; transcript_name "4933401J01Rik-201"; exon_number 1; exon_id "ENSMUSE00001343744.1"; level 2; transcript_support_level "NA"; tag "basic"; havana_gene "OTTMUSG00000049935.1"; havana_transcript "OTTMUST00000127109.1";
  9 | chr1	ENSEMBL	gene	3102016	3102125	.	+	.	gene_id "ENSMUSG00000064842.1"; gene_type "snRNA"; gene_name "Gm26206"; level 3;
 10 | chr1	ENSEMBL	transcript	3102016	3102125	.	+	.	gene_id "ENSMUSG00000064842.1"; transcript_id "ENSMUST00000082908.1"; gene_type "snRNA"; gene_name "Gm26206"; transcript_type "snRNA"; transcript_name "Gm26206-201"; level 3; transcript_support_level "NA"; tag "basic";
 11 | chr1	ENSEMBL	exon	3102016	3102125	.	+	.	gene_id "ENSMUSG00000064842.1"; transcript_id "ENSMUST00000082908.1"; gene_type "snRNA"; gene_name "Gm26206"; transcript_type "snRNA"; transcript_name "Gm26206-201"; exon_number 1; exon_id "ENSMUSE00000522066.1"; level 3; transcript_support_level "NA"; tag "basic";
 12 | chr1	HAVANA	gene	3205901	3671498	.	-	.	gene_id "ENSMUSG00000051951.5"; gene_type "protein_coding"; gene_name "Xkr4"; level 2; havana_gene "OTTMUSG00000026353.2";
 13 | chr1	HAVANA	transcript	3205901	3216344	.	-	.	gene_id "ENSMUSG00000051951.5"; transcript_id "ENSMUST00000162897.1"; gene_type "protein_coding"; gene_name "Xkr4"; transcript_type "processed_transcript"; transcript_name "Xkr4-203"; level 2; transcript_support_level "1"; havana_gene "OTTMUSG00000026353.2"; havana_transcript "OTTMUST00000086625.1";
 14 | chr1	HAVANA	exon	3213609	3216344	.	-	.	gene_id "ENSMUSG00000051951.5"; transcript_id "ENSMUST00000162897.1"; gene_type "protein_coding"; gene_name "Xkr4"; transcript_type "processed_transcript"; transcript_name "Xkr4-203"; exon_number 1; exon_id "ENSMUSE00000858910.1"; level 2; transcript_support_level "1"; havana_gene "OTTMUSG00000026353.2"; havana_transcript "OTTMUST00000086625.1";
 15 | chr1	HAVANA	exon	3205901	3207317	.	-	.	gene_id "ENSMUSG00000051951.5"; transcript_id "ENSMUST00000162897.1"; gene_type "protein_coding"; gene_name "Xkr4"; transcript_type "processed_transcript"; transcript_name "Xkr4-203"; exon_number 2; exon_id "ENSMUSE00000866652.1"; level 2; transcript_support_level "1"; havana_gene "OTTMUSG00000026353.2"; havana_transcript "OTTMUST00000086625.1";
 16 | chr1	HAVANA	transcript	3206523	3215632	.	-	.	gene_id "ENSMUSG00000051951.5"; transcript_id "ENSMUST00000159265.1"; gene_type "protein_coding"; gene_name "Xkr4"; transcript_type "processed_transcript"; transcript_name "Xkr4-202"; level 2; transcript_support_level "1"; havana_gene "OTTMUSG00000026353.2"; havana_transcript "OTTMUST00000086624.1";
 17 | chr1	HAVANA	exon	3213439	3215632	.	-	.	gene_id "ENSMUSG00000051951.5"; transcript_id "ENSMUST00000159265.1"; gene_type "protein_coding"; gene_name "Xkr4"; transcript_type "processed_transcript"; transcript_name "Xkr4-202"; exon_number 1; exon_id "ENSMUSE00000863980.1"; level 2; transcript_support_level "1"; havana_gene "OTTMUSG00000026353.2"; havana_transcript "OTTMUST00000086624.1";
 18 | chr1	HAVANA	exon	3206523	3207317	.	-	.	gene_id "ENSMUSG00000051951.5"; transcript_id "ENSMUST00000159265.1"; gene_type "protein_coding"; gene_name "Xkr4"; transcript_type "processed_transcript"; transcript_name "Xkr4-202"; exon_number 2; exon_id "ENSMUSE00000867897.1"; level 2; transcript_support_level "1"; havana_gene "OTTMUSG00000026353.2"; havana_transcript "OTTMUST00000086624.1";
 19 | chr1	HAVANA	transcript	3214482	3671498	.	-	.	gene_id "ENSMUSG00000051951.5"; transcript_id "ENSMUST00000070533.4"; gene_type "protein_coding"; gene_name "Xkr4"; transcript_type "protein_coding"; transcript_name "Xkr4-201"; level 2; protein_id "ENSMUSP00000070648.4"; transcript_support_level "1"; tag "basic"; tag "appris_principal_1"; tag "CCDS"; ccdsid "CCDS14803.1"; havana_gene "OTTMUSG00000026353.2"; havana_transcript "OTTMUST00000065166.1";
 20 | chr1	HAVANA	exon	3670552	3671498	.	-	.	gene_id "ENSMUSG00000051951.5"; transcript_id "ENSMUST00000070533.4"; gene_type "protein_coding"; gene_name "Xkr4"; transcript_type "protein_coding"; transcript_name "Xkr4-201"; exon_number 1; exon_id "ENSMUSE00000485541.3"; level 2; protein_id "ENSMUSP00000070648.4"; transcript_support_level "1"; tag "basic"; tag "appris_principal_1"; tag "CCDS"; ccdsid "CCDS14803.1"; havana_gene "OTTMUSG00000026353.2"; havana_transcript "OTTMUST00000065166.1";
 21 | chr1	HAVANA	CDS	3670552	3671348	.	-	0	gene_id "ENSMUSG00000051951.5"; transcript_id "ENSMUST00000070533.4"; gene_type "protein_coding"; gene_name "Xkr4"; transcript_type "protein_coding"; transcript_name "Xkr4-201"; exon_number 1; exon_id "ENSMUSE00000485541.3"; level 2; protein_id "ENSMUSP00000070648.4"; transcript_support_level "1"; tag "basic"; tag "appris_principal_1"; tag "CCDS"; ccdsid "CCDS14803.1"; havana_gene "OTTMUSG00000026353.2"; havana_transcript "OTTMUST00000065166.1";
 22 | chr1	HAVANA	start_codon	3671346	3671348	.	-	0	gene_id "ENSMUSG00000051951.5"; transcript_id "ENSMUST00000070533.4"; gene_type "protein_coding"; gene_name "Xkr4"; transcript_type "protein_coding"; transcript_name "Xkr4-201"; exon_number 1; exon_id "ENSMUSE00000485541.3"; level 2; protein_id "ENSMUSP00000070648.4"; transcript_support_level "1"; tag "basic"; tag "appris_principal_1"; tag "CCDS"; ccdsid "CCDS14803.1"; havana_gene "OTTMUSG00000026353.2"; havana_transcript "OTTMUST00000065166.1";
 23 | chr1	HAVANA	exon	3421702	3421901	.	-	.	gene_id "ENSMUSG00000051951.5"; transcript_id "ENSMUST00000070533.4"; gene_type "protein_coding"; gene_name "Xkr4"; transcript_type "protein_coding"; transcript_name "Xkr4-201"; exon_number 2; exon_id "ENSMUSE00000449517.3"; level 2; protein_id "ENSMUSP00000070648.4"; transcript_support_level "1"; tag "basic"; tag "appris_principal_1"; tag "CCDS"; ccdsid "CCDS14803.1"; havana_gene "OTTMUSG00000026353.2"; havana_transcript "OTTMUST00000065166.1";
 24 | chr1	HAVANA	CDS	3421702	3421901	.	-	1	gene_id "ENSMUSG00000051951.5"; transcript_id "ENSMUST00000070533.4"; gene_type "protein_coding"; gene_name "Xkr4"; transcript_type "protein_coding"; transcript_name "Xkr4-201"; exon_number 2; exon_id "ENSMUSE00000449517.3"; level 2; protein_id "ENSMUSP00000070648.4"; transcript_support_level "1"; tag "basic"; tag "appris_principal_1"; tag "CCDS"; ccdsid "CCDS14803.1"; havana_gene "OTTMUSG00000026353.2"; havana_transcript "OTTMUST00000065166.1";
 25 | chr1	HAVANA	exon	3214482	3216968	.	-	.	gene_id "ENSMUSG00000051951.5"; transcript_id "ENSMUST00000070533.4"; gene_type "protein_coding"; gene_name "Xkr4"; transcript_type "protein_coding"; transcript_name "Xkr4-201"; exon_number 3; exon_id "ENSMUSE00000448840.2"; level 2; protein_id "ENSMUSP00000070648.4"; transcript_support_level "1"; tag "basic"; tag "appris_principal_1"; tag "CCDS"; ccdsid "CCDS14803.1"; havana_gene "OTTMUSG00000026353.2"; havana_transcript "OTTMUST00000065166.1";
 26 | chr1	HAVANA	CDS	3216025	3216968	.	-	2	gene_id "ENSMUSG00000051951.5"; transcript_id "ENSMUST00000070533.4"; gene_type "protein_coding"; gene_name "Xkr4"; transcript_type "protein_coding"; transcript_name "Xkr4-201"; exon_number 3; exon_id "ENSMUSE00000448840.2"; level 2; protein_id "ENSMUSP00000070648.4"; transcript_support_level "1"; tag "basic"; tag "appris_principal_1"; tag "CCDS"; ccdsid "CCDS14803.1"; havana_gene "OTTMUSG00000026353.2"; havana_transcript "OTTMUST00000065166.1";
 27 | chr1	HAVANA	stop_codon	3216022	3216024	.	-	0	gene_id "ENSMUSG00000051951.5"; transcript_id "ENSMUST00000070533.4"; gene_type "protein_coding"; gene_name "Xkr4"; transcript_type "protein_coding"; transcript_name "Xkr4-201"; exon_number 3; exon_id "ENSMUSE00000448840.2"; level 2; protein_id "ENSMUSP00000070648.4"; transcript_support_level "1"; tag "basic"; tag "appris_principal_1"; tag "CCDS"; ccdsid "CCDS14803.1"; havana_gene "OTTMUSG00000026353.2"; havana_transcript "OTTMUST00000065166.1";
 28 | chr1	HAVANA	UTR	3671349	3671498	.	-	.	gene_id "ENSMUSG00000051951.5"; transcript_id "ENSMUST00000070533.4"; gene_type "protein_coding"; gene_name "Xkr4"; transcript_type "protein_coding"; transcript_name "Xkr4-201"; exon_number 1; exon_id "ENSMUSE00000485541.3"; level 2; protein_id "ENSMUSP00000070648.4"; transcript_support_level "1"; tag "basic"; tag "appris_principal_1"; tag "CCDS"; ccdsid "CCDS14803.1"; havana_gene "OTTMUSG00000026353.2"; havana_transcript "OTTMUST00000065166.1";
 29 | chr1	HAVANA	UTR	3214482	3216024	.	-	.	gene_id "ENSMUSG00000051951.5"; transcript_id "ENSMUST00000070533.4"; gene_type "protein_coding"; gene_name "Xkr4"; transcript_type "protein_coding"; transcript_name "Xkr4-201"; exon_number 3; exon_id "ENSMUSE00000448840.2"; level 2; protein_id "ENSMUSP00000070648.4"; transcript_support_level "1"; tag "basic"; tag "appris_principal_1"; tag "CCDS"; ccdsid "CCDS14803.1"; havana_gene "OTTMUSG00000026353.2"; havana_transcript "OTTMUST00000065166.1";
 30 | chr1	HAVANA	gene	3252757	3253236	.	+	.	gene_id "ENSMUSG00000102851.1"; gene_type "processed_pseudogene"; gene_name "Gm18956"; level 1; tag "pseudo_consens"; havana_gene "OTTMUSG00000049958.1";
 31 | chr1	HAVANA	transcript	3252757	3253236	.	+	.	gene_id "ENSMUSG00000102851.1"; transcript_id "ENSMUST00000192857.1"; gene_type "processed_pseudogene"; gene_name "Gm18956"; transcript_type "processed_pseudogene"; transcript_name "Gm18956-201"; level 1; transcript_support_level "NA"; ont "PGO:0000004"; tag "pseudo_consens"; tag "basic"; havana_gene "OTTMUSG00000049958.1"; havana_transcript "OTTMUST00000127143.1";
 32 | chr1	HAVANA	exon	3252757	3253236	.	+	.	gene_id "ENSMUSG00000102851.1"; transcript_id "ENSMUST00000192857.1"; gene_type "processed_pseudogene"; gene_name "Gm18956"; transcript_type "processed_pseudogene"; transcript_name "Gm18956-201"; exon_number 1; exon_id "ENSMUSE00001339323.1"; level 1; transcript_support_level "NA"; ont "PGO:0000004"; tag "pseudo_consens"; tag "basic"; havana_gene "OTTMUSG00000049958.1"; havana_transcript "OTTMUST00000127143.1";
 33 | chr1	HAVANA	gene	3365731	3368549	.	-	.	gene_id "ENSMUSG00000103377.1"; gene_type "TEC"; gene_name "Gm37180"; level 2; havana_gene "OTTMUSG00000049960.1";
 34 | chr1	HAVANA	transcript	3365731	3368549	.	-	.	gene_id "ENSMUSG00000103377.1"; transcript_id "ENSMUST00000195335.1"; gene_type "TEC"; gene_name "Gm37180"; transcript_type "TEC"; transcript_name "Gm37180-201"; level 2; transcript_support_level "NA"; tag "basic"; havana_gene "OTTMUSG00000049960.1"; havana_transcript "OTTMUST00000127145.1";
 35 | chr1	HAVANA	exon	3365731	3368549	.	-	.	gene_id "ENSMUSG00000103377.1"; transcript_id "ENSMUST00000195335.1"; gene_type "TEC"; gene_name "Gm37180"; transcript_type "TEC"; transcript_name "Gm37180-201"; exon_number 1; exon_id "ENSMUSE00001343189.1"; level 2; transcript_support_level "NA"; tag "basic"; havana_gene "OTTMUSG00000049960.1"; havana_transcript "OTTMUST00000127145.1";
 36 | chr1	HAVANA	gene	3375556	3377788	.	-	.	gene_id "ENSMUSG00000104017.1"; gene_type "TEC"; gene_name "Gm37363"; level 2; havana_gene "OTTMUSG00000049961.1";
 37 | chr1	HAVANA	transcript	3375556	3377788	.	-	.	gene_id "ENSMUSG00000104017.1"; transcript_id "ENSMUST00000192336.1"; gene_type "TEC"; gene_name "Gm37363"; transcript_type "TEC"; transcript_name "Gm37363-201"; level 2; transcript_support_level "NA"; tag "basic"; havana_gene "OTTMUSG00000049961.1"; havana_transcript "OTTMUST00000127146.1";
 38 | chr1	HAVANA	exon	3375556	3377788	.	-	.	gene_id "ENSMUSG00000104017.1"; transcript_id "ENSMUST00000192336.1"; gene_type "TEC"; gene_name "Gm37363"; transcript_type "TEC"; transcript_name "Gm37363-201"; exon_number 1; exon_id "ENSMUSE00001343686.1"; level 2; transcript_support_level "NA"; tag "basic"; havana_gene "OTTMUSG00000049961.1"; havana_transcript "OTTMUST00000127146.1";
 39 | chr1	HAVANA	gene	3464977	3467285	.	-	.	gene_id "ENSMUSG00000103025.1"; gene_type "TEC"; gene_name "Gm37686"; level 2; havana_gene "OTTMUSG00000049930.1";
 40 | chr1	HAVANA	transcript	3464977	3467285	.	-	.	gene_id "ENSMUSG00000103025.1"; transcript_id "ENSMUST00000194099.1"; gene_type "TEC"; gene_name "Gm37686"; transcript_type "TEC"; transcript_name "Gm37686-201"; level 2; transcript_support_level "NA"; tag "basic"; havana_gene "OTTMUSG00000049930.1"; havana_transcript "OTTMUST00000127101.1";
 41 | chr1	HAVANA	exon	3464977	3467285	.	-	.	gene_id "ENSMUSG00000103025.1"; transcript_id "ENSMUST00000194099.1"; gene_type "TEC"; gene_name "Gm37686"; transcript_type "TEC"; transcript_name "Gm37686-201"; exon_number 1; exon_id "ENSMUSE00001337180.1"; level 2; transcript_support_level "NA"; tag "basic"; havana_gene "OTTMUSG00000049930.1"; havana_transcript "OTTMUST00000127101.1";
 42 | chr1	HAVANA	gene	3466587	3513553	.	+	.	gene_id "ENSMUSG00000089699.1"; gene_type "antisense"; gene_name "Gm1992"; level 2; havana_gene "OTTMUSG00000026352.1";
 43 | chr1	HAVANA	transcript	3466587	3513553	.	+	.	gene_id "ENSMUSG00000089699.1"; transcript_id "ENSMUST00000161581.1"; gene_type "antisense"; gene_name "Gm1992"; transcript_type "antisense"; transcript_name "Gm1992-201"; level 2; transcript_support_level "3"; tag "basic"; havana_gene "OTTMUSG00000026352.1"; havana_transcript "OTTMUST00000065165.1";
 44 | chr1	HAVANA	exon	3466587	3466687	.	+	.	gene_id "ENSMUSG00000089699.1"; transcript_id "ENSMUST00000161581.1"; gene_type "antisense"; gene_name "Gm1992"; transcript_type "antisense"; transcript_name "Gm1992-201"; exon_number 1; exon_id "ENSMUSE00000869502.1"; level 2; transcript_support_level "3"; tag "basic"; havana_gene "OTTMUSG00000026352.1"; havana_transcript "OTTMUST00000065165.1";
 45 | chr1	HAVANA	exon	3513405	3513553	.	+	.	gene_id "ENSMUSG00000089699.1"; transcript_id "ENSMUST00000161581.1"; gene_type "antisense"; gene_name "Gm1992"; transcript_type "antisense"; transcript_name "Gm1992-201"; exon_number 2; exon_id "ENSMUSE00000864479.1"; level 2; transcript_support_level "3"; tag "basic"; havana_gene "OTTMUSG00000026352.1"; havana_transcript "OTTMUST00000065165.1";
 46 | chr1	HAVANA	gene	3512451	3514507	.	-	.	gene_id "ENSMUSG00000103201.1"; gene_type "TEC"; gene_name "Gm37329"; level 2; havana_gene "OTTMUSG00000049929.1";
 47 | chr1	HAVANA	transcript	3512451	3514507	.	-	.	gene_id "ENSMUSG00000103201.1"; transcript_id "ENSMUST00000192973.1"; gene_type "TEC"; gene_name "Gm37329"; transcript_type "TEC"; transcript_name "Gm37329-201"; level 2; transcript_support_level "NA"; tag "basic"; havana_gene "OTTMUSG00000049929.1"; havana_transcript "OTTMUST00000127100.1";
 48 | chr1	HAVANA	exon	3512451	3514507	.	-	.	gene_id "ENSMUSG00000103201.1"; transcript_id "ENSMUST00000192973.1"; gene_type "TEC"; gene_name "Gm37329"; transcript_type "TEC"; transcript_name "Gm37329-201"; exon_number 1; exon_id "ENSMUSE00001345667.1"; level 2; transcript_support_level "NA"; tag "basic"; havana_gene "OTTMUSG00000049929.1"; havana_transcript "OTTMUST00000127100.1";
 49 | chr1	HAVANA	gene	3531795	3532720	.	+	.	gene_id "ENSMUSG00000103147.1"; gene_type "processed_pseudogene"; gene_name "Gm7341"; level 1; tag "pseudo_consens"; havana_gene "OTTMUSG00000049921.1";
 50 | chr1	HAVANA	transcript	3531795	3532720	.	+	.	gene_id "ENSMUSG00000103147.1"; transcript_id "ENSMUST00000192183.1"; gene_type "processed_pseudogene"; gene_name "Gm7341"; transcript_type "processed_pseudogene"; transcript_name "Gm7341-201"; level 1; transcript_support_level "NA"; ont "PGO:0000004"; tag "pseudo_consens"; tag "basic"; havana_gene "OTTMUSG00000049921.1"; havana_transcript "OTTMUST00000127089.1";
 51 | chr1	HAVANA	exon	3531795	3532720	.	+	.	gene_id "ENSMUSG00000103147.1"; transcript_id "ENSMUST00000192183.1"; gene_type "processed_pseudogene"; gene_name "Gm7341"; transcript_type "processed_pseudogene"; transcript_name "Gm7341-201"; exon_number 1; exon_id "ENSMUSE00001343235.1"; level 1; transcript_support_level "NA"; ont "PGO:0000004"; tag "pseudo_consens"; tag "basic"; havana_gene "OTTMUSG00000049921.1"; havana_transcript "OTTMUST00000127089.1";
 52 | chr1	HAVANA	gene	3592892	3595903	.	-	.	gene_id "ENSMUSG00000103161.1"; gene_type "TEC"; gene_name "Gm38148"; level 2; havana_gene "OTTMUSG00000049927.1";
 53 | chr1	HAVANA	transcript	3592892	3595903	.	-	.	gene_id "ENSMUSG00000103161.1"; transcript_id "ENSMUST00000195166.1"; gene_type "TEC"; gene_name "Gm38148"; transcript_type "TEC"; transcript_name "Gm38148-201"; level 2; transcript_support_level "NA"; tag "basic"; havana_gene "OTTMUSG00000049927.1"; havana_transcript "OTTMUST00000127098.1";
 54 | chr1	HAVANA	exon	3592892	3595903	.	-	.	gene_id "ENSMUSG00000103161.1"; transcript_id "ENSMUST00000195166.1"; gene_type "TEC"; gene_name "Gm38148"; transcript_type "TEC"; transcript_name "Gm38148-201"; exon_number 1; exon_id "ENSMUSE00001343966.1"; level 2; transcript_support_level "NA"; tag "basic"; havana_gene "OTTMUSG00000049927.1"; havana_transcript "OTTMUST00000127098.1";
 55 | chr1	HAVANA	gene	3647309	3658904	.	-	.	gene_id "ENSMUSG00000102331.1"; gene_type "sense_intronic"; gene_name "Gm19938"; level 2; havana_gene "OTTMUSG00000049924.1";
 56 | chr1	HAVANA	transcript	3647309	3658904	.	-	.	gene_id "ENSMUSG00000102331.1"; transcript_id "ENSMUST00000192692.1"; gene_type "sense_intronic"; gene_name "Gm19938"; transcript_type "sense_intronic"; transcript_name "Gm19938-201"; level 2; transcript_support_level "1"; tag "basic"; havana_gene "OTTMUSG00000049924.1"; havana_transcript "OTTMUST00000127092.1";
 57 | chr1	HAVANA	exon	3658847	3658904	.	-	.	gene_id "ENSMUSG00000102331.1"; transcript_id "ENSMUST00000192692.1"; gene_type "sense_intronic"; gene_name "Gm19938"; transcript_type "sense_intronic"; transcript_name "Gm19938-201"; exon_number 1; exon_id "ENSMUSE00001337496.1"; level 2; transcript_support_level "1"; tag "basic"; havana_gene "OTTMUSG00000049924.1"; havana_transcript "OTTMUST00000127092.1";
 58 | chr1	HAVANA	exon	3647309	3650509	.	-	.	gene_id "ENSMUSG00000102331.1"; transcript_id "ENSMUST00000192692.1"; gene_type "sense_intronic"; gene_name "Gm19938"; transcript_type "sense_intronic"; transcript_name "Gm19938-201"; exon_number 2; exon_id "ENSMUSE00001339227.1"; level 2; transcript_support_level "1"; tag "basic"; havana_gene "OTTMUSG00000049924.1"; havana_transcript "OTTMUST00000127092.1";
 59 | chr1	HAVANA	gene	3680155	3681788	.	+	.	gene_id "ENSMUSG00000102348.1"; gene_type "TEC"; gene_name "Gm10568"; level 2; havana_gene "OTTMUSG00000049922.1";
 60 | chr1	HAVANA	transcript	3680155	3681788	.	+	.	gene_id "ENSMUSG00000102348.1"; transcript_id "ENSMUST00000193244.1"; gene_type "TEC"; gene_name "Gm10568"; transcript_type "TEC"; transcript_name "Gm10568-201"; level 2; transcript_support_level "NA"; tag "basic"; havana_gene "OTTMUSG00000049922.1"; havana_transcript "OTTMUST00000127090.1";
 61 | chr1	HAVANA	exon	3680155	3681788	.	+	.	gene_id "ENSMUSG00000102348.1"; transcript_id "ENSMUST00000193244.1"; gene_type "TEC"; gene_name "Gm10568"; transcript_type "TEC"; transcript_name "Gm10568-201"; exon_number 1; exon_id "ENSMUSE00001341983.1"; level 2; transcript_support_level "NA"; tag "basic"; havana_gene "OTTMUSG00000049922.1"; havana_transcript "OTTMUST00000127090.1";
 62 | chr1	HAVANA	gene	3752010	3754360	.	+	.	gene_id "ENSMUSG00000102592.1"; gene_type "TEC"; gene_name "Gm38385"; level 2; havana_gene "OTTMUSG00000049923.1";
 63 | chr1	HAVANA	transcript	3752010	3754360	.	+	.	gene_id "ENSMUSG00000102592.1"; transcript_id "ENSMUST00000194454.1"; gene_type "TEC"; gene_name "Gm38385"; transcript_type "TEC"; transcript_name "Gm38385-201"; level 2; transcript_support_level "NA"; tag "basic"; havana_gene "OTTMUSG00000049923.1"; havana_transcript "OTTMUST00000127091.1";
 64 | chr1	HAVANA	exon	3752010	3754360	.	+	.	gene_id "ENSMUSG00000102592.1"; transcript_id "ENSMUST00000194454.1"; gene_type "TEC"; gene_name "Gm38385"; transcript_type "TEC"; transcript_name "Gm38385-201"; exon_number 1; exon_id "ENSMUSE00001342074.1"; level 2; transcript_support_level "NA"; tag "basic"; havana_gene "OTTMUSG00000049923.1"; havana_transcript "OTTMUST00000127091.1";
 65 | chr1	ENSEMBL	gene	3783876	3783933	.	-	.	gene_id "ENSMUSG00000088333.2"; gene_type "snRNA"; gene_name "Gm27396"; level 3;
 66 | chr1	ENSEMBL	transcript	3783876	3783933	.	-	.	gene_id "ENSMUSG00000088333.2"; transcript_id "ENSMUST00000157708.2"; gene_type "snRNA"; gene_name "Gm27396"; transcript_type "snRNA"; transcript_name "Gm27396-201"; level 3; transcript_support_level "NA"; tag "basic";
 67 | chr1	ENSEMBL	exon	3783876	3783933	.	-	.	gene_id "ENSMUSG00000088333.2"; transcript_id "ENSMUST00000157708.2"; gene_type "snRNA"; gene_name "Gm27396"; transcript_type "snRNA"; transcript_name "Gm27396-201"; exon_number 1; exon_id "ENSMUSE00000846843.2"; level 3; transcript_support_level "NA"; tag "basic";
 68 | chr1	HAVANA	gene	3905739	3986215	.	-	.	gene_id "ENSMUSG00000102343.1"; gene_type "lincRNA"; gene_name "Gm37381"; level 2; havana_gene "OTTMUSG00000049934.1";
 69 | chr1	HAVANA	transcript	3905739	3986215	.	-	.	gene_id "ENSMUSG00000102343.1"; transcript_id "ENSMUST00000194643.1"; gene_type "lincRNA"; gene_name "Gm37381"; transcript_type "lincRNA"; transcript_name "Gm37381-202"; level 2; transcript_support_level "3"; tag "basic"; havana_gene "OTTMUSG00000049934.1"; havana_transcript "OTTMUST00000127107.1";
 70 | chr1	HAVANA	exon	3986147	3986215	.	-	.	gene_id "ENSMUSG00000102343.1"; transcript_id "ENSMUST00000194643.1"; gene_type "lincRNA"; gene_name "Gm37381"; transcript_type "lincRNA"; transcript_name "Gm37381-202"; exon_number 1; exon_id "ENSMUSE00001344134.1"; level 2; transcript_support_level "3"; tag "basic"; havana_gene "OTTMUSG00000049934.1"; havana_transcript "OTTMUST00000127107.1";
 71 | chr1	HAVANA	exon	3985160	3985351	.	-	.	gene_id "ENSMUSG00000102343.1"; transcript_id "ENSMUST00000194643.1"; gene_type "lincRNA"; gene_name "Gm37381"; transcript_type "lincRNA"; transcript_name "Gm37381-202"; exon_number 2; exon_id "ENSMUSE00001337703.1"; level 2; transcript_support_level "3"; tag "basic"; havana_gene "OTTMUSG00000049934.1"; havana_transcript "OTTMUST00000127107.1";
 72 | chr1	HAVANA	exon	3905739	3906134	.	-	.	gene_id "ENSMUSG00000102343.1"; transcript_id "ENSMUST00000194643.1"; gene_type "lincRNA"; gene_name "Gm37381"; transcript_type "lincRNA"; transcript_name "Gm37381-202"; exon_number 3; exon_id "ENSMUSE00001345637.1"; level 2; transcript_support_level "3"; tag "basic"; havana_gene "OTTMUSG00000049934.1"; havana_transcript "OTTMUST00000127107.1";
 73 | chr1	HAVANA	transcript	3984225	3985984	.	-	.	gene_id "ENSMUSG00000102343.1"; transcript_id "ENSMUST00000192427.1"; gene_type "lincRNA"; gene_name "Gm37381"; transcript_type "lincRNA"; transcript_name "Gm37381-201"; level 2; transcript_support_level "1"; tag "basic"; havana_gene "OTTMUSG00000049934.1"; havana_transcript "OTTMUST00000127108.1";
 74 | chr1	HAVANA	exon	3985160	3985984	.	-	.	gene_id "ENSMUSG00000102343.1"; transcript_id "ENSMUST00000192427.1"; gene_type "lincRNA"; gene_name "Gm37381"; transcript_type "lincRNA"; transcript_name "Gm37381-201"; exon_number 1; exon_id "ENSMUSE00001340315.1"; level 2; transcript_support_level "1"; tag "basic"; havana_gene "OTTMUSG00000049934.1"; havana_transcript "OTTMUST00000127108.1";
 75 | chr1	HAVANA	exon	3984225	3984298	.	-	.	gene_id "ENSMUSG00000102343.1"; transcript_id "ENSMUST00000192427.1"; gene_type "lincRNA"; gene_name "Gm37381"; transcript_type "lincRNA"; transcript_name "Gm37381-201"; exon_number 2; exon_id "ENSMUSE00001340468.1"; level 2; transcript_support_level "1"; tag "basic"; havana_gene "OTTMUSG00000049934.1"; havana_transcript "OTTMUST00000127108.1";
 76 | chr1	HAVANA	gene	3999557	4409241	.	-	.	gene_id "ENSMUSG00000025900.12"; gene_type "protein_coding"; gene_name "Rp1"; level 2; tag "overlapping_locus"; havana_gene "OTTMUSG00000049985.3";
 77 | chr1	HAVANA	transcript	3999557	4409241	.	-	.	gene_id "ENSMUSG00000025900.12"; transcript_id "ENSMUST00000208660.1"; gene_type "protein_coding"; gene_name "Rp1"; transcript_type "protein_coding"; transcript_name "Rp1-203"; level 2; protein_id "ENSMUSP00000146439.1"; transcript_support_level "5"; tag "RNA_Seq_supported_only"; tag "basic"; tag "appris_principal_1"; havana_gene "OTTMUSG00000049985.3"; havana_transcript "OTTMUST00000145515.1";
 78 | chr1	HAVANA	exon	4409170	4409241	.	-	.	gene_id "ENSMUSG00000025900.12"; transcript_id "ENSMUST00000208660.1"; gene_type "protein_coding"; gene_name "Rp1"; transcript_type "protein_coding"; transcript_name "Rp1-203"; exon_number 1; exon_id "ENSMUSE00001378580.1"; level 2; protein_id "ENSMUSP00000146439.1"; transcript_support_level "5"; tag "RNA_Seq_supported_only"; tag "basic"; tag "appris_principal_1"; havana_gene "OTTMUSG00000049985.3"; havana_transcript "OTTMUST00000145515.1";
 79 | chr1	HAVANA	CDS	4409170	4409187	.	-	0	gene_id "ENSMUSG00000025900.12"; transcript_id "ENSMUST00000208660.1"; gene_type "protein_coding"; gene_name "Rp1"; transcript_type "protein_coding"; transcript_name "Rp1-203"; exon_number 1; exon_id "ENSMUSE00001378580.1"; level 2; protein_id "ENSMUSP00000146439.1"; transcript_support_level "5"; tag "RNA_Seq_supported_only"; tag "basic"; tag "appris_principal_1"; havana_gene "OTTMUSG00000049985.3"; havana_transcript "OTTMUST00000145515.1";
 80 | chr1	HAVANA	start_codon	4409185	4409187	.	-	0	gene_id "ENSMUSG00000025900.12"; transcript_id "ENSMUST00000208660.1"; gene_type "protein_coding"; gene_name "Rp1"; transcript_type "protein_coding"; transcript_name "Rp1-203"; exon_number 1; exon_id "ENSMUSE00001378580.1"; level 2; protein_id "ENSMUSP00000146439.1"; transcript_support_level "5"; tag "RNA_Seq_supported_only"; tag "basic"; tag "appris_principal_1"; havana_gene "OTTMUSG00000049985.3"; havana_transcript "OTTMUST00000145515.1";
 81 | chr1	HAVANA	exon	4352202	4352837	.	-	.	gene_id "ENSMUSG00000025900.12"; transcript_id "ENSMUST00000208660.1"; gene_type "protein_coding"; gene_name "Rp1"; transcript_type "protein_coding"; transcript_name "Rp1-203"; exon_number 2; exon_id "ENSMUSE00001403780.1"; level 2; protein_id "ENSMUSP00000146439.1"; transcript_support_level "5"; tag "RNA_Seq_supported_only"; tag "basic"; tag "appris_principal_1"; havana_gene "OTTMUSG00000049985.3"; havana_transcript "OTTMUST00000145515.1";
 82 | chr1	HAVANA	CDS	4352202	4352837	.	-	0	gene_id "ENSMUSG00000025900.12"; transcript_id "ENSMUST00000208660.1"; gene_type "protein_coding"; gene_name "Rp1"; transcript_type "protein_coding"; transcript_name "Rp1-203"; exon_number 2; exon_id "ENSMUSE00001403780.1"; level 2; protein_id "ENSMUSP00000146439.1"; transcript_support_level "5"; tag "RNA_Seq_supported_only"; tag "basic"; tag "appris_principal_1"; havana_gene "OTTMUSG00000049985.3"; havana_transcript "OTTMUST00000145515.1";
 83 | chr1	HAVANA	exon	4351910	4352081	.	-	.	gene_id "ENSMUSG00000025900.12"; transcript_id "ENSMUST00000208660.1"; gene_type "protein_coding"; gene_name "Rp1"; transcript_type "protein_coding"; transcript_name "Rp1-203"; exon_number 3; exon_id "ENSMUSE00001396015.1"; level 2; protein_id "ENSMUSP00000146439.1"; transcript_support_level "5"; tag "RNA_Seq_supported_only"; tag "basic"; tag "appris_principal_1"; havana_gene "OTTMUSG00000049985.3"; havana_transcript "OTTMUST00000145515.1";
 84 | chr1	HAVANA	CDS	4351910	4352081	.	-	0	gene_id "ENSMUSG00000025900.12"; transcript_id "ENSMUST00000208660.1"; gene_type "protein_coding"; gene_name "Rp1"; transcript_type "protein_coding"; transcript_name "Rp1-203"; exon_number 3; exon_id "ENSMUSE00001396015.1"; level 2; protein_id "ENSMUSP00000146439.1"; transcript_support_level "5"; tag "RNA_Seq_supported_only"; tag "basic"; tag "appris_principal_1"; havana_gene "OTTMUSG00000049985.3"; havana_transcript "OTTMUST00000145515.1";
 85 | chr1	HAVANA	exon	4311270	4311433	.	-	.	gene_id "ENSMUSG00000025900.12"; transcript_id "ENSMUST00000208660.1"; gene_type "protein_coding"; gene_name "Rp1"; transcript_type "protein_coding"; transcript_name "Rp1-203"; exon_number 4; exon_id "ENSMUSE00001380053.1"; level 2; protein_id "ENSMUSP00000146439.1"; transcript_support_level "5"; tag "RNA_Seq_supported_only"; tag "basic"; tag "appris_principal_1"; havana_gene "OTTMUSG00000049985.3"; havana_transcript "OTTMUST00000145515.1";
 86 | chr1	HAVANA	CDS	4311270	4311433	.	-	2	gene_id "ENSMUSG00000025900.12"; transcript_id "ENSMUST00000208660.1"; gene_type "protein_coding"; gene_name "Rp1"; transcript_type "protein_coding"; transcript_name "Rp1-203"; exon_number 4; exon_id "ENSMUSE00001380053.1"; level 2; protein_id "ENSMUSP00000146439.1"; transcript_support_level "5"; tag "RNA_Seq_supported_only"; tag "basic"; tag "appris_principal_1"; havana_gene "OTTMUSG00000049985.3"; havana_transcript "OTTMUST00000145515.1";
 87 | chr1	HAVANA	exon	4292926	4293012	.	-	.	gene_id "ENSMUSG00000025900.12"; transcript_id "ENSMUST00000208660.1"; gene_type "protein_coding"; gene_name "Rp1"; transcript_type "protein_coding"; transcript_name "Rp1-203"; exon_number 5; exon_id "ENSMUSE00001377871.1"; level 2; protein_id "ENSMUSP00000146439.1"; transcript_support_level "5"; tag "RNA_Seq_supported_only"; tag "basic"; tag "appris_principal_1"; havana_gene "OTTMUSG00000049985.3"; havana_transcript "OTTMUST00000145515.1";
 88 | chr1	HAVANA	CDS	4292926	4293012	.	-	0	gene_id "ENSMUSG00000025900.12"; transcript_id "ENSMUST00000208660.1"; gene_type "protein_coding"; gene_name "Rp1"; transcript_type "protein_coding"; transcript_name "Rp1-203"; exon_number 5; exon_id "ENSMUSE00001377871.1"; level 2; protein_id "ENSMUSP00000146439.1"; transcript_support_level "5"; tag "RNA_Seq_supported_only"; tag "basic"; tag "appris_principal_1"; havana_gene "OTTMUSG00000049985.3"; havana_transcript "OTTMUST00000145515.1";
 89 | chr1	HAVANA	exon	4284766	4284898	.	-	.	gene_id "ENSMUSG00000025900.12"; transcript_id "ENSMUST00000208660.1"; gene_type "protein_coding"; gene_name "Rp1"; transcript_type "protein_coding"; transcript_name "Rp1-203"; exon_number 6; exon_id "ENSMUSE00001379434.1"; level 2; protein_id "ENSMUSP00000146439.1"; transcript_support_level "5"; tag "RNA_Seq_supported_only"; tag "basic"; tag "appris_principal_1"; havana_gene "OTTMUSG00000049985.3"; havana_transcript "OTTMUST00000145515.1";
 90 | chr1	HAVANA	CDS	4284766	4284898	.	-	0	gene_id "ENSMUSG00000025900.12"; transcript_id "ENSMUST00000208660.1"; gene_type "protein_coding"; gene_name "Rp1"; transcript_type "protein_coding"; transcript_name "Rp1-203"; exon_number 6; exon_id "ENSMUSE00001379434.1"; level 2; protein_id "ENSMUSP00000146439.1"; transcript_support_level "5"; tag "RNA_Seq_supported_only"; tag "basic"; tag "appris_principal_1"; havana_gene "OTTMUSG00000049985.3"; havana_transcript "OTTMUST00000145515.1";
 91 | chr1	HAVANA	exon	4267469	4267620	.	-	.	gene_id "ENSMUSG00000025900.12"; transcript_id "ENSMUST00000208660.1"; gene_type "protein_coding"; gene_name "Rp1"; transcript_type "protein_coding"; transcript_name "Rp1-203"; exon_number 7; exon_id "ENSMUSE00001379919.1"; level 2; protein_id "ENSMUSP00000146439.1"; transcript_support_level "5"; tag "RNA_Seq_supported_only"; tag "basic"; tag "appris_principal_1"; havana_gene "OTTMUSG00000049985.3"; havana_transcript "OTTMUST00000145515.1";
 92 | chr1	HAVANA	CDS	4267469	4267620	.	-	2	gene_id "ENSMUSG00000025900.12"; transcript_id "ENSMUST00000208660.1"; gene_type "protein_coding"; gene_name "Rp1"; transcript_type "protein_coding"; transcript_name "Rp1-203"; exon_number 7; exon_id "ENSMUSE00001379919.1"; level 2; protein_id "ENSMUSP00000146439.1"; transcript_support_level "5"; tag "RNA_Seq_supported_only"; tag "basic"; tag "appris_principal_1"; havana_gene "OTTMUSG00000049985.3"; havana_transcript "OTTMUST00000145515.1";
 93 | chr1	HAVANA	exon	4261527	4261605	.	-	.	gene_id "ENSMUSG00000025900.12"; transcript_id "ENSMUST00000208660.1"; gene_type "protein_coding"; gene_name "Rp1"; transcript_type "protein_coding"; transcript_name "Rp1-203"; exon_number 8; exon_id "ENSMUSE00001380048.1"; level 2; protein_id "ENSMUSP00000146439.1"; transcript_support_level "5"; tag "RNA_Seq_supported_only"; tag "basic"; tag "appris_principal_1"; havana_gene "OTTMUSG00000049985.3"; havana_transcript "OTTMUST00000145515.1";
 94 | chr1	HAVANA	CDS	4261527	4261605	.	-	0	gene_id "ENSMUSG00000025900.12"; transcript_id "ENSMUST00000208660.1"; gene_type "protein_coding"; gene_name "Rp1"; transcript_type "protein_coding"; transcript_name "Rp1-203"; exon_number 8; exon_id "ENSMUSE00001380048.1"; level 2; protein_id "ENSMUSP00000146439.1"; transcript_support_level "5"; tag "RNA_Seq_supported_only"; tag "basic"; tag "appris_principal_1"; havana_gene "OTTMUSG00000049985.3"; havana_transcript "OTTMUST00000145515.1";
 95 | chr1	HAVANA	exon	4245031	4245106	.	-	.	gene_id "ENSMUSG00000025900.12"; transcript_id "ENSMUST00000208660.1"; gene_type "protein_coding"; gene_name "Rp1"; transcript_type "protein_coding"; transcript_name "Rp1-203"; exon_number 9; exon_id "ENSMUSE00001382043.1"; level 2; protein_id "ENSMUSP00000146439.1"; transcript_support_level "5"; tag "RNA_Seq_supported_only"; tag "basic"; tag "appris_principal_1"; havana_gene "OTTMUSG00000049985.3"; havana_transcript "OTTMUST00000145515.1";
 96 | chr1	HAVANA	CDS	4245031	4245106	.	-	2	gene_id "ENSMUSG00000025900.12"; transcript_id "ENSMUST00000208660.1"; gene_type "protein_coding"; gene_name "Rp1"; transcript_type "protein_coding"; transcript_name "Rp1-203"; exon_number 9; exon_id "ENSMUSE00001382043.1"; level 2; protein_id "ENSMUSP00000146439.1"; transcript_support_level "5"; tag "RNA_Seq_supported_only"; tag "basic"; tag "appris_principal_1"; havana_gene "OTTMUSG00000049985.3"; havana_transcript "OTTMUST00000145515.1";
 97 | chr1	HAVANA	exon	4243543	4243619	.	-	.	gene_id "ENSMUSG00000025900.12"; transcript_id "ENSMUST00000208660.1"; gene_type "protein_coding"; gene_name "Rp1"; transcript_type "protein_coding"; transcript_name "Rp1-203"; exon_number 10; exon_id "ENSMUSE00001379965.1"; level 2; protein_id "ENSMUSP00000146439.1"; transcript_support_level "5"; tag "RNA_Seq_supported_only"; tag "basic"; tag "appris_principal_1"; havana_gene "OTTMUSG00000049985.3"; havana_transcript "OTTMUST00000145515.1";
 98 | chr1	HAVANA	CDS	4243543	4243619	.	-	1	gene_id "ENSMUSG00000025900.12"; transcript_id "ENSMUST00000208660.1"; gene_type "protein_coding"; gene_name "Rp1"; transcript_type "protein_coding"; transcript_name "Rp1-203"; exon_number 10; exon_id "ENSMUSE00001379965.1"; level 2; protein_id "ENSMUSP00000146439.1"; transcript_support_level "5"; tag "RNA_Seq_supported_only"; tag "basic"; tag "appris_principal_1"; havana_gene "OTTMUSG00000049985.3"; havana_transcript "OTTMUST00000145515.1";
 99 | chr1	HAVANA	exon	4243417	4243448	.	-	.	gene_id "ENSMUSG00000025900.12"; transcript_id "ENSMUST00000208660.1"; gene_type "protein_coding"; gene_name "Rp1"; transcript_type "protein_coding"; transcript_name "Rp1-203"; exon_number 11; exon_id "ENSMUSE00001379150.1"; level 2; protein_id "ENSMUSP00000146439.1"; transcript_support_level "5"; tag "RNA_Seq_supported_only"; tag "basic"; tag "appris_principal_1"; havana_gene "OTTMUSG00000049985.3"; havana_transcript "OTTMUST00000145515.1";
100 | chr1	HAVANA	CDS	4243417	4243448	.	-	2	gene_id "ENSMUSG00000025900.12"; transcript_id "ENSMUST00000208660.1"; gene_type "protein_coding"; gene_name "Rp1"; transcript_type "protein_coding"; transcript_name "Rp1-203"; exon_number 11; exon_id "ENSMUSE00001379150.1"; level 2; protein_id "ENSMUSP00000146439.1"; transcript_support_level "5"; tag "RNA_Seq_supported_only"; tag "basic"; tag "appris_principal_1"; havana_gene "OTTMUSG00000049985.3"; havana_transcript "OTTMUST00000145515.1";
101 | 


--------------------------------------------------------------------------------
/Data/TE.bed:
--------------------------------------------------------------------------------
  1 | chr3	144583200	144583342	B1_Mur4	0	-
  2 | chr6	86389924	86389960	B2_Mm2	0	+
  3 | chr7	5364171	5364232	PB1D10	0	+
  4 | chr10	55902552	55902867	LTR80B	0	+
  5 | chr12	56707313	56707382	B1F	0	+
  6 | chr2	62000937	62001039	RMER15	0	+
  7 | chr13	67837236	67837625	MTC	0	-
  8 | chr13	97860467	97860597	ID_B1	0	-
  9 | chr3	129323773	129323852	ID4_	0	-
 10 | chr15	53302093	53302237	B1_Mur4	0	-
 11 | chr3	17544777	17545068	MTE2a	0	-
 12 | chr14	114380245	114381362	Lx3A	0	-
 13 | chr14	36135784	36136221	MLT1G1	0	-
 14 | chr9	3382929	3383043	B2_Mm2	0	-
 15 | chr2	23523042	23524033	L1Md_F2	0	+
 16 | chr10	130416389	130416521	Lx7	0	+
 17 | chr10	124812631	124812919	LTR16B	0	-
 18 | chr8	121282143	121282358	ORR1G	0	-
 19 | chrX	56261784	56261888	B4A	0	-
 20 | chr12	19314026	19314159	L2a	0	-
 21 | chr13	34470884	34476084	L1Md_A	0	+
 22 | chr1	15430986	15431050	MLT1O	0	-
 23 | chr11	97176772	97176823	B4	0	+
 24 | chr6	120487970	120488131	B2_Mm2	0	-
 25 | chr2	112370309	112370404	PB1D9	0	-
 26 | chr14	11380848	11380988	L1MB7	0	+
 27 | chr7	125706670	125706784	PB1D9	0	-
 28 | chr1	119963513	119963866	Lx8	0	+
 29 | chr14	121217593	121217684	RLTR20A4	0	-
 30 | chr13	14527292	14527394	Lx8b	0	+
 31 | chrX	113068169	113068313	B1_Mm	0	-
 32 | chr7	21774699	21774922	RMER19B2	0	+
 33 | chr3	104611578	104611728	B3A	0	-
 34 | chr2	158183914	158183943	B1F1	0	+
 35 | chrX	83091173	83091268	PB1D7	0	+
 36 | chrY	18505375	18507434	L1_Mus3	0	-
 37 | chrY	53460095	53460226	B1_Mus2	0	+
 38 | chr18	56988834	56988941	L3	0	+
 39 | chr15	46551396	46551807	MMERVK10C-int	0	-
 40 | chr18	79506187	79506333	B1_Mm	0	-
 41 | chr2	104648414	104648547	B1_Mur2	0	-
 42 | chr7	109416903	109417032	Lx7	0	+
 43 | chr1	33863431	33863563	ID_B1	0	-
 44 | chr4	148585303	148585574	RLTR19-int	0	-
 45 | chr2	164776167	164776283	B1_Mur2	0	+
 46 | chr2	155889136	155889458	MLTR11B	0	+
 47 | chr1	140608946	140609064	RMER13A2	0	-
 48 | chr11	50474308	50474667	ORR1A2	0	+
 49 | chr3	35549471	35549633	Lx7	0	-
 50 | chr18	20885705	20885850	B1_Mus1	0	+
 51 | chr9	98122822	98123031	URR1B	0	+
 52 | chr5	145787688	145787824	RSINE1	0	+
 53 | chr9	116910264	116910518	B4	0	+
 54 | chr2	118982678	118982802	L1MB8	0	-
 55 | chr1	74231577	74231701	ID_B1	0	-
 56 | chr3	51388265	51388358	PB1D7	0	+
 57 | chr1	78437903	78438016	ID_B1	0	+
 58 | chr1	179450543	179450599	PB1D9	0	+
 59 | chr11	106956412	106956506	B1F	0	-
 60 | chr7	105070982	105071111	B1F	0	+
 61 | chr14	55891766	55891869	B1F2	0	+
 62 | chr3	95002315	95002463	B1_Mm	0	+
 63 | chr14	123443243	123443788	L1_Mus1	0	+
 64 | chr9	84553142	84553311	ID_B1	0	-
 65 | chrX	74054421	74054609	B2_Mm2	0	-
 66 | chr2	50599335	50599996	L1_Mur2	0	+
 67 | chr11	10009054	10009447	RLTR47	0	+
 68 | chr14	14575064	14575178	B2_Mm2	0	-
 69 | chrX	66050795	66051345	L1Md_F2	0	+
 70 | chr4	109302482	109302690	B3	0	+
 71 | chr6	5823803	5823847	MLT1B	0	+
 72 | chr9	94472366	94472513	B1_Mus1	0	-
 73 | chr2	7172981	7173150	Tigger19a	0	+
 74 | chr9	33581540	33581630	B3A	0	+
 75 | chr1	60831307	60832014	L1_Mur3	0	-
 76 | chr2	16821242	16821456	RMER15-int	0	-
 77 | chr7	142943894	142944262	ORR1C2	0	+
 78 | chr12	73440499	73440743	B4	0	-
 79 | chrX	90113268	90113445	B3	0	+
 80 | chr18	20618867	20619808	L1M3e	0	+
 81 | chr9	114718823	114718968	B1_Mm	0	-
 82 | chr11	12670894	12671016	MIR	0	-
 83 | chr13	32387251	32387629	MLT1D	0	+
 84 | chrX	97791970	97792192	URR1A	0	+
 85 | chr13	76374166	76374333	ERVB4_1B-I_MM-int	0	-
 86 | chr5	47907546	47907672	Lx10	0	+
 87 | chr16	8837567	8837715	B1_Mus1	0	-
 88 | chr4	150767884	150768026	B1_Mur4	0	+
 89 | chr10	99491068	99491203	B1_Mur3	0	-
 90 | chr17	90847952	90849043	L1_Mus3	0	+
 91 | chr2	99125206	99126690	L1_Mur3	0	-
 92 | chr13	47581815	47582027	B4A	0	+
 93 | chr6	99194219	99194361	MER117	0	+
 94 | chr14	30096250	30096453	B3	0	+
 95 | chr13	24443830	24443959	PB1D10	0	-
 96 | chr13	28396991	28397036	L1Md_F2	0	+
 97 | chr1	165293822	165294036	B3	0	+
 98 | chr7	17889030	17889430	RMER6BA	0	+
 99 | chr9	25684255	25684348	MLT2D	0	-
100 | chr6	119394571	119394668	PB1D7	0	+
101 | chr19	119394571	119394668	test	0	+
102 | chrM	1193971	1193968	test	0	+
103 | 


--------------------------------------------------------------------------------
/Data/test.bam:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/JiekaiLab/scTE/566f6ab3baaf76cd006ab965edc08e4576eb73c9/Data/test.bam


--------------------------------------------------------------------------------
/Data/test.exclusive.idx:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/JiekaiLab/scTE/566f6ab3baaf76cd006ab965edc08e4576eb73c9/Data/test.exclusive.idx


--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
 1 | MIT License
 2 | 
 3 | Copyright (c) 2019 Jiangping He, Andrew P. Hutchins & Jiekai Chen
 4 | 
 5 | Permission is hereby granted, free of charge, to any person obtaining a copy
 6 | of this software and associated documentation files (the "Software"), to deal
 7 | in the Software without restriction, including without limitation the rights
 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 9 | copies of the Software, and to permit persons to whom the Software is
10 | furnished to do so, subject to the following conditions:
11 | 
12 | The above copyright notice and this permission notice shall be included in all
13 | copies or substantial portions of the Software.
14 | 
15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21 | SOFTWARE.
22 | 


--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
  1 | scTE
  2 | ==============
  3 | 
  4 | Quantifying transposable element (TE) expression from single-cell sequencing data
  5 | ----------------------------------------------------------------------
  6 | [![DOI](https://zenodo.org/badge/190696033.svg)](https://zenodo.org/badge/190696033.svg)
  7 | 
  8 | scTE takes as input:
  9 | 
 10 |  * Aligned sequence reads (BAM/SAM format)
 11 |  * The genomic location of TEs (BED format)
 12 |  * The genomic location of genes (GTF format)
 13 | 
 14 | 
 15 | ![scTE workflow](./docs/scTE.png)
 16 | 
 17 | 
 18 | Installation
 19 | ------------
 20 | scTE works with python >=3.6.
 21 | 
 22 | ```bash
 23 | $ git clone https://github.com/JiekaiLab/scTE.git
 24 | $ cd scTE
 25 | $ python setup.py install
 26 | ```
 27 | 
 28 | Usage
 29 | -----
 30 | 
 31 | **Building genome indices**<br>
 32 | scTE builds genome indices for the fast alignment of reads to genes and TEs. These indices can be automatically generated using the commands:
 33 | 
 34 | ```bash
 35 | $ scTE_build -g mm10 # Mouse
 36 | $ scTE_build -g hg38 # Human
 37 | $ scTE_build -g panTro6 # Chimpanzee
 38 | $ scTE_build -g macFas5 # Macaca fascicularis
 39 | $ scTE_build -g dm6 # Drosophila melanogaster
 40 | $ scTE_build -g danRer11 # Zebrafish
 41 | $ scTE_build -g xenTro9 # Xenopus tropicalis
 42 | ```
 43 | 
 44 | These scripts will automatically download the genome annotations, for mouse:
 45 | 
 46 | ```bash
 47 | $ ftp://ftp.ebi.ac.uk/pub/databases/gencode/Gencode_mouse/release_M21/gencode.vM21.annotation.gtf.gz
 48 | $ http://hgdownload.soe.ucsc.edu/goldenPath/mm10/database/rmsk.txt.gz
 49 | ```
 50 | 
 51 | Or for human:
 52 | 
 53 | ```bash
 54 | $ ftp://ftp.ebi.ac.uk/pub/databases/gencode/Gencode_human/release_30/gencode.v30.annotation.gtf.gz
 55 | $ http://hgdownload.soe.ucsc.edu/goldenPath/hg38/database/rmsk.txt.gz
 56 | ```
 57 | 
 58 | Or for Chimpanzee:
 59 | 
 60 | ```bash
 61 | $ http://ftp.ensembl.org/pub/release-103/gtf/pan_troglodytes/Pan_troglodytes.Pan_tro_3.0.103.gtf.gz
 62 | $ https://hgdownload.soe.ucsc.edu/goldenPath/panTro6/database/rmsk.txt.gz
 63 | ```
 64 | 
 65 | Or for Macaca fascicularis:
 66 | 
 67 | ```bash
 68 | $ http://ftp.ensembl.org/pub/release-102/gtf/macaca_fascicularis/Macaca_fascicularis.Macaca_fascicularis_5.0.102.gtf.gz
 69 | $ http://hgdownload.soe.ucsc.edu/goldenPath/macFas5/database/rmsk.txt.gz
 70 | ```
 71 | 
 72 | Or for Drosophila melanogaster:
 73 | 
 74 | ```bash
 75 | $ http://ftp.ensembl.org/pub/release-103/gtf/drosophila_melanogaster/Drosophila_melanogaster.BDGP6.32.103.gtf.gz
 76 | $ http://hgdownload.soe.ucsc.edu/goldenPath/dm6/database/rmsk.txt.gz
 77 | ```
 78 | 
 79 | Or for Zebrafish:
 80 | 
 81 | ```bash
 82 | $ http://ftp.ensembl.org/pub/release-103/gtf/danio_rerio/Danio_rerio.GRCz11.103.gtf.gz
 83 | $ https://hgdownload.soe.ucsc.edu/goldenPath/danRer11/database/rmsk.txt.gz
 84 | ```
 85 | 
 86 | Or for Xenopus tropicalis:
 87 | 
 88 | ```bash
 89 | $ http://ftp.ensembl.org/pub/release-103/gtf/xenopus_tropicalis/Xenopus_tropicalis.Xenopus_tropicalis_v9.1.103.gtf.gz
 90 | $ https://hgdownload.soe.ucsc.edu/goldenPath/xenTro9/database/rmsk.txt.gz
 91 | ```
 92 | 
 93 | `mm10, hg38, panTro6, macFas5, dm6, danRer11, xenTro9` are the supported genome assembly versions.
 94 | If you want to use a custom reference, use the `-te` and `-gene` options:
 95 | 
 96 | ```
 97 | scTE_build -te TEs.bed -gene Genes.gtf -o custome
 98 | 
 99 | -te
100 |     Six columns bed file for transposable elements annotation.
101 | -gene
102 |     Gtf file for genes annotation. 
103 | ```
104 | For more information about the BED and GTF formats, see the [UCSC format FAQ](https://genome.ucsc.edu/FAQ/FAQformat).
105 | These annotations are then processed and converted into genome indices. The scTE algorithm will allocate 
106 | reads first to gene exons, and then to TEs by default. Hence TEs inside exon/UTR regions of genes annotated 
107 | in GENCODE will only contribute to the gene, and not to the TE score. This feature can be changed by 
108 | setting `--mode/-m inclusive` in scTE, which will instruct scTE to assign the reads to both TEs and genes 
109 | if a read comes from a TE inside exon/UTR regions of genes. If you want to remove the TEs inside the introns 
110 | of genes, you can set `--mode/-m nointron` in scTE.
111 | 
112 | **Analysis of 10x style scRNA-seq data**
113 | 
114 | scTE takes a BAM/SAM file as input; we highly recommend using the unfiltered alignment file.
115 | 
116 | For a `bam` file generated by [STARsolo](https://github.com/alexdobin/STAR) etc., the cell barcodes and UMIs need to be stored in the 'CR:Z' and 'UR:Z' read tags respectively, as below:
117 | 
118 | ```bash
119 | $ scTE -i inp.bam -o out -x mm10.exclusive.idx --hdf5 True -CB CR -UMI UR
120 | ```
121 | ```bash
122 | $ samtools view test.bam
123 | A00269:12:H7YF2DMXX:2	0	chr10	55902580	255	50M	*	0	0	GTTCTCTCCGTATGTGAGCATGGGAGATACATCCCAGAAAGGCAGAAGGG	FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF	NH:i:1	HI:i:1	AS:i:49	nM:i:0	CR:Z:CTAGAGTGTTTCGCTC	CY:Z:FFFFFFFFFFFFFFFF	UR:Z:TACATGACGC	UY:Z:FFFFFFFFFF
124 | A00269:13:H7YF2DMXX:2	0	chr10	55902784	255	50M	*	0	0	ATAATCTTTGAGATCTCTGGTGAAAATAAGTAGCATAAAGGACAGAATCA	FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF	NH:i:1	HI:i:1	AS:i:49	nM:i:0	CR:Z:CTAGAGTGTTTCGCTC	CY:Z:FFFFFFFFFFFFFFFF	UR:Z:TACATGACGC	UY:Z:FFFFFFFFFF
125 | A00269:14:H7YF2DMXX:2	0	chr13	67837311	255	50M	*	0	0	CTGTTCATTATTTGAGGAAATCAGGACAGGAAATCAAACATGGCAGAATC	FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF	NH:i:1	HI:i:1	AS:i:49	nM:i:0	CR:Z:ATCGAGTGTTTCGCTC	CY:Z:FFFFFFFFFFFFFFFF	UR:Z:TACATGACGC	UY:Z:FFFFFFFFFF
126 | A00269:15:H7YF2DMXX:2	0	chr14	114380523	255	50M	*	0	0	GATCCAGATTAATTGAGACTGTTGATCCTCCTACAGGGTCGCCCTTCTCC	FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF	NH:i:1	HI:i:1	AS:i:49	nM:i:0	CR:Z:CTAGAGTGTTTCGCTC	CY:Z:FFFFFFFFFFFFFFFF	UR:Z:TACATGACGC	UY:Z:FFFFFFFFFF
127 | ```
128 | 
129 | For a `bam` file generated by [Cell Ranger](https://support.10xgenomics.com/single-cell-gene-expression/software/pipelines/latest/what-is-cell-ranger) etc., the cell barcodes and UMIs need to be stored in the 'CB:Z' and 'UB:Z' read tags respectively, as below:
130 | 
131 | ```bash
132 | $ scTE -i inp.bam -o out -x mm10.exclusive.idx --hdf5 True -CB CB -UMI UB
133 | ```
134 | ```bash
135 | $ samtools view test.bam
136 | A00519:758:HTCCHDSXY:3:2535:21296:19774	16	chr1	14021	0	90M	*	0	0	TGGATTTCTATCTCCCTGGCTTGGTGCCAGTTCCTCCAAGTCGATGGCACCTCCCTCCCTCTCAACCACTTGAGCAAACTCCAAGACATC	,FFFFFFFFFFFFFFFFFFFFFFFFFFFFF:FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF:F:FFFFFFFFFFFFFFFFFFF:FFFFF	NH:i:5	HI:i:1	AS:i:88	nM:i:0	RG:Z:SC3_v3_NextGem_DI_CellPlex_Human_PBMC_10K:0:1:HTCCHDSXY:3	RE:A:I	xf:i:0	CR:Z:CTCCCTCCACTGCGAC	CY:Z:FFFFFFFFFFFFFFFF	CB:Z:CTCCCTCCACTGCGAC-1	UR:Z:AAGGCGTAGTAG	UY:Z:FFFFFFFFFFFF	UB:Z:AAGGCGTAGTAG
137 | A00519:758:HTCCHDSXY:1:1355:17237:31720	0	chr1	14260	0	90M	*	0	0	CTCCCTCTCATCCCAGAGAAACAGGTCAGCTGGGAGCTTCTGCCCCCACTGCCTAGGGACCAACAGGGGCAGGAGGCAGTCACTGACCCC	FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF	NH:i:5	HI:i:1	AS:i:88	nM:i:0	RG:Z:SC3_v3_NextGem_DI_CellPlex_Human_PBMC_10K:0:1:HTCCHDSXY:1	RE:A:I	xf:i:0	CR:Z:TCGTCCACAGTATGAA	CY:Z:FFFFFFFFFFFFFFFF	CB:Z:TCGTCCACAGTATGAA-1	UR:Z:GACTTATTTTTT	UY:Z:FFFFFFFFFFFF	UB:Z:GACTTATTTTTT
138 | A00519:758:HTCCHDSXY:3:2227:16703:32080	16	chr1	14411	1	90M	*	0	0	TCAGTTCTTTATTGATTGGTGTGCCGTTTTCTCTGGAAGCCTCTTAAGAACACAGTGGCGCAGGCTGGGTGGAGCCGTCCCCCCATGGAG	FFFFFFFFFFFFFFFFFFFFFFFFFFF:FFFF:FFFFFFFF:FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF	NH:i:3	HI:i:1	AS:i:88	nM:i:0	RG:Z:SC3_v3_NextGem_DI_CellPlex_Human_PBMC_10K:0:1:HTCCHDSXY:3	RE:A:I	xf:i:0	CR:Z:TTGAGTGGTTGTGGCC	CY:Z:FFFFFFFFFFFFFFFF	CB:Z:TTGAGTGGTTGTGGCC-1	UR:Z:TATAATGCTCAG	UY:Z:FFFFFFFFFFFF	UB:Z:TATAATGCTCAG
139 | A00519:758:HTCCHDSXY:3:2563:23665:33802	16	chr1	14411	1	90M	*	0	0	TCAGTTCTTTATTGATTGGTGTGCCGTTTTCTCTGGAAGCCTCTTAAGAACACAGTGGCGCAGGCTGGGTGGAGCCGTCCCCCCATGGAG	FFFFF:FFFFFFFFFFFFFFFFFFFFFFFFFFFFFF:FFFFFFFF:FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF	NH:i:3	HI:i:1	AS:i:88	nM:i:0	RG:Z:SC3_v3_NextGem_DI_CellPlex_Human_PBMC_10K:0:1:HTCCHDSXY:3	RE:A:I	xf:i:0	CR:Z:TGTTGAGAGGCAATGC	CY:Z:FFFFFFFFFFFFFFFF	CB:Z:TGTTGAGAGGCAATGC-1	UR:Z:ACGGGTGTGGAG	UY:Z:FFFFFFFFFFFF	UB:Z:ACGGGTGTGGAG
140 | ```
141 | ```
142 | -i
143 |     Input file: BAM/SAM file from CellRanger or STARsolo
144 | -o
145 |     Output file prefix
146 | -x
147 |     The filename of the index for the reference genome annotation generated by scTE_build
148 | -p
149 |     Number of threads to use, Default: 1. scTE takes ~10Gb memory each thread for human and mouse genome.
150 | --hdf5
151 |     Save the output as .h5ad formatted file instead of csv file. Default: False
152 | ```
153 | 
154 | scTE is most tuned to [STARsolo](https://github.com/alexdobin/STAR) or the [Cell Ranger](https://support.10xgenomics.com/single-cell-gene-expression/software/pipelines/latest/what-is-cell-ranger) pipeline outputs, 
155 | and can accept BAM files produced by either of these two programs. 
156 | For other aligners, the barcode should be stored in the `CR:Z` or `CB:Z` tag, and the UMI in the `UR:Z` or `UB:Z` tag in the BAM file
157 | 
158 | **Analysis of C1 style scRNA-seq data**<br>
159 | If the UMI is missing or not used in the scRNA-seq technology (for example on the Fluidigm C1 platform), it can be disabled with the `-UMI False` 
160 | switch in scTE (the default is `UR`). If the barcode is missing it can be disabled with `-CB False` (the default is `CR`), 
161 | and instead the cell barcodes will be taken from the names of the BAM files.
162 | 
163 | ```bash
164 | $ scTE -i inp.bam -o out -x mm10.exclusive.idx -CB False -UMI False
165 | ```
166 | Multiple BAM files can be provided to scTE with the `-i` option:
167 | ```
168 | $ scTE -i *.bam -o out -x mm10.exclusive.idx -CB False -UMI False
169 | ```
170 | or 
171 | ```
172 | $ scTE -i input1.bam,input2.bam,... -o out -x mm10.exclusive.idx -CB False -UMI False
173 | ```
174 | 
175 | **Analysis of scATAC-seq data**<br>
176 | The genome indices were prebuilt using:
177 | ```
178 | $ wget -c http://hgdownload.soe.ucsc.edu/goldenPath/mm10/database/rmsk.txt.gz -O mm10.te.txt.gz
179 | $ zcat mm10.te.txt.gz | grep -E 'LINE|SINE|LTR|Retroposon' | cut -f6-8,11 >mm10.te.bed
180 | $ scTEATAC_build -g mm10.te.bed -o mm10.te.atac
181 | ```
182 | Then the BAM file can be processed using scTEATAC with the command:
183 | ```
184 | scTEATAC -i input.bam -x mm10.te.atac.idx
185 | ```
186 | 
187 | **Citation**<br>
188 | If scTE is useful for your research, consider citing [Nature Communications (2021)](https://www.nature.com/articles/s41467-021-21808-x)
189 | 
190 | 
191 | 


--------------------------------------------------------------------------------
/bin/scTE:
--------------------------------------------------------------------------------
  1 | #!/usr/bin/env python
  2 | import pandas as pd
  3 | import multiprocessing
  4 | from functools import partial
  5 | import logging
  6 | import os, sys, glob, datetime, time, gzip
  7 | import argparse
  8 | import collections
  9 | from math import log
 10 | sys.path.append(os.path.join(os.path.split(sys.argv[0])[0], '../'))
 11 | from scTE.miniglbase import genelist, glload, location
 12 | from scTE.annotation import annoGtf
 13 | from scTE.base import *
 14 | 
 15 | def prepare_parser():
 16 |     desc = "hahaha..."
 17 | 
 18 |     exmp = "Example: scTE <-i scRNA.sorted.bam> <-o out> [--min_genes 200] [--min_counts 400] [-p 4] <-x mm10.exclusive.idx>"
 19 | 
 20 |     parser = argparse.ArgumentParser(prog='scTE',description=desc, epilog=exmp)
 21 | 
 22 |     optional = parser._action_groups.pop()
 23 | 
 24 |     optional.add_argument('--min_genes', dest='genenumber',metavar='INT', type=int,default=200,
 25 |                         help='Minimum number of genes expressed required for a cell to pass filtering. Default: 200')
 26 | 
 27 |     optional.add_argument('--min_counts', dest='countnumber',metavar='INT', type=int,
 28 |                         help='Minimum number of counts required for a cell to pass filtering. Default: 2*min_genes')
 29 | 
 30 |     optional.add_argument('--expect-cells', dest='cellnumber',metavar='INT', type=int,  default=10000,
 31 |                         help='Expected number of cells. Default: 10000')
 32 | 
 33 |     optional.add_argument('-f','--format', metavar='input file format', dest='format', type=str, nargs='?', default='BAM', choices=['BAM','SAM'],
 34 |                         help='Input file format: BAM or SAM. DEFAULT: BAM')
 35 | 
 36 |     optional.add_argument('-CB', dest='CB', type=str, nargs='?', default='CR', choices=['CR','CB','False'],
 37 |                         help='Set to false to ignore for cell barcodes, it is useful for SMART-seq. If you set CB=False, it also will set UMI=False by default, Default: CR')
 38 | 
 39 |     optional.add_argument('-UMI', dest='UMI', type=str, nargs='?', default='UR', choices=['UR','UB','False'],
 40 |                         help='Set to false to ignore for UMI, it is useful for SMART-seq. Default: True')
 41 | 
 42 |     optional.add_argument('--keeptmp', dest='keeptmp', type=str, nargs='?', default='False', choices=['True','False'],
 43 |                         help='Keep the _scTEtmp file, which is useful for debugging. Default: False')
 44 | 
 45 |     optional.add_argument('--hdf5', dest='hdf5', type=str, nargs='?', default='False', choices=['True','False'],
 46 |                         help='Save the output as .h5ad formatted file instead of csv file. Default: False')
 47 | 
 48 |     optional.add_argument('-p','--thread', metavar='INT', dest='thread', type=int, default=1,
 49 |                         help='Number of threads to use, Default: 1')
 50 | 
 51 |     optional.add_argument('-v','--version', action='version', version='%(prog)s 1.0')
 52 | 
 53 |     required = parser.add_argument_group('required arguments')
 54 | 
 55 |     required.add_argument('-i','--input', dest='input', type=str, nargs='+', required=True,
 56 |                         help='Input file: BAM/SAM file from CellRanger or STARsolo, the file must be sorted by chromosome position')
 57 | 
 58 |     required.add_argument('-x', dest='annoglb',nargs='+', required=True,
 59 |                         help='The filename of the index for the reference genome annotation.')
 60 | 
 61 | #     required.add_argument('-g','--genome', metavar='genome', dest='genome', type=str, nargs='?', default='mm10', choices=['hg38','mm10',], required=True,
 62 | #                         help='"hg38" for human, "mm10" for mouse')
 63 | 
 64 |     required.add_argument('-o','--out', dest='out', nargs='?', required=True, help='Output file prefix')
 65 | 
 66 |     parser._action_groups.append(optional)
 67 |     optional = parser.add_argument_group('optional arguments')
 68 |     optional
 69 | 
 70 |     return parser
 71 | 
 72 | def main():
 73 |     """Start scTEs......parse options......"""
 74 | 
 75 |     timestart=datetime.datetime.now()
 76 |     args=read_opts(prepare_parser())
 77 | 
 78 |     # Fix up the UMI/CB booleans:
 79 | #     if args.UMI == 'True': args.UMI = True
 80 | #     else: args.UMI = False
 81 | #     if args.CB == 'True': args.CB = True
 82 | #     else: args.CB = False
 83 |     if args.hdf5 == 'True': args.hdf5 = True
 84 |     else: args.hdf5 = False
 85 | 
 86 |     info = args.info
 87 |     error = args.error
 88 | 
 89 |     assert sys.version_info >= (3, 6), 'Python >=3.6 is required'
 90 | 
 91 |     info(args.argtxt + "\n")
 92 | 
 93 |     outname = args.out.split('/')[-1:][0]
 94 | 
 95 |     info("Loading the genome annotation index... %s"%(datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S")))
 96 |     allelement, chr_list, all_annot, glannot = Readanno(filename=outname, annoglb=args.annoglb[0]) #genome=args.genome
 97 |     print(sorted(chr_list))
 98 |     info("Finished loading the genome annotation index... %s \n"%(datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S")))
 99 | 
100 |     info("Processing BAM/SAM files ...%s"%(datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S")))
101 | 
102 |     if len(args.input) == 1 and ',' in args.input[0]:
103 |         args.input=args.input[0].split(',')
104 | 
105 |     if not os.path.exists('%s_scTEtmp/o1'%outname):
106 |         os.system('mkdir -p %s_scTEtmp/o1'%outname)
107 | 
108 |     for k in args.input:
109 |         checkCBUMI(filename=k,out=outname,CB=args.CB,UMI=args.UMI)
110 |     info("Input SAM/BAM file appears to be valid")
111 | 
112 |     if len(args.input) > 1:
113 |         info('Using parabam2bed as more than 1 input BAM')
114 |         pool=multiprocessing.Pool(processes=args.thread)
115 |         partial_work = partial(Para_bam2bed, CB=args.CB, UMI=args.UMI,out=outname)
116 |         pool.map(partial_work, args.input)
117 |         os.system('gunzip -c -f %s_scTEtmp/o0/*.bed.gz | gzip > %s_scTEtmp/o1/%s.bed.gz' % (outname,outname,outname))
118 |     
119 |     else:
120 |         print(args.CB,args.UMI,'good\n')
121 |         Bam2bed(args.input[0], args.CB, args.UMI, outname, args.thread)
122 |     info("Done BAM/SAM files processing ...%s \n"%(datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S")))
123 | 
124 |     info("Splitting ...%s"%(datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S")))
125 |     if args.thread == 1: #Single thread path, mainly
126 |         # This is useful for testing optimsations, as the multiprocessing path the profile
127 |         # Just gets locked up in {method 'acquire' of '_thread.lock' objects}
128 |         info('Executing single thread path')
129 |         whitelist = splitAllChrs(chr_list, filename=outname, genenumber=args.genenumber, countnumber=args.countnumber, UMI=args.UMI)
130 |     else:
131 |         info('Executing multiple thread path with %s threads' % args.thread)
132 |         pool=multiprocessing.Pool(processes=args.thread)
133 |         partial_work = partial(splitChr, filename=outname, CB=args.CB, UMI=args.UMI)
134 |         pool.map(partial_work, chr_list)
135 |         whitelist = filterCRs(filename=outname, genenumber=args.genenumber, countnumber=args.countnumber)
136 | 
137 |     info("Finished processing sample files %s \n"%(datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S")))
138 | 
139 |     info("Fetching from the annotation index... %s"%(datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S")))
140 |     if args.thread == 1: #Single thread path
141 |         for chrom in chr_list:
142 |             align(chr=chrom, filename=outname, all_annot=None, glannot=glannot, whitelist=whitelist) #CB=args.CB
143 | 
144 |     else: # Multiprocessing path:
145 |         pool = multiprocessing.Pool(processes=args.thread)
146 |         partial_work = partial(align, filename=outname, all_annot=all_annot, glannot=None, whitelist=whitelist) # send a copy of the index,  CB=args.CB
147 |         pool.map(partial_work, chr_list)
148 | 
149 |     if not os.path.exists('%s_scTEtmp/o4'%outname):
150 |         os.system('mkdir -p %s_scTEtmp/o4'%outname)
151 |     os.system('gunzip -c -f %s_scTEtmp/o3/%s.*.bed.gz | gzip > %s_scTEtmp/o4/%s.bed.gz' % (outname,outname,outname,outname))
152 |     info("Done fetching... %s \n"%(datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S")))
153 | 
154 |     info("Calculating expression... %s"%(datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S")))
155 |     len_res, genenumber, filename = Countexpression(filename=args.out, allelement=allelement, genenumber=args.genenumber, cellnumber=args.cellnumber, hdf5=args.hdf5)
156 |     if args.hdf5 == True:
157 |         info('Detect {0} cells expressed at least {1} genes, results output to {2}.h5ad'.format(len_res, genenumber, filename))
158 |     else:
159 |         info('Detect {0} cells expressed at least {1} genes, results output to {2}.csv'.format(len_res, genenumber, filename))
160 |     
161 |     info("Finished calculating expression %s"%(datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S")))
162 | 
163 |     if args.keeptmp == 'True':
164 |         pass
165 |     else:
166 |         os.system('rm -rf %s_scTEtmp'%outname)
167 | 
168 |     timeend = datetime.datetime.now()
169 |     info("Done with %s\n" % timediff(timestart,timeend))
170 | 
if __name__ == '__main__':
    # Script entry point: run the pipeline; a Ctrl-C ends the run with a
    # short notice on stderr and exit status 0.
    try:
        main()
    except KeyboardInterrupt:
        print("User interrupt !", file=sys.stderr)
        sys.exit(0)
177 | 
178 | 
179 | 


--------------------------------------------------------------------------------
/bin/scTEATAC:
--------------------------------------------------------------------------------
  1 | #!/usr/bin/env python
  2 | desc = '''
  3 | 
  4 | The scATAC-seq data comes as three files, P1, P2 and the barcode, and there is no UMI
  5 | 
  6 | You can just align P1 and P2 with your favourite aligner (we prefer STAR with these settings):
  7 | 
  8 | ****
  9 | teopts=' --outFilterMultimapNmax 100 --winAnchorMultimapNmax 100 --outSAMmultNmax 1 --outSAMtype BAM SortedByCoordinate --twopassMode Basic --outWigType wiggle --outWigNorm RPM'
 10 | opts='--runRNGseed 42 --runThreadN 12 --readFilesCommand zcat '
 11 | 
 12 | genome_mm10='--genomeDir mm10_gencode_vM21_starsolo/SAindex'
 13 | genome_hg38='--genomeDir hg38_gencode_v30_starsolo/SAindex'
 14 | 
 15 | # p1 = read
 16 | # p2 = barcode and UMI
 17 | # Make sure you set the correct genome index;
 18 | STAR $opts $teopts $genome_hg38 --outFileNamePrefix ss.${out} --readFilesIn ${p1} ${p2}
 19 | ****
 20 | 
 21 | This script will then reprocess the BAM file, and put the BARCODE into CR SAM tag and spoof a UMI
 22 | 
 23 | The UMI is generated by incrementing the sequence, so there are up to 4^14 (~268 million) distinct UMIs.
 24 | There remains a chance of a clash, but it should be so rare
 25 | as to be basically impossible.
 26 | 
 27 | Keep in mind though that downstream UMI statistics are inaccurate
 28 | 
 29 | Require pysam
 30 | 
 31 | '''
 32 | import sys, os , time
 33 | import gzip
 34 | import argparse
 35 | import logging
 36 | try:
 37 |     import pysam
 38 | except ImportError:
 39 |     print('pack_scatacseq requires pysam')
 40 |     sys.quit()
 41 | 
 42 | sys.path.append(os.path.join(os.path.split(sys.argv[0])[0], '../'))
 43 | # from scTE.scatacseq import build_barcode_dict, parse_bam, load_expected_whitelist
 44 | from scTE.scatacseq import atacBam2bed,para_atacBam2bed
 45 | from scTE.base import *
 46 | 
 47 | # Command-line options;
 48 | def prepare_parser():
 49 |     exmp = 'scTEATAC -i input.bam -o out --genome mm10 -x mm10.te.idx'
 50 | 
 51 |     description = 'Package the BAM and BARCODE for the scATAC-seq data to make it suitable for scTE main pipeline'
 52 | 
 53 |     description = 'dummy'
 54 | 
 55 |     parser = argparse.ArgumentParser(prog='scTE_scatacseq', description=description, epilog=exmp)
 56 |     # Optional:
 57 |     optional = parser._action_groups.pop()
 58 | #     optional.add_argument('-e', '--expwhite', nargs=1, required=False, help='A txt file containing the expected whitelist of barcodes to correct the observed barcodes with')
 59 |     optional.add_argument('--ondisk', action='store_true', required=False, help='Do everything in memory (faster, but you will need a lot!, or do it on disk (slower, but no memory requirement')
 60 | 
 61 |     optional.add_argument('--min_counts', dest='countnumber',metavar='INT', type=int, default=1000,
 62 |                         help='Minimum number of counts required for a cell to pass filtering. Default: 2*min_genes')
 63 | 
 64 |     optional.add_argument('-CB', dest='CB', type=str, nargs='?', default='False', choices=['True','False'],
 65 |                         help='Set to false to ignore for cell barcodes, Default: False')
 66 | 
 67 |     optional.add_argument('-UMI', dest='UMI', type=str, nargs='?', default='False', choices=['True','False'],
 68 |                         help='Set to false to ignore for UMI. Default: False')
 69 | 
 70 |     optional.add_argument('--ignoreDuplicates', dest='noDup', type=str, nargs='?', default='True', choices=['True','False'],
 71 |                         help='If set, reads that have the same orientation and start position will be considered only once. If reads are paired, the mate’s position also has to coincide to ignore a read. Default: True')
 72 | 
 73 |     optional.add_argument('--keeptmp', dest='keeptmp', type=str, nargs='?', default='False', choices=['True','False'],
 74 |                         help='Keep the _scTEtmp file, which is useful for debugging. Default: False')
 75 | 
 76 |     optional.add_argument('-p','--thread', metavar='INT', dest='thread', type=int, default=1,
 77 |                         help='Number of threads to use, Default: 1')
 78 |     
 79 |     optional.add_argument('--hdf5', dest='hdf5', type=str, nargs='?', default='False', choices=['True','False'],
 80 |                         help='Save the output as .h5ad formatted file instead of csv file. Default: False')
 81 |                         
 82 |     required = parser.add_argument_group('required arguments')
 83 | 
 84 |     required.add_argument('-i','--input', dest='input', type=str, nargs='+', required=True,
 85 |                         help='Input file: BAM/SAM file')
 86 |                         
 87 | #     required.add_argument('-o', '--out', nargs=1, required=True, help='the output filename prefix')
 88 |     required.add_argument('-o','--out', dest='out', nargs='?', required=True, help='Output file prefix')
 89 | 
 90 |     required.add_argument('-x', dest='annoglb',nargs='+', required=True,
 91 |                     help='The filename of the indexed genome')
 92 | 
 93 | #     required.add_argument('-g','--genome', metavar='genome', dest='genome', type=str, nargs='?', default='mm10', choices=['hg38','mm10',], required=True,
 94 | #                         help='"hg38" for human, "mm10" for mouse')
 95 | 
 96 |     
 97 | #     required.add_argument('-f', '--infastq', nargs=1, required=True, help='THe FASTQ file containing the barcode read')
 98 | #     required.add_argument('-o', '--outbam', nargs=1, required=True, help='the BAM alignment file to save the result into')
 99 | #     required.add_argument('-w', '--obswhite', nargs=1, required=True, help='A txt file to save the observed barcode whitelist to')
100 | 
101 |     parser._action_groups.append(optional)
102 | 
103 |     logging.basicConfig(level=logging.DEBUG,
104 |                     format='%(levelname)-8s: %(message)s',
105 |                     datefmt='%m-%d %H:%M')
106 | 
107 |     parser.log = logging.getLogger('scTE_scatacseq')
108 | 
109 |     return parser
110 | 
def main():
    """Run the scTEATAC pipeline from the command line.

    Steps, in order:
      1. parse the options and load the annotation index (Readanno);
      2. convert the input BAM/SAM file(s) to BED (atacBam2bed, or
         para_atacBam2bed in a worker pool when several inputs are given);
      3. split the reads per chromosome and derive the barcode whitelist
         (splitAllChrs, or splitChr + filterCRs when -p > 1);
      4. look the reads up in the annotation (align) and concatenate the
         per-chromosome results into <out>_scTEtmp/o4;
      5. build the count table (Countexpression) and, unless --keeptmp True,
         delete the <out>_scTEtmp working directory.

    The pipeline helpers and datetime/multiprocessing/partial are expected to
    be provided by the star-import of scTE.base at the top of the script.
    """
    assert sys.version_info >= (3, 6), 'Python >=3.6 is required'

    timestart = datetime.datetime.now()

    def stamp():
        # Timestamp appended to the progress messages.
        return datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S")

    parser = prepare_parser()
    args = parser.parse_args()
    info = logging.info

    logger = parser.log

    # These options are parsed as the strings 'True'/'False' (or None when the
    # flag is given without a value); anything other than 'True' means False.
    args.CB = (args.CB == 'True')
    args.hdf5 = (args.hdf5 == 'True')
    args.noDup = (args.noDup == 'True')
    args.UMI = (args.UMI == 'True')

    # Fixed filtering parameters for the ATAC pipeline (not exposed as options).
    args.genenumber = 0
    args.cellnumber = 1e4

    logger.info('Arguments:')
    logger.info('out: %s' % args.out)
    logger.info('index: %s \n' % args.annoglb[0])
    logger.info("Minimum number of counts required = %s"% args.countnumber)
    logger.info("Number of threads = %s " % args.thread)

    # Temporary files are named after the last path component of --out.
    outname = args.out.split('/')[-1:][0]

    info("Loading the genome annotation index... %s" % stamp())
    allelement, chr_list, all_annot, glannot = Readanno(filename=outname, annoglb=args.annoglb[0])
    chr_list = [ k for k in chr_list if k not in ['chrM']]
    info("Finished loading the genome annotation index... %s \n" % stamp())

    info("Processing BAM/SAM files ...%s" % stamp())

    # A single comma-separated value is accepted as a list of inputs.
    if len(args.input) == 1 and ',' in args.input[0]:
        args.input = args.input[0].split(',')

    os.makedirs('%s_scTEtmp/o1' % outname, exist_ok=True)

    if len(args.input) > 1:
        info('Using para_atacBam2bed as more than 1 input BAM')
        partial_work = partial(para_atacBam2bed, CB=args.CB, out=outname, noDup=args.noDup)
        # The context manager releases the worker processes once map() returns.
        with multiprocessing.Pool(processes=args.thread) as pool:
            pool.map(partial_work, args.input)

        os.system('gunzip -c -f %s_scTEtmp/o0/*.bed.gz | gzip > %s_scTEtmp/o1/%s.bed.gz' % (outname,outname,outname))
    else:
        atacBam2bed(args.input[0], outname, CB=args.CB, UMI=args.UMI, noDup=args.noDup, num_threads=args.thread)
    info("Done BAM/SAM files processing ...%s \n" % stamp())

    info("Splitting ...%s" % stamp())
    if args.thread == 1: #Single thread path, mainly
        # This is useful for testing optimisations, as in the multiprocessing path the profile
        # just gets locked up in {method 'acquire' of '_thread.lock' objects}
        info('Executing single thread path')
        whitelist = splitAllChrs(chr_list, filename=outname, genenumber=args.genenumber, countnumber=args.countnumber, UMI=args.UMI)
    else:
        info('Executing multiple thread path with %s threads' % args.thread)
        partial_work = partial(splitChr, filename=outname, CB=args.CB, UMI=args.UMI)
        with multiprocessing.Pool(processes=args.thread) as pool:
            pool.map(partial_work, chr_list)
        whitelist = filterCRs(filename=outname, genenumber=args.genenumber, countnumber=args.countnumber)

    info("Finished processing sample files %s \n" % stamp())

    info("Fetching from the annotation index... %s" % stamp())
    if args.thread == 1: #Single thread path
        for chrom in chr_list:
            align(chr=chrom, filename=outname, all_annot=None, glannot=glannot, whitelist=whitelist)

    else: # Multiprocessing path: each worker receives a copy of the index
        partial_work = partial(align, filename=outname, all_annot=all_annot, glannot=None, whitelist=whitelist)
        with multiprocessing.Pool(processes=args.thread) as pool:
            pool.map(partial_work, chr_list)

    os.makedirs('%s_scTEtmp/o4' % outname, exist_ok=True)
    os.system('gunzip -c -f %s_scTEtmp/o3/%s.*.bed.gz | gzip > %s_scTEtmp/o4/%s.bed.gz' % (outname,outname,outname,outname))
    info("Done fetching... %s \n" % stamp())

    info("Calculating expression... %s" % stamp())
    len_res, genenumber, filename = Countexpression(filename=args.out, allelement=allelement, genenumber=args.genenumber, cellnumber=args.cellnumber,hdf5=args.hdf5)
    # Report the extension that matches the requested output format.
    if args.hdf5:
        info('Detect {0} cells expressed at least {1} genes, results output to {2}.h5ad'.format(len_res, genenumber, filename))
    else:
        info('Detect {0} cells expressed at least {1} genes, results output to {2}.csv'.format(len_res, genenumber, filename))
    info("Finished calculating expression %s" % stamp())

    if args.keeptmp != 'True':
        os.system('rm -rf %s_scTEtmp'%outname)

    timeend = datetime.datetime.now()
    info("Done with %s\n" % timediff(timestart,timeend))

    # NOTE: --ondisk is accepted by the parser, but nothing in this function
    # creates an on-disk temporary store, so there is nothing to clean up
    # here. The earlier cleanup call referenced an undefined name
    # (tmpfilename) and raised NameError whenever --ondisk was given.
213 | 
if __name__ == '__main__':
    # Script entry point: a Ctrl-C ends the run with a short notice on stderr
    # and exit status 0.
    try:
        main()
    except KeyboardInterrupt:
        print("User interrupt", file=sys.stderr)
        sys.exit(0)
220 | 


--------------------------------------------------------------------------------
/bin/scTEATAC_build:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/env python
 2 | 
 3 | import multiprocessing
 4 | from functools import partial
 5 | import logging
 6 | import os, sys, glob, datetime, time, gzip
 7 | import argparse
 8 | import collections
 9 | from math import log
10 | sys.path.append(os.path.join(os.path.split(sys.argv[0])[0], '../'))
11 | from scTE.miniglbase import genelist, glload, location
12 | 
# Chromosome names handled by default: '1'..'49' plus the sex and mitochondrial chromosomes.
chr_list = list(map(str, range(1, 50))) + ['X', 'Y', 'M']
14 | 
def read_opts(parser):
    """Parse the command line with `parser` and attach the logging helper.

    Returns the parsed namespace with an extra attribute ``info`` bound to
    ``logging.info``. No validation of the option values is done here.
    """
    parsed = parser.parse_args()
    setattr(parsed, 'info', logging.info)
    return parsed
30 | 
def genomeIndex(genome, outname):
    """Build the annotation index from a BED file and save it as '<outname>.idx'.

    Args:
        genome: path of the tab-separated BED file (columns 0-2 give the
            location, column 3 the annotation name). A name ending in '.gz'
            is loaded with gzip=True.
        outname: output prefix. When empty/None the input path itself is used
            as the prefix, so that omitting -o no longer writes 'None.idx'.
    """
    if not outname:
        outname = genome

    form = {'force_tsv': True, 'loc': 'location(chr=column[0], left=column[1], right=column[2])', 'annot': 3}
    if genome.endswith('.gz'):
        annot = genelist(genome, format=form, gzip=True)
    else:
        annot = genelist(genome, format=form)

    annot.save('%s.idx' % outname)
41 | 
def prepare_parser():
    """Build the argparse parser for scTEATAC_build.

    Returns:
        argparse.ArgumentParser: parser with the required -g/--genome option
        (listed under "required arguments" in --help) and the optional
        -o/--out prefix.
    """
    desc = "Build genome annotation index for scTE"

    exmp = "Example: scTEATAC_build -g Data/TE.bed -o mm10.te"

    parser = argparse.ArgumentParser(prog='scTEATAC_build', description=desc, epilog=exmp)

    # Temporarily remove the default optional group so that the required
    # group is printed first in --help.
    optional = parser._action_groups.pop()

    required = parser.add_argument_group('required arguments')

    required.add_argument('-g','--genome', metavar='genome', dest='genome', type=str, nargs='?', required=True,
                        help='Bed file of the genome window')

    optional.add_argument('-o','--out', dest='out', nargs='?', help='Output file prefix, Default: the genome name')

    parser._action_groups.append(optional)

    return parser
64 | 
def main():
    """Entry point: parse the options, build the index and log start/finish times."""
    timestart = datetime.datetime.now()
    args = read_opts(prepare_parser())

    assert sys.version_info >= (3, 6), 'Python >=3.6 is required'

    info = args.info
    stamp_format = "%Y-%m-%d %H:%M:%S"

    started = datetime.datetime.now().strftime(stamp_format)
    info("Building the scTE genome annotation index... %s" % started)

    genomeIndex(args.genome, args.out)

    finished = datetime.datetime.now().strftime(stamp_format)
    info("Done genome annotation index building... %s" % finished)
80 | 
if __name__ == '__main__':
    # Script entry point: a Ctrl-C ends the run with a short notice on stderr
    # and exit status 0.
    try:
        main()
    except KeyboardInterrupt:
        print("User interrupt !", file=sys.stderr)
        sys.exit(0)
87 | 
88 | 
89 | 
90 | 


--------------------------------------------------------------------------------
/bin/scTE_build:
--------------------------------------------------------------------------------
  1 | #!/usr/bin/env python
  2 | 
  3 | import multiprocessing
  4 | from functools import partial
  5 | import logging
  6 | import os, sys, glob, datetime, time, gzip
  7 | import argparse
  8 | import collections
  9 | from math import log
 10 | import numpy as np
 11 | sys.path.append(os.path.join(os.path.split(sys.argv[0])[0], '../'))
 12 | from scTE.miniglbase import genelist, glload, location
 13 | 
 14 | chr_list = [ str(k) for k in  list(range(1,50))] + ['X','Y', 'M']
 15 | 
 16 | def read_opts(parser):
 17 |     args = parser.parse_args()
 18 | 
 19 |     if args.mode not in ['inclusive', 'exclusive', 'nointron'] :
 20 |         logging.error("Counting mode %s not supported\n" % (args.mode))
 21 |         parser.print_help()
 22 |         sys.exit(1)
 23 | 
 24 |     if args.genome not in ['mm10','hg38','panTro6','macFas5','dm6','danRer11','xenTro9','other'] :
 25 |         logging.error("Counting genome %s not supported\n" % (args.genome))
 26 |         parser.print_help()
 27 |         sys.exit(1)
 28 | 
 29 |     args.info = logging.info
 30 |     return args
 31 | 
 32 | def cleanexon(exons):
 33 |     tmp = []
 34 |     for k in sorted(exons):
 35 |         E=[]
 36 |         for it in exons[k]:
 37 |             E+=list(range(it[1],it[2]))
 38 |         E=sorted(set(E))
 39 | 
 40 |         s=0
 41 |         #tmp=[]
 42 |         for id in range(0,len(E)-1):
 43 |             if E[id+1]-E[id] >1:
 44 |                 en=id
 45 |                 tmp.append({'loc': location(chr=it[0], left=E[s], right=E[en]), 'annot': k}) 
 46 |                 s=en+1
 47 |         tmp.append({'loc': location(chr=it[0], left=E[s], right=E[id+1]), 'annot': k})
 48 | 
 49 |     return tmp
 50 | 
 51 | def readGtf(filename):
 52 |     raw = {}
 53 |     clean = {}
 54 |     if '.gz' in filename:
 55 |         o = gzip.open(filename,'rb')
 56 |     else:
 57 |         o = open(filename,'r')
 58 | 
 59 |     for idx, l in enumerate(o):
 60 |         if '.gz' in filename:
 61 |             l=l.decode('ascii')
 62 |         if l.startswith('#'):
 63 |             continue
 64 |         t=l.strip().split('\t')
 65 |         if t[2]=='exon' or t[2]=='UTR':
 66 |             if 'chr' not in t[0]:
 67 |                 chr = 'chr' + t[0]
 68 |             chr = t[0]
 69 |             if chr.replace('chr','') not in chr_list:
 70 |                 continue
 71 |             left = int(t[3])
 72 |             riht =  int(t[4])
 73 |             
 74 |             if 'gene_name' not in t[8]:
 75 |                 continue
 76 |             
 77 |             name=t[8].split('gene_name "')[1].split('";')[0]
 78 | 
 79 |             if name not in raw:
 80 |                 raw[name] = []
 81 |             raw[name].append([chr,left,riht])
 82 | 
 83 |             if 'protein_coding' not in l and 'lincRNA' not in l:
 84 |                 continue
 85 |             if name not in clean:
 86 |                 clean[name] = []
 87 |             clean[name].append([chr,left,riht])
 88 |     o.close()
 89 |     
 90 |     return raw, clean
 91 | 
 92 | 
 93 | def genomeIndex(genome, mode, tefile, genefile, outname, geneurls, teurls):
 94 | 
 95 |     if not genefile: #Download twice for double check, as sometines wget may stops on the way
 96 |         os.system('wget -c -t 0 -T 5 %s'%geneurls)
 97 |         os.system('wget -c -t 0 -T 5 %s'%geneurls)
 98 |         genefilename = geneurls.split('/')[-1:][0]
 99 |     else:
100 |         genefilename = genefile
101 |     
102 |     a = readGtf(genefilename)
103 |     
104 |     raw = cleanexon(a[0]) 
105 |     clean = cleanexon(a[1])
106 | 
107 |     
108 |     # for costume chromsome
109 |     if tefile:
110 |         o = open(tefile,'rU')
111 |         for line in o:
112 |             chr = line.strip().split('\t')[0]
113 |             if chr not in chr_list:
114 |                 chr_list.append(chr)
115 |         o.close()
116 |     #======================
117 |     
118 |     if not tefile:
119 |         os.system('wget -c -t 0 -T 5 %s'%teurls)
120 |         os.system('wget -c -t 0 -T 5 %s'%teurls)
121 |         tefilename = teurls.split('/')[-1:][0]
122 |         teform ={'force_tsv': True, 'loc': 'location(chr=column[5], left=column[6], right=column[7])', 'annot': 10}
123 |     else:
124 |         tefilename = tefile
125 |     
126 |     gls = genelist()
127 |     gls.load_list(clean)
128 |     
129 |     if mode == 'exclusive':
130 |         gene = {}
131 |         for l in clean:
132 |             chr = l['loc'].loc['chr'] 
133 |             if chr not in chr_list:
134 |                 continue
135 |             left = l['loc']['left']
136 |             rite = l['loc']['right']
137 | 
138 |             left_buck = ((left-1)//10000) * 10000
139 |             right_buck = (rite//10000) * 10000
140 |             buckets_reqd = range(left_buck, right_buck+10000, 10000)
141 | 
142 |             if chr not in gene:
143 |                 gene[chr] = {}
144 | 
145 |             if buckets_reqd:
146 |                 for buck in buckets_reqd:
147 |                     if buck not in gene[chr]:
148 |                         gene[chr][buck] = []
149 |                     gene[chr][buck].append([left, rite])
150 | 
151 |         # Process the TEs:
152 |         noverlap = []
153 |         if '.gz' in tefilename:
154 |             o = gzip.open(tefilename,'rb')
155 |         else:
156 |             o = open(tefilename,'rU')
157 | 
158 |         for n, l in enumerate(o):
159 |             if '.gz' in tefilename:
160 |                 l = l.decode('ascii')
161 |             t = l.strip().split('\t')
162 |             
163 |             if not tefile:
164 |                 chr = t[5].replace('chr', '')
165 |                 left = int(t[6])
166 |                 rite = int(t[7])
167 |                 name = t[10]
168 |                 clas=t[11]
169 |                 if clas not in ['DNA','LINE','LTR','SINE','Satellite','Retroposon']:
170 |                     continue
171 |             else:
172 |                 chr = t[0].replace('chr', '')
173 |                 left = int(t[1])
174 |                 rite = int(t[2])
175 |                 name = t[3]
176 |                 
177 |             if chr not in chr_list:
178 |                 continue
179 |             if chr not in gene: # Should be very rare
180 |                 noverlap.append({'loc': location(chr=chr, left=left, right=rite), 'annot': name})
181 |                 continue
182 |             
183 |             left_buck = ((left-1)//10000) * 10000
184 |             right_buck = (rite//10000) * 10000
185 |             buckets_reqd = range(left_buck, right_buck+10000, 10000)
186 | 
187 |             if buckets_reqd:
188 |                 i = 1
189 |                 for buck in buckets_reqd:
190 |                     if buck not in gene[chr]:
191 |                         pass
192 |                     else:
193 |                         for k in gene[chr][buck]:
194 |                             if left < k[1] and rite > k[0]:
195 |                                 i = 0
196 |                                 break
197 |                         if i == 0: # already found an overlap, so quit out;
198 |                             break
199 |                 if i == 1:
200 |                     noverlap.append({'loc': location(chr=chr, left=left, right=rite), 'annot': name})
201 |         
202 |         TEs = genelist()
203 |         TEs.load_list(noverlap)
204 |         
205 |         genes = genelist() 
206 |         genes.load_list(raw)
207 | 
208 |         all_annot = genes + TEs
209 |         
210 |         if not outname:
211 |             all_annot.save('%s.exclusive.idx'%genome)
212 |             print('Done the index building, results output to %s.exclusive.idx \n'% genome)
213 |         else:
214 |             all_annot.save('%s.exclusive.idx'%outname)
215 |             print('Done the index building, results output to %s.exclusive.idx \n'% outname)
216 | 
217 |     elif mode == 'inclusive':
218 |         genes = genelist() 
219 |         genes.load_list(raw)
220 |         
221 |         
222 |         if not tefile:
223 |             teform ={'force_tsv': True, 'loc': 'location(chr=column[5], left=column[6], right=column[7])', 'annot': 10, 'clas':11}
224 |             if tefilename.endswith('.gz'):
225 |                 TEs = genelist(tefilename, format=teform, gzip=True)
226 |             else:
227 |                 TEs = genelist(tefilename, format=teform)
228 |             
229 |             keep=[]
230 |             for id,item in enumerate(TEs):
231 |                 if item['clas'] not in ['DNA','LINE','LTR','SINE','Satellite','Retroposon']:
232 |                     continue
233 |                 if item['loc']['chr'] not in chr_list:
234 |                     continue
235 |                 tmp=item.copy()
236 |                 del tmp['clas']
237 |                 keep.append(tmp)
238 |             gls=genelist()
239 |             gls.load_list(keep)
240 |         
241 |         else:
242 |             TEs = genelist(tefilename, format={'force_tsv': True, 'loc': 'location(chr=column[0], left=column[1], right=column[2])', 'annot':3})
243 |             gls = TEs.deepcopy()
244 | 
245 |         
246 |         all_annot = genes + gls
247 |         
248 |         if not outname:
249 |             all_annot.save('%s.inclusive.idx'%genome)
250 |             print('Done the index building, results output to %s.inclusive.idx \n'% genome)
251 |         else:
252 |             all_annot.save('%s.inclusive.idx'%outname)
253 |             print('Done the index building, results output to %s.inclusive.idx \n'% outname)
254 |     
255 |     elif mode == 'nointron':
256 |         raw_gene = a[0]
257 |         clean_gene ={}
258 |         for k in raw_gene:
259 |             if len(raw_gene[k]) == 1: # the gene only have one exon
260 |                 clean_gene[k] = [raw_gene[k][0]]
261 |             else:
262 |                 tmp = []
263 |                 for it in raw_gene[k]:
264 |                     tmp += it
265 |                     chr = [ item for item in tmp if 'chr' in str(item) ][0]
266 |                     tmp = [ int(item) for item in tmp if 'chr' not in str(item) ]
267 |                 clean_gene[k] = [[ chr, np.min(tmp), np.max(tmp)]]
268 |         clean = cleanexon(clean_gene)
269 |         
270 |         # adapted from 'exclusive' mode to remove the overlap reads
271 |         gene = {}
272 |         for l in clean:
273 |             chr = l['loc'].loc['chr'] 
274 |             if chr not in chr_list:
275 |                 continue
276 |             left = l['loc']['left']
277 |             rite = l['loc']['right']
278 | 
279 |             left_buck = ((left-1)//10000) * 10000
280 |             right_buck = (rite//10000) * 10000
281 |             buckets_reqd = range(left_buck, right_buck+10000, 10000)
282 | 
283 |             if chr not in gene:
284 |                 gene[chr] = {}
285 | 
286 |             if buckets_reqd:
287 |                 for buck in buckets_reqd:
288 |                     if buck not in gene[chr]:
289 |                         gene[chr][buck] = []
290 |                     gene[chr][buck].append([left, rite])
291 | 
292 |         # Process the TEs:
293 |         noverlap = []
294 |         if '.gz' in tefilename:
295 |             o = gzip.open(tefilename,'rb')
296 |         else:
297 |             o = open(tefilename,'rU')
298 | 
299 |         for n, l in enumerate(o):
300 |             if '.gz' in tefilename:
301 |                 l = l.decode('ascii')
302 |             t = l.strip().split('\t')
303 |             
304 |             if not tefile:
305 |                 chr = t[5].replace('chr', '')
306 |                 left = int(t[6])
307 |                 rite = int(t[7])
308 |                 name = t[10]
309 |                 clas=t[11]
310 |                 if clas not in ['DNA','LINE','LTR','SINE','Satellite','Retroposon']:
311 |                     continue
312 |             else:
313 |                 chr = t[0].replace('chr', '')
314 |                 left = int(t[1])
315 |                 rite = int(t[2])
316 |                 name = t[3]
317 |                 
318 |             if chr not in chr_list:
319 |                 continue
320 |             if chr not in gene: # Should be very rare
321 |                 noverlap.append({'loc': location(chr=chr, left=left, right=rite), 'annot': name})
322 |                 continue
323 |             
324 |             left_buck = ((left-1)//10000) * 10000
325 |             right_buck = (rite//10000) * 10000
326 |             buckets_reqd = range(left_buck, right_buck+10000, 10000)
327 | 
328 |             if buckets_reqd:
329 |                 i = 1
330 |                 for buck in buckets_reqd:
331 |                     if buck not in gene[chr]:
332 |                         pass
333 |                     else:
334 |                         for k in gene[chr][buck]:
335 |                             if left < k[1] and rite > k[0]:
336 |                                 i = 0
337 |                                 break
338 |                         if i == 0: # already found an overlap, so quit out;
339 |                             break
340 |                 if i == 1:
341 |                     noverlap.append({'loc': location(chr=chr, left=left, right=rite), 'annot': name})
342 |         
343 |         TEs = genelist()
344 |         TEs.load_list(noverlap)
345 |         
346 |         genes = genelist() 
347 |         genes.load_list(raw)
348 | 
349 |         all_annot = genes + TEs
350 |         
351 |         if not outname:
352 |             all_annot.save('%s.nointron.idx'%genome)
353 |             print('Done the index building, results output to %s.nointron.idx \n'% genome)
354 |         else:
355 |             all_annot.save('%s.nointron.idx'%outname)
356 |             print('Done the index building, results output to %s.nointron.idx \n'% outname)
357 |     
358 |     if not tefile:
359 |         os.system('rm %s '% tefilename)
360 |     if not genefile:
361 |         os.system('rm %s'%genefilename)
362 | 
def prepare_parser():
    """Build the argparse parser for scTE_build.

    Returns:
        argparse.ArgumentParser: parser with the options -te, -gene,
        -m/--mode (default 'exclusive'), -o/--out and -g/--genome
        (default 'other'). -te and -gene collect a list of values.
    """
    desc = "Build genome annotation index for scTE"

    exmp = "Example: scTE_build -te Data/TE.bed -gene Data/Gene.gtf"

    parser = argparse.ArgumentParser(prog='scTE_build', description=desc, epilog=exmp)

    # Work on the default optional group and put it back once it is filled.
    optional = parser._action_groups.pop()

    optional.add_argument('-te', dest='tefile', nargs='+',
                        help='Six columns bed file for transposable elements annotation. Need the -gene option.')

    optional.add_argument('-gene', dest='genefile', nargs='+',
                        help='Gtf file for genes annotation. Need the -te option.')

    optional.add_argument('-m','--mode', dest='mode', type=str, nargs='?', default='exclusive', choices=['inclusive','exclusive','nointron'],
                        help='How to count TEs expression: inclusive (include all reads that can map to TEs), '
                             'or exclusive (exclude the reads that can map to the exon of protein coding genes and lncRNAs), '
                             'or nointron (exclude the reads that can map to the exons and intron of genes). '
                             'DEFAULT: exclusive')

    optional.add_argument('-o','--out', dest='out', nargs='?', help='Output file prefix, Default: the genome name')

    optional.add_argument('-g','--genome', dest='genome', type=str, nargs='?', default='other', choices=['other','mm10','hg38','panTro6','macFas5','dm6','danRer11','xenTro9'],
                          help='Possible Genomes: mm10 (mouse), hg38 (human), panTro6 (Chimpanzee), macFas5 (Macaca fascicularis), dm6 (Drosophila melanogaster), danRer11 (Zebrafish), xenTro9 (Xenopus tropicalis)')

    parser._action_groups.append(optional)

    return parser
398 | 
399 | def main():
400 |     timestart=datetime.datetime.now()
401 |     args=read_opts(prepare_parser())
402 |     
403 |     print(args)
404 | #     if not args.genome:
405 | #         print('good')
406 | #     
407 | #     print(args.genome)
408 | 
409 |     assert sys.version_info >= (3, 6), 'Python >=3.6 is required'
410 | 
411 |     info = args.info
412 | 
413 |     info("Building the scTE genome annotation index... %s"%(datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S")))
414 |     
415 |     if args.tefile:
416 |         tefile = args.tefile[0]
417 |     else:
418 |         tefile = None
419 |         
420 |     if args.genefile:
421 |         genefile = args.genefile[0]
422 |     else:
423 |         genefile = None
424 |     
425 |     if args.genome == 'mm10':
426 |         genomeIndex(args.genome,args.mode,tefile,genefile, args.out,
427 |                     'ftp://ftp.ebi.ac.uk/pub/databases/gencode/Gencode_mouse/release_M21/gencode.vM21.annotation.gtf.gz',
428 |                     'http://hgdownload.soe.ucsc.edu/goldenPath/mm10/database/rmsk.txt.gz')
429 |     
430 |     elif args.genome == 'hg38':
431 |         genomeIndex(args.genome,args.mode,tefile,genefile, args.out,
432 |                     'ftp://ftp.ebi.ac.uk/pub/databases/gencode/Gencode_human/release_30/gencode.v30.annotation.gtf.gz',
433 |                     'http://hgdownload.soe.ucsc.edu/goldenPath/hg38/database/rmsk.txt.gz')
434 |     
435 |     elif args.genome == 'panTro6':
436 |         genomeIndex(args.genome,args.mode,tefile,genefile, args.out,
437 |                     'http://ftp.ensembl.org/pub/release-103/gtf/pan_troglodytes/Pan_troglodytes.Pan_tro_3.0.103.gtf.gz',
438 |                     'https://hgdownload.soe.ucsc.edu/goldenPath/panTro6/database/rmsk.txt.gz')
439 | 
440 |     elif args.genome == 'macFas5':
441 |         genomeIndex(args.genome,args.mode,tefile,genefile, args.out,
442 |                     'http://ftp.ensembl.org/pub/release-102/gtf/macaca_fascicularis/Macaca_fascicularis.Macaca_fascicularis_5.0.102.gtf.gz',
443 |                     'http://hgdownload.soe.ucsc.edu/goldenPath/macFas5/database/rmsk.txt.gz')
444 | 
445 |     elif args.genome == 'dm6':
446 |         genomeIndex(args.genome,args.mode,tefile,genefile, args.out,
447 |                     'http://ftp.ensembl.org/pub/release-103/gtf/drosophila_melanogaster/Drosophila_melanogaster.BDGP6.32.103.gtf.gz',
448 |                     'http://hgdownload.soe.ucsc.edu/goldenPath/dm6/database/rmsk.txt.gz')
449 | 
450 |     elif args.genome == 'danRer11':
451 |         genomeIndex(args.genome,args.mode,tefile,genefile, args.out,
452 |                     'http://ftp.ensembl.org/pub/release-103/gtf/danio_rerio/Danio_rerio.GRCz11.103.gtf.gz',
453 |                     'https://hgdownload.soe.ucsc.edu/goldenPath/danRer11/database/rmsk.txt.gz')
454 | 
455 |     elif args.genome == 'xenTro9':
456 |         genomeIndex(args.genome,args.mode,tefile,genefile, args.out,
457 |                     'http://ftp.ensembl.org/pub/release-103/gtf/xenopus_tropicalis/Xenopus_tropicalis.Xenopus_tropicalis_v9.1.103.gtf.gz',
458 |                     'https://hgdownload.soe.ucsc.edu/goldenPath/xenTro9/database/rmsk.txt.gz')
459 | 
460 |     elif args.genome == 'other':
461 |         genomeIndex(args.genome,args.mode,tefile,genefile, args.out,'No path','No path')
462 | 
463 | 
464 |     info("Done genome annotation index building... %s"%(datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S")))
465 | 
466 | if __name__ == '__main__':
467 |     try:
468 |         main()
469 |     except KeyboardInterrupt:
470 |         sys.stderr.write("User interrupt !\n")
471 |         sys.exit(0)
472 | 
473 | 
474 | 
475 | 


--------------------------------------------------------------------------------
/docs/scTE.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/JiekaiLab/scTE/566f6ab3baaf76cd006ab965edc08e4576eb73c9/docs/scTE.png


--------------------------------------------------------------------------------
/example/Figure3/0.cluster_scripts/scte/do_batch.sh:
--------------------------------------------------------------------------------
 1 | 
 2 | 
 3 | # Submit one scTE PBS job (scte.sh) per BAM whose <name>.csv.gz does not exist yet.
 4 | for f in ../starsolo*/*.bam
 5 | do
 6 |     [ -e "$f" ] || continue   # glob matched nothing: do not submit the literal pattern
 7 |     root=$(basename "$f")
 8 | 
 9 |     # Strip the STARsolo suffix, then any plain .bam; dots are escaped so they match literally
10 |     bf=$(echo "$root" | sed -r 's#\.Aligned\.sortedByCoord\.out\.bam##g' | sed 's#\.bam##g')
11 |     tt="$bf.csv.gz" # outfile
12 |     if [ ! -f "$tt" ] # Check not already done
13 |     then
14 |         echo scTE "$tt"
15 |         qsub -N "scte.$bf" -v in="$f",out="$bf" scte.sh
16 |         sleep 1
17 |     fi
18 | done


--------------------------------------------------------------------------------
/example/Figure3/0.cluster_scripts/scte/scte.sh:
--------------------------------------------------------------------------------
 1 | #PBS -l nodes=1:ppn=2,mem=64gb
 2 | #PBS -j oe
 3 | #PBS -o ${out}.out
 4 | #PBS -q batch
 5 | #PBS -V 
 6 | # Run scTE on one BAM. Uses the variables `in` (BAM path) and `out` (output prefix),
 7 | # which do_batch.sh passes with `qsub -v in=...,out=...`.
 8 | cd "$PBS_O_WORKDIR" || exit 1
 9 | 
10 | genome_mm10='/data3/lab-andrew/scTE/scte_indeces/mm10.exclusive.idx'
11 | genome_hg38='/data3/lab-andrew/scTE/scte_indeces/hg38.exclusive.idx'
12 | 
13 | # Compress only when scTE succeeded: a .csv.gz from a failed run would make
14 | # do_batch.sh treat the sample as already done.
15 | python3 /share/apps/genomics/unstable/scTE/bin/scTE -i "${in}" -x "$genome_mm10" -g mm10  -p 1 -o "${out}" && gzip "${out}.csv"


--------------------------------------------------------------------------------
/example/Figure3/0.cluster_scripts/starsolo/do_batch.sh:
--------------------------------------------------------------------------------
 1 | 
 2 | 
 3 | # Submit one STARsolo PBS job (starsolo.sh) per read-1 FASTQ whose output BAM does not exist yet.
 4 | for f in ../fqs/*.p1.fq.gz
 5 | do
 6 |     [ -e "$f" ] || continue   # glob matched nothing: do not submit the literal pattern
 7 |     root=$(basename "$f")
 8 | 
 9 |     # Dots are escaped so the suffix is matched literally
10 |     bf=$(echo "$root" | sed -r 's#\.p1\.fq\.gz##g')
11 |     p2=$(echo "$f" | sed 's#\.p1\.fq\.gz#.p2.fq.gz#g')
12 |     tt="ss.$bf.Aligned.sortedByCoord.out.bam" # outfile
13 |     if [ ! -f "$tt" ] # Check not already done
14 |     then
15 |         echo STARsolo "$tt"
16 |         qsub -N "solo.$bf" -v p1="$f",p2="$p2",out="$bf." starsolo.sh
17 |         sleep 2
18 |     fi
19 | done
19 | 


--------------------------------------------------------------------------------
/example/Figure3/0.cluster_scripts/starsolo/starsolo.sh:
--------------------------------------------------------------------------------
 1 | #PBS -N ss.${out}.starsolo
 2 | #PBS -l nodes=1:ppn=32
 3 | #PBS -l mem=32gb
 4 | #PBS -j oe
 5 | #PBS -o ss.${out}.out
 6 | #PBS -q batch
 7 | #PBS -V 
 8 | # Align one sample with STARsolo. Uses the variables p1, p2 and out, which
 9 | # do_batch.sh passes with `qsub -v p1=...,p2=...,out=...`.
10 | cd "$PBS_O_WORKDIR" || exit 1   # never run STAR / rm -r from an unexpected directory
11 | 
12 | ulimit -n 2000
13 | 
14 | whitelist='--soloCBwhitelist /data3/lab-andrew/scTE/scrnaseq_barcodes/version1.txt' # Make sure you get the right barcode version
15 | 
16 | # Required arguments;
17 | mods='--soloType Droplet --soloFeatures Gene --soloBarcodeReadLength 1 --soloCBlen 14 --soloUMIstart 15 '
18 | teopts=' --outFilterMultimapNmax 100 --winAnchorMultimapNmax 100 --outSAMmultNmax 1 --outSAMtype BAM SortedByCoordinate --twopassMode Basic' 
19 | opts='--runRNGseed 42 --runThreadN 32 --readFilesCommand zcat '
20 | 
21 | # required for scTE:
22 | sam_att='--outSAMattributes NH HI AS nM CR CY UR UY'
23 | 
24 | genome_mm10='--genomeDir /data3/lab-andrew/scTE/custom_indeces/mm10_gencode_vM21_starsolo/SAindex'
25 | genome_hg38='--genomeDir /data3/lab-andrew/scTE/custom_indeces/hg38_gencode_v30_starsolo/SAindex'
26 | 
27 | # p1 = read
28 | # p2 = barcode and UMI
29 | # Make sure you set the correct genome index;
30 | # The option variables are left unquoted on purpose: each holds several words that
31 | # must be split into separate STAR arguments. Only the paths are quoted.
32 | STAR $opts $teopts $mods $whitelist $sam_att $genome_mm10 --outFileNamePrefix "ss.${out}" --readFilesIn "${p1}" "${p2}"
33 | 
34 | # Remove the _STARgenome / _STARpass1 / _STARtmp directories left under the output prefix
35 | rm -r "ss.${out}_STARgenome"
36 | rm -r "ss.${out}_STARpass1"
37 | rm -r "ss.${out}_STARtmp"


--------------------------------------------------------------------------------
/example/Figure3/1.pack.py:
--------------------------------------------------------------------------------
  1 | """
  2 | 
  3 | Pack the scRNA-seq data using scanpy, prep for scran normalisation
  4 | 
  5 | """
  6 | 
  7 | import logging, matplotlib, os, sys
  8 | import scanpy as sc
  9 | import numpy as np
 10 | import scipy as sp
 11 | import pandas as pd
 12 | import matplotlib.pyplot as plt
 13 | from anndata import AnnData
 14 | from matplotlib import rcParams
 15 | from matplotlib import colors
 16 | import seaborn as sb
 17 | from rpy2.robjects.packages import importr
 18 | plt.rcParams['figure.figsize'] = (8,8)  # default matplotlib figure size
 19 | sc.settings.verbosity = 3  # scanpy logging verbosity
 20 | sc.set_figure_params(dpi=200, dpi_save=200)  # resolution of displayed / saved figures
 21 | matplotlib.rcParams['pdf.fonttype'] = 42  # Type 42 (TrueType) fonts in saved PDFs
 22 | matplotlib.rcParams['font.size'] = 10  # set after sc.set_figure_params; NOTE(review): presumably to override its font size - confirm
 23 | sc.settings.autoshow = False  # plots below are written with save=..., show=False
 24 | 
 25 | def sparsify(filename):
 26 |     data = pd.read_csv(filename, index_col=0, header=0)
 27 |     genes = data.columns
 28 |     cells = data.index
 29 |     data = sp.sparse.csr_matrix(data.to_numpy())
 30 |     data.astype('float32')
 31 | 
 32 |     '''
 33 |     oh = open('gene_names.{0}.tsv'.format(os.path.split(filename)[1]), 'w')
 34 |     for g in genes:
 35 |         oh.write('%s\n' % g)
 36 |     oh.close()
 37 |     '''
 38 | 
 39 |     print('Loaded {0}'.format(filename))
 40 |     ad = AnnData(data, obs={'obs_names': cells}, var={'var_names': genes})
 41 |     del data
 42 |     return ad
 43 | 
 44 | sam1 = sparsify("../scte_data/ss.gastrulation_E6.5_Sam1.csv.gz")    ; sam1.obs['stage'] = "E6.5"   ; sam1.obs['replicate'] = "E6.5-1"
 45 | sam2 = sparsify("../scte_data/ss.gastrulation_E6.5_Sam5.csv.gz")    ; sam2.obs['stage'] = "E6.5"   ; sam2.obs['replicate'] = "E6.5-2"
 46 | #sam3 = sparsify("../scte_data/ss.gastrulation_E6.5_Sam18.csv.gz")   ; sam3.obs['stage'] = "E6.5"   ; sam3.obs['replicate'] = "E6.5-3"
 47 | #sam4 = sparsify("../scte_data/ss.gastrulation_E6.75_Sam7.csv.gz")   ; sam4.obs['stage'] = "E6.75"  ; sam4.obs['replicate'] = "E6.75-1"
 48 | sam5 = sparsify("../scte_data/ss.gastrulation_E7.0_Sam10.csv.gz")   ; sam5.obs['stage'] = "E7.0"   ; sam5.obs['replicate'] = "E7.0-1"
 49 | #sam6 = sparsify("../scte_data/ss.gastrulation_E7.0_Sam15.csv.gz")   ; sam6.obs['stage'] = "E7.0"   ; sam6.obs['replicate'] = "E7.0-3"
 50 | sam7 = sparsify("../scte_data/ss.gastrulation_E7.0_Sam30.csv.gz")   ; sam7.obs['stage'] = "E7.0"   ; sam7.obs['replicate'] = "E7.0-4"
 51 | sam8 = sparsify("../scte_data/ss.gastrulation_E7.0_Sam31.csv.gz")   ; sam8.obs['stage'] = "E7.0"   ; sam8.obs['replicate'] = "E7.0-5"
 52 | sam9 = sparsify("../scte_data/ss.gastrulation_E7.0_Sam32.csv.gz")   ; sam9.obs['stage'] = "E7.0"   ; sam9.obs['replicate'] = "E7.0-6"
 53 | sam10 = sparsify("../scte_data/ss.gastrulation_E7.25_Sam23.csv.gz") ; sam10.obs['stage'] = "E7.25" ; sam10.obs['replicate'] = "E7.25-2"
 54 | sam11 = sparsify("../scte_data/ss.gastrulation_E7.25_Sam26.csv.gz") ; sam11.obs['stage'] = "E7.25" ; sam11.obs['replicate'] = "E7.25-3"
 55 | sam12 = sparsify("../scte_data/ss.gastrulation_E7.25_Sam27.csv.gz") ; sam12.obs['stage'] = "E7.25" ; sam12.obs['replicate'] = "E7.25-4"
 56 | sam13 = sparsify("../scte_data/ss.gastrulation_E7.5_Sam2.csv.gz")   ; sam13.obs['stage'] = "E7.5"  ; sam13.obs['replicate'] = "E7.5-1"
 57 | sam14 = sparsify("../scte_data/ss.gastrulation_E7.5_Sam3.csv.gz")   ; sam14.obs['stage'] = "E7.5"  ; sam14.obs['replicate'] = "E7.5-2"
 58 | sam15 = sparsify("../scte_data/ss.gastrulation_E7.5_Sam4.csv.gz")   ; sam15.obs['stage'] = "E7.5"  ; sam15.obs['replicate'] = "E7.5-3"
 59 | sam16 = sparsify("../scte_data/ss.gastrulation_E7.5_Sam6.csv.gz")   ; sam16.obs['stage'] = "E7.5"  ; sam16.obs['replicate'] = "E7.5-4"
 60 | sam17 = sparsify("../scte_data/ss.gastrulation_E7.5_Sam19.csv.gz")  ; sam17.obs['stage'] = "E7.5"  ; sam17.obs['replicate'] = "E7.5-5"
 61 | sam18 = sparsify("../scte_data/ss.gastrulation_E7.5_Sam20.csv.gz")  ; sam18.obs['stage'] = "E7.5"  ; sam18.obs['replicate'] = "E7.5-6"
 62 | sam19 = sparsify("../scte_data/ss.gastrulation_E7.75_Sam8.csv.gz")  ; sam19.obs['stage'] = "E7.75" ; sam19.obs['replicate'] = "E7.75-1"
 63 | sam20 = sparsify("../scte_data/ss.gastrulation_E7.75_Sam9.csv.gz")  ; sam20.obs['stage'] = "E7.75" ; sam20.obs['replicate'] = "E7.75-2"
 64 | sam21 = sparsify("../scte_data/ss.gastrulation_E7.75_Sam12.csv.gz") ; sam21.obs['stage'] = "E7.75" ; sam21.obs['replicate'] = "E7.75-3"
 65 | sam22 = sparsify("../scte_data/ss.gastrulation_E7.75_Sam13.csv.gz") ; sam22.obs['stage'] = "E7.75" ; sam22.obs['replicate'] = "E7.75-4"
 66 | sam23 = sparsify("../scte_data/ss.gastrulation_E8.0_Sam16.csv.gz")  ; sam23.obs['stage'] = "E8.0"  ; sam23.obs['replicate'] = "E8.0-1"
 67 | sam24 = sparsify("../scte_data/ss.gastrulation_E8.0_Sam33.csv.gz")  ; sam24.obs['stage'] = "E8.0"  ; sam24.obs['replicate'] = "E8.0-2"
 68 | sam25 = sparsify("../scte_data/ss.gastrulation_E8.0_Sam34.csv.gz")  ; sam25.obs['stage'] = "E8.0"  ; sam25.obs['replicate'] = "E8.0-3"
 69 | sam26 = sparsify("../scte_data/ss.gastrulation_E8.0_Sam35.csv.gz")  ; sam26.obs['stage'] = "E8.0"  ; sam26.obs['replicate'] = "E8.0-4"
 70 | sam27 = sparsify("../scte_data/ss.gastrulation_E8.25_Sam24.csv.gz") ; sam27.obs['stage'] = "E8.25" ; sam27.obs['replicate'] = "E8.25-1"
 71 | sam28 = sparsify("../scte_data/ss.gastrulation_E8.25_Sam25.csv.gz") ; sam28.obs['stage'] = "E8.25" ; sam28.obs['replicate'] = "E8.25-2"
 72 | sam29 = sparsify("../scte_data/ss.gastrulation_E8.25_Sam28.csv.gz") ; sam29.obs['stage'] = "E8.25" ; sam29.obs['replicate'] = "E8.25-3"
 73 | sam30 = sparsify("../scte_data/ss.gastrulation_E8.5_Sam17.csv.gz")  ; sam30.obs['stage'] = "E8.5"  ; sam30.obs['replicate'] = "E8.5-1"
 74 | sam31 = sparsify("../scte_data/ss.gastrulation_E8.5_Sam29.csv.gz")  ; sam31.obs['stage'] = "E8.5"  ; sam31.obs['replicate'] = "E8.5-2"
 75 | sam32 = sparsify("../scte_data/ss.gastrulation_E8.5_Sam36.csv.gz")  ; sam32.obs['stage'] = "E8.5"  ; sam32.obs['replicate'] = "E8.5-3"
 76 | sam33 = sparsify("../scte_data/ss.gastrulation_E8.5_Sam37.csv.gz")  ; sam33.obs['stage'] = "E8.5"  ; sam33.obs['replicate'] = "E8.5-4"
 77 | sam34 = sparsify("../scte_data/ss.gastrulation_mixed_Sam21.csv.gz") ; sam34.obs['stage'] = "mixed" ; sam34.obs['replicate'] = "mixed-1"
 78 | sam35 = sparsify("../scte_data/ss.gastrulation_mixed_Sam22.csv.gz") ; sam35.obs['stage'] = "mixed" ; sam35.obs['replicate'] = "mixed-2"
 79 | 
 80 | print('Loaded Samples...')
 81 | 
 82 | # Do very simple prefiltering:
 83 | samples = [sam1, sam2, #sam3, sam4,
 84 |             sam5, #sam6,
 85 |             sam7, sam8, sam9, sam10,
 86 |             sam11, sam12, sam13, sam14, sam15,
 87 |             sam16, sam17, sam18, sam19, sam20,
 88 |             sam21, sam22, sam23, sam24, sam25,
 89 |             sam26, sam27, sam28, sam29, sam30,
 90 |             sam31, sam32, sam33, sam34, sam35]
 91 | 
 92 | # Quick pre-filtering, these should be low, otherwise it can mess up downstream analysis, but also can get rid of trivial uninteresting things
 93 | [sc.pp.filter_cells(sam, min_genes=2000) for sam in samples]
 94 | [sc.pp.filter_cells(sam, max_counts=100000) for sam in samples]
 95 | [sc.pp.filter_cells(sam, min_counts=5000) for sam in samples]
 96 | # Do not filter gene here; concatenate joins on the union, so if a gene fails in a single sample, it will also be deleted from all other samples;
 97 | 
 98 | print('Concatenating')
 99 | adata = sam1.concatenate(samples[1:])
100 | 
101 | del samples
102 | 
103 | adata.X = adata.X.astype('float32')
104 | 
105 | print(adata)
106 | 
107 | sc.pl.violin(adata, ['n_genes', 'n_counts'], groupby='replicate', size=0, log=False, cut=0, show=False, save='qc1-pre-norm-replicates.pdf')
108 | 
109 | # Base filtering for trivial QC failures:
110 | sc.pp.filter_cells(adata, min_genes=3000)
111 | sc.pp.filter_cells(adata, min_counts=8000)
112 | sc.pp.filter_cells(adata, max_counts=100000)
113 | sc.pp.filter_genes(adata, min_cells=50) # Only filter genes here;
114 | 
115 | print('Number of cells after gene filter: {:d}'.format(adata.n_obs))
116 | 
117 | #sc.pl.violin(adata, ['n_genes','n_counts'], groupby='stage', size=0, log=False, cut=0, show=False, save='qc1.pdf')
118 | sc.pl.violin(adata, ['n_genes','n_counts'], groupby='replicate', size=0, log=False, cut=0, show=False, save='qc1-replicates.pdf')
119 | 
120 | p = sb.distplot(adata.obs['n_counts'], kde=False)
121 | p.get_figure().savefig('figures/distplot_ncounts1.pdf')
122 | p = sb.distplot(adata.obs['n_counts'][adata.obs['n_counts']<4000], kde=False, bins=60)
123 | p.get_figure().savefig('figures/distplot_ncounts2.pdf')
124 | p = sb.distplot(adata.obs['n_counts'][adata.obs['n_counts']>10000], kde=False, bins=60)
125 | p.get_figure().savefig('figures/distplot_ncounts3.pdf')
126 | #Thresholding decision: genes
127 | p = sb.distplot(adata.obs['n_genes'], kde=False, bins=60)
128 | p.get_figure().savefig('figures/distplot_ngenes1.pdf')
129 | p = sb.distplot(adata.obs['n_genes'][adata.obs['n_genes']<2000], kde=False, bins=60)
130 | p.get_figure().savefig('figures/distplot_ngenes2.pdf')
131 | 
132 | print('Total number of cells: {:d}'.format(adata.n_obs))
133 | print('Total number of genes: {:d}'.format(adata.n_vars))
134 | 
135 | adata.write('./raw_data.h5ad')
136 | 


--------------------------------------------------------------------------------
/example/Figure3/2.norm_and_learn.py:
--------------------------------------------------------------------------------
 1 | import logging, matplotlib, os, sys
 2 | import anndata
 3 | import scanpy as sc
 4 | import numpy as np
 5 | import scipy as sp
 6 | import pandas as pd
 7 | import matplotlib.pyplot as plt
 8 | from matplotlib import rcParams
 9 | from matplotlib import colors
10 | import seaborn as sb
11 | plt.rcParams['figure.figsize']=(8,8) #rescale figures
12 | sc.settings.verbosity = 3
13 | sc.set_figure_params(dpi=200, dpi_save=300)
14 | 
15 | adata = sc.read('raw_data.h5ad')
16 | sc.pp.normalize_total(adata)
17 | sc.pp.log1p(adata)
18 | print(adata)
19 | 
20 | print('Number of cells: {:d}'.format(adata.n_obs))
21 | 
22 | sc.pp.highly_variable_genes(adata, flavor='cell_ranger', n_top_genes=2000)
23 | sc.pl.highly_variable_genes(adata, show=False, save='highly_variable.pdf')
24 | 
25 | # Calculate the visualizations
26 | sc.pp.pca(adata, n_comps=20, use_highly_variable=True, svd_solver='arpack') # PC=20 from Nature paper
27 | sc.pp.neighbors(adata)
28 | sc.tl.tsne(adata, n_jobs=3)
29 | sc.tl.umap(adata, min_dist=0.6)
30 | sc.tl.diffmap(adata)
31 | 
32 | sc.pl.pca_variance_ratio(adata, log=True, show=False, save='pca_variance.pdf')
33 | 
34 | # Perform clustering - using highly variable genes
35 | sc.tl.leiden(adata, resolution=1.0, key_added='leiden_r1')
36 | sc.tl.leiden(adata, resolution=0.5, key_added='leiden_r0.5')
37 | sc.tl.leiden(adata, resolution=0.4, key_added='leiden_r0.4')
38 | sc.tl.leiden(adata, resolution=0.35, key_added='leiden_r0.35')
39 | sc.tl.leiden(adata, resolution=0.3, key_added='leiden_r0.3')
40 | sc.tl.leiden(adata, resolution=0.25, key_added='leiden_r0.25')
41 | sc.tl.leiden(adata, resolution=0.2, key_added='leiden_r0.2')
42 | sc.tl.leiden(adata, resolution=0.1, key_added='leiden_r0.1')
43 | 
44 | adata.write('./learned.h5ad')
45 | 
46 | todraw = ['leiden_r1', 'leiden_r0.5', 'leiden_r0.4', 'leiden_r0.35', 'leiden_r0.3', 'leiden_r0.25', 'leiden_r0.2', 'leiden_r0.1', 'replicate']
47 | 
48 | #Visualize the clustering and how this is reflected by different technical covariates
49 | sc.pl.tsne(adata, color=todraw, size=10, legend_loc='on data', show=False, save='tsne.pdf')
50 | sc.pl.umap(adata, color=todraw, size=10, legend_loc='on data', show=False, save='umap.pdf')
51 | 
52 | 


--------------------------------------------------------------------------------
/example/Figure3/3.diffexp.py:
--------------------------------------------------------------------------------
 1 | import logging, matplotlib, os, sys
 2 | import scanpy as sc
 3 | import matplotlib.pyplot as plt
 4 | from matplotlib import rcParams
 5 | from matplotlib import colors
 6 | import pandas as pd
 7 | from glbase3 import genelist
 8 | plt.rcParams['figure.figsize']=(8,8)
 9 | sc.settings.verbosity = 3
10 | sc.set_figure_params(dpi=200, dpi_save=200)
11 | matplotlib.rcParams['pdf.fonttype']=42
12 | matplotlib.rcParams['font.size']=10
13 | 
14 | sc.settings.figdir = 'diffexp'
15 | 
16 | adata = sc.read('./learned.h5ad')
17 | 
18 | sc.tl.rank_genes_groups(adata, 'leiden_r0.5', method='wilcoxon', n_genes=3000)
19 | adata.write('./de.h5ad')
20 | 
21 | adata = sc.read('./de.h5ad')
22 | 
23 | sc.pl.rank_genes_groups(adata, n_genes=25, sharey=True, show=False, save='genes-top25.pdf')
24 | sc.pl.rank_genes_groups(adata, key='rank_genes_groups', show=False, save='genes.pdf')
25 | sc.pl.rank_genes_groups_dotplot(adata, key='rank_genes_groups', show=False, save='genes-top25.pdf')
26 | 
27 | #print(pd.DataFrame(adata.uns['rank_genes_groups']))
28 | 
29 | print(pd.DataFrame(adata.uns['rank_genes_groups']['names']))
30 | 
31 | print()
32 | topall = pd.DataFrame(adata.uns['rank_genes_groups']['names']) # get all;
33 | fcs = pd.DataFrame(adata.uns['rank_genes_groups']['logfoldchanges'])
34 | padj = pd.DataFrame(adata.uns['rank_genes_groups']['pvals_adj'])
35 | 
36 | topall.to_csv('top100.csv')
37 | 
38 | # Go through and trim the TEs:
39 | 
40 | TEs = set(genelist(filename='../../TE_genes_id.mm10.txt', format={'name': 0, 'force_tsv': True})['name'])
41 | 
42 | newcols = {}
43 | 
44 | groups = list(topall.columns.values)
45 | 
46 | for group in groups:
47 |     newcols[group] = []
48 | 
49 |     t = zip([i[group] for i in adata.uns['rank_genes_groups']['names']], [i[group] for i in adata.uns['rank_genes_groups']['logfoldchanges']], [i[group] for i in adata.uns['rank_genes_groups']['pvals_adj']])
50 | 
51 |     print('Group: {0}'.format(group))
52 |     print(t)
53 | 
54 |     for item in t:
55 |         print(item)
56 |         if abs(item[1]) < 1: # fold change
57 |             continue
58 |         if item[2] > 0.01: # just in case
59 |             continue
60 | 
61 |         if item[0] in TEs:
62 |             newcols[group].append(item[0])
63 | 
64 | 
65 | # join all and draw a dotplot:
66 | joined = []
67 | for group in newcols:
68 |         joined += newcols[group]
69 | 
70 | # Need to remove duplicates, but preserver order:
71 | newl = []
72 | for i in joined:
73 |     if i not in newl:
74 |         newl.append(i)
75 | joined = newl
76 | 
77 | print(joined)
78 | sc.pl.dotplot(adata, joined, groupby='leiden_r0.5', dot_max=0.7, dendrogram=True, standard_scale='var', show=False, save='de-tes.pdf')
79 | sc.pl.matrixplot(adata, joined, groupby='leiden_r0.5', dendrogram=True, standard_scale='var', show=False, save='de-tes.pdf')
80 | 
81 | for k in joined:
82 |     sc.pl.tsne(adata, color=[k,k], size=15, legend_loc='on data', vmax=2, show=False, save='markers-{0}.pdf'.format(k))
83 |     sc.pl.umap(adata, color=[k,k], size=15, legend_loc='on data', vmax=2, show=False, save='markers-{0}.pdf'.format(k))
84 | 


--------------------------------------------------------------------------------
/example/Figure3/4.plots-allgenes.py:
--------------------------------------------------------------------------------
 1 | import logging, matplotlib, os, sys
 2 | import scanpy as sc
 3 | import matplotlib.pyplot as plt
 4 | from matplotlib import rcParams
 5 | from matplotlib import colors
 6 | 
 7 | from glbase3 import *
 8 | 
 9 | plt.rcParams['figure.figsize']=(8,8)
10 | sc.settings.verbosity = 3
11 | sc.set_figure_params(dpi=200, dpi_save=200)
12 | matplotlib.rcParams['pdf.fonttype']=42
13 | matplotlib.rcParams['font.size']=10
14 | 
15 | sc.settings.figdir = 'genes'
16 | 
17 | adata = sc.read('./learned.h5ad')
18 | print(adata)
19 | all_genes = adata.var['n_cells'].index # gene names are stored in the index
20 | 
21 | TEs = genelist(filename='../../TE_genes_id.mm10.txt', format={'name': 0, 'force_tsv': True})['name']
22 | 
23 | print(TEs)
24 | 
25 | for g in all_genes:
26 |     if g not in TEs and '(' not in g:
27 |         print(g)
28 |         sc.pl.umap(adata, color=[g], size=6, legend_loc='on data', color_map='plasma', show=False, save='-{0}.pdf'.format(g), vmin=0, vmax=3)
29 | 
30 | 
31 | 


--------------------------------------------------------------------------------
/example/Figure3/4.plots-alltes.py:
--------------------------------------------------------------------------------
 1 | import logging, matplotlib, os, sys
 2 | import scanpy as sc
 3 | import matplotlib.pyplot as plt
 4 | from matplotlib import rcParams
 5 | from matplotlib import colors
 6 | 
 7 | from glbase3 import *
 8 | 
 9 | plt.rcParams['figure.figsize']=(8,8)
10 | sc.settings.verbosity = 3
11 | sc.set_figure_params(dpi=200, dpi_save=200)
12 | matplotlib.rcParams['pdf.fonttype']=42
13 | matplotlib.rcParams['font.size']=10
14 | 
15 | sc.settings.figdir = 'tes'
16 | 
17 | adata = sc.read('./learned.h5ad')
18 | print(adata)
19 | all_genes = adata.var['n_cells'].index # gene names are stored in the index
20 | 
21 | TEs = genelist(filename='TE_genes_id.mm10.txt.gz', format={'name': 0, 'force_tsv': True}, gzip=True)
22 | 
23 | #merker_tes = ['ID2', 'MER5C1', 'MER34B-int', 'MER63D', 'MT2A']
24 | #sc.pl.stacked_violin(adata, var_names=merker_tes, groupby='leiden_r0.2', rotation=90, show=False, save='tes.pdf')
25 | 
26 | for te in TEs:
27 |     print(te['name'])
28 |     if te['name'] in all_genes:
29 |         sc.pl.umap(adata, color=[te['name'], te['name']], size=10, legend_loc='on data', show=False, save='TE-{0}.pdf'.format(te['name']), vmin=0, vmax=3)
30 | 
31 | 
32 | 


--------------------------------------------------------------------------------
/example/Figure3/4.plots-specific-tes.py:
--------------------------------------------------------------------------------
 1 | import logging, matplotlib, os, sys
 2 | import scanpy as sc
 3 | import matplotlib.pyplot as plt
 4 | from matplotlib import rcParams
 5 | from matplotlib import colors
 6 | 
 7 | from glbase3 import *
 8 | 
 9 | plt.rcParams['figure.figsize']=(8,8)
10 | sc.settings.verbosity = 3
11 | sc.set_figure_params(dpi=200, dpi_save=200)
12 | matplotlib.rcParams['pdf.fonttype']=42
13 | matplotlib.rcParams['font.size']=6
14 | 
15 | sc.settings.figdir = 'specific-tes'
16 | 
17 | adata = sc.read('./learned.h5ad')
18 | 
19 | # high, few: Expressed rarely, but very high in the cells that they are expressed in
20 | marker_genes_dictB = {
21 |     #'Epiblast': ['MTEb-int',],
22 |     'Primitive streak': ['RLTR1D2_MM', ],
23 |     #'Endothelium': ['ERVB7_2B-LTR_MM',],
24 | 
25 |     #'Ectoderms': ['MamRep137'],
26 |     #'Endoderms': ['MLT1I'],
27 |     'Mesoendoderm': ['RLTR48A', 'IAPEY4_LTR', 'ORR1F-int'],
28 |     'Extraembryonic': ['LTR16A', ],
29 |     'Exe. endoderm': ['MER5C', 'RLTR6B_Mm',],
30 |     #'Exe. ectoderm': ['ERVB4_2-LTR_MM', ],
31 |     'Cardiomyocyte': ['L1ME3D', 'RLTR13A2', 'ERVB2_1A-I_MM-int', 'RLTR16'],
32 |     }
33 | sc.pl.dotplot(adata, marker_genes_dictB, groupby='leiden_r0.5', dot_max=0.3, dendrogram=True, standard_scale='var', vmax=1, show=False, save='markersB.pdf')
34 | 
35 | # Super-specific
36 | marker_genes_dictC = {
37 |     #'Primitive streak': [ ],
38 |     'Mesoendoderm': ['ERVB4_1C-LTR_Mm', 'ETnERV3-int',],
39 |     #'others':['MuRRS4-int'],
40 |     'Exe. endoderm': ['MER46C', 'MuRRS4-int',  'RLTR20B3',  'RLTR1B-int', 'LTRIS2',],
41 |     'Exe. ectoderm': ['RLTR45', 'RLTR45-int', 'IAPLTR1_Mm'],
42 |     #'Cardiomyocyte': ['ETnERV3-int', 'L1ME3D', 'RLTR13A2', 'ERVB2_1A-I_MM-int'],
43 |     'Erythroid': ['RLTR10F', 'L1_Mur1',],
44 |     }
45 | sc.pl.dotplot(adata, marker_genes_dictC, groupby='leiden_r0.5', dot_max=0.7, dendrogram=True, standard_scale='var', vmax=1, show=False, save='markersC.pdf')
46 | 


--------------------------------------------------------------------------------
/example/Figure3/5.marker_genes-leiden-0.2.py:
--------------------------------------------------------------------------------
 1 | import logging, matplotlib, os, sys
 2 | import scanpy as sc
 3 | import numpy as np
 4 | import scipy as sp
 5 | import pandas as pd
 6 | import matplotlib.pyplot as plt
 7 | from matplotlib import rcParams
 8 | from matplotlib import colors
 9 | import seaborn as sb
10 | from rpy2.robjects.packages import importr
11 | #from gprofiler import gprofiler
12 | plt.rcParams['figure.figsize']=(8,8) #rescale figures
13 | sc.settings.verbosity = 1
14 | sc.set_figure_params(dpi=200, dpi_save=300)
15 | 
16 | sc.settings.figdir = 'markers-leiden0.2'
17 | 
18 | adata = sc.read('learned.h5ad') #
19 | #sc.pp.log1p(adata)
20 | 
21 | print(adata.var_names)
22 | 
23 | oh = open('gene_names.all.tsv', 'w')
24 | for g in adata.var_names:
25 |     oh.write('%s\n' % g)
26 | oh.close()
27 | 
28 | marker_genes_dict = {
29 |     'Epiblast': ["Pou5f1"], # Done
30 |     'Primitive Streak': ['Mixl1'], # Done
31 |     'Meso/endoderm': ['Eomes', 'T'], # Done
32 |     'Endoderm': ['Sox17'], # Done
33 |     'Mesoderm': ['Tbx6'], # Done
34 |     'Ectoderm': ['Nr2f1', 'Pax6'],
35 |     'Exe. endoderm': ["Apoa2"], # Done
36 |     'Exe. ectoderm': ["Tfap2c"], # Done
37 |     'Mesenchyme': ['Pmp22'], # Done
38 |     'Blood progenitors': ['Runx1'], # Done
39 |     'Erythroid': ['Gata1'], # Done
40 |     }
41 | 
42 | sc.pl.stacked_violin(adata, marker_genes_dict, groupby='leiden_r0.2', vmax=3, rotation=90, dendrogram=False, show=False, save='markers.pdf')
43 | sc.pl.dotplot(adata, marker_genes_dict, groupby='leiden_r0.2', dot_max=0.5, dendrogram=False, standard_scale='var', show=False, save='markers.pdf')
44 | sc.pl.heatmap(adata, marker_genes_dict, groupby='leiden_r0.2', vmax=3, show=False, save='markers.pdf')
45 | '''
46 | for k in marker_genes_dict:
47 |     sc.pl.tsne(adata, color=marker_genes_dict[k], size=10, legend_loc='on data', vmax=3, show=False, save='markers-{0}.pdf'.format(k))
48 |     sc.pl.umap(adata, color=marker_genes_dict[k], color_map='plasma', size=10, vmax=3, legend_loc='on data', show=False, save='markers-{0}.pdf'.format(k))
49 | 
50 | '''
51 | 


--------------------------------------------------------------------------------
/example/Figure3/5.marker_genes-small-grp_cut.py:
--------------------------------------------------------------------------------
 1 | import logging, matplotlib, os, sys
 2 | import scanpy as sc
 3 | import numpy as np
 4 | import scipy as sp
 5 | import pandas as pd
 6 | import matplotlib.pyplot as plt
 7 | from matplotlib import rcParams
 8 | from matplotlib import colors
 9 | import seaborn as sb
10 | #from rpy2.robjects.packages import importr
11 | #from gprofiler import gprofiler
# Global figure defaults for matplotlib and scanpy.
plt.rcParams['figure.figsize'] = (8, 8)  # rescale figures
sc.settings.verbosity = 1
sc.set_figure_params(dpi=200, dpi_save=300)

#matplotlib.rcParams['pdf.fonttype']=42
#matplotlib.rcParams['font.size']=6

# Name of the clustering used to group the cells in every plot below.
todo = 'leiden_r0.3'

# All figures are written underneath a directory named after that clustering.
sc.settings.figdir = 'markers-{0}'.format(todo)

adata = sc.read('learned.h5ad')

# Keyword arguments shared by every plotting call in this script.
grouped = dict(groupby=todo, show=False)

# Conventional marker genes, keyed by the lineage they identify.
marker_genes_dict = {
    'Epiblast': ["Pou5f1"],
    'Primitive streak': ["Mixl1"],  # Nanog as well?
    'Endoderms': ["Cer1", "Sox7"],
    'Mesoderms': ["T", 'Cdx1'],
    'Ectoderms': ['Six3'],  # And Grhl2

    'Exe endoderm': ["Apoa2"],
    'Exe ectoderm': ["Tfap2c"],

    'Cardiomyocytes': ["Tnnt2"],
    'Blood prog.': ["Lmo2"],
    'Erythroid': ["Gypa"],
}

sc.pl.stacked_violin(
    adata, marker_genes_dict,
    rotation=90, dendrogram=True, save='markers.pdf', **grouped)
sc.pl.dotplot(
    adata, marker_genes_dict,
    color_map='Greens', dot_max=0.7, dendrogram=True, standard_scale='var',
    save='markers.pdf', **grouped)
sc.pl.heatmap(
    adata, marker_genes_dict,
    vmax=3, save='markers.pdf', **grouped)

# high, few: Expressed rarely, but very high in the cells that they are expressed in
marker_genes_dictB = {
    #'Epiblast': ['MTEb-int',],
    #'Primitive streak': ['RLTR1D2_MM', ],
    #'Endothelium': ['ERVB7_2B-LTR_MM',],

    #'Ectoderms': ['MamRep137'],
    #'Endoderms': ['MLT1I'],
    'Mesoendoderm': ['RLTR48A', 'IAPEY4_LTR', 'ORR1F-int'],
    'Extraembryonic': ['LTR16A'],
    'Exe. endoderm': ['MER5C', 'RLTR6B_Mm'],
    #'Exe. ectoderm': ['ERVB4_2-LTR_MM', ],
    'Cardiomyocyte': ['L1ME3D', 'RLTR13A2', 'ERVB2_1A-I_MM-int', 'RLTR16'],
}
sc.pl.dotplot(
    adata, marker_genes_dictB,
    dot_max=0.3, dendrogram=True, standard_scale='var', vmax=1,
    save='markersB.pdf', **grouped)

# Super-specific
marker_genes_dictC = {
    #'Primitive streak': [ ],
    'Mesoendoderm': ['ERVB4_1C-LTR_Mm', 'ETnERV3-int'],
    #'others':['MuRRS4-int'],
    'Exe. endoderm': ['MER46C', 'MuRRS4-int', 'RLTR20B3', 'RLTR1B-int', 'LTRIS2'],
    'Exe. ectoderm': ['RLTR45', 'RLTR45-int', 'IAPLTR1_Mm'],
    #'Cardiomyocyte': ['ETnERV3-int', 'L1ME3D', 'RLTR13A2', 'ERVB2_1A-I_MM-int'],
    'Erythroid': ['RLTR10F', 'L1_Mur1'],
}
sc.pl.dotplot(
    adata, marker_genes_dictC,
    dot_max=0.7, dendrogram=True, standard_scale='var', vmax=1,
    save='markersC.pdf', **grouped)
71 | 


--------------------------------------------------------------------------------
/example/Figure3/5.marker_genes-small.py:
--------------------------------------------------------------------------------
 1 | import logging, matplotlib, os, sys
 2 | import scanpy as sc
 3 | import numpy as np
 4 | import scipy as sp
 5 | import pandas as pd
 6 | import matplotlib.pyplot as plt
 7 | from matplotlib import rcParams
 8 | from matplotlib import colors
 9 | import seaborn as sb
10 | from rpy2.robjects.packages import importr
11 | #from gprofiler import gprofiler
# Global figure defaults for matplotlib and scanpy.
plt.rcParams['figure.figsize'] = (8, 8)  # rescale figures
sc.settings.verbosity = 1
sc.set_figure_params(dpi=200, dpi_save=300)

sc.settings.figdir = 'markers-small'

adata = sc.read('learned.h5ad')

# Conventional marker genes, keyed by the lineage they identify.
marker_genes_dict = {
    'Epiblast': ["Pou5f1"],
    'Primitive streak': ["Eomes", "Mixl1"],  # Nanog as well?
    'Endoderms': ["Cer1", "Sox7"],
    'Mesoderms': ["T", 'Cdx1'],
    'Ectoderms': ['Grhl2', 'Six3'],

    'Exe endoderm': ["Apoa2"],
    'Exe ectoderm': ["Tfap2c"],

    'Cardiomyocytes': ["Tnnt2"],
    'Blood prog.': ["Lmo2"],
    'Erythroid': ["Gypa"],
}

# Keyword arguments shared by the three summary plots.
by_cluster = dict(groupby='leiden_r0.5', show=False, save='markers.pdf')

sc.pl.stacked_violin(adata, marker_genes_dict, rotation=90, dendrogram=True, **by_cluster)
sc.pl.dotplot(
    adata, marker_genes_dict,
    color_map='Greens', dot_max=0.5, dendrogram=True, standard_scale='var',
    **by_cluster)
sc.pl.heatmap(adata, marker_genes_dict, vmax=3, **by_cluster)

# One t-SNE and one UMAP figure per lineage, coloured by that lineage's markers.
for lineage, genes in marker_genes_dict.items():
    figure_name = 'markers-{0}.pdf'.format(lineage)
    sc.pl.tsne(
        adata, color=genes, size=10, legend_loc='on data', vmax=3,
        show=False, save=figure_name)
    sc.pl.umap(
        adata, color=genes, color_map='plasma', size=10, vmax=3,
        legend_loc='on data', show=False, save=figure_name)
42 | 


--------------------------------------------------------------------------------
/example/Figure3/5.marker_genes.py:
--------------------------------------------------------------------------------
 1 | import logging, matplotlib, os, sys
 2 | import scanpy as sc
 3 | import numpy as np
 4 | import scipy as sp
 5 | import pandas as pd
 6 | import matplotlib.pyplot as plt
 7 | from matplotlib import rcParams
 8 | from matplotlib import colors
 9 | import seaborn as sb
10 | from rpy2.robjects.packages import importr
11 | #from gprofiler import gprofiler
# Global figure defaults for matplotlib and scanpy.
plt.rcParams['figure.figsize']=(8,8) #rescale figures
sc.settings.verbosity = 1
sc.set_figure_params(dpi=200, dpi_save=300)

sc.settings.figdir = 'markers'

adata = sc.read('learned.h5ad') # You can skip the script 3 if using te 2b.
#sc.pp.log1p(adata)

print(adata.var_names)

# Dump every feature name, one per line. The context manager guarantees the
# file is flushed and closed even if a write fails part-way through.
with open('gene_names.all.tsv', 'w') as oh:
    for g in adata.var_names:
        oh.write('%s\n' % g)

# Two marker genes per cell type; the keys are used as plot labels and as part
# of the output file names, so they are left exactly as they were.
marker_genes_dict = {
    'Epiblast': ["Pou5f1", "Epcam"],
    'Primitive streak': ["Eomes", "Nanog"], #Nanog?!?!
    'Anterior primitive streak': ["Gsc", "Mixl1"],
    'Notochord': ["Noto", "T"],
    'Def. Endoderm': ["Cer1", "Sox7"],
    'Nascent mesoderm': ["Mesp1", "Apela"],
    'Caudal mesoderm': ["Cdx1", "Hes7"],
    'Paraxial mesoderm': ["Tcf15", "Tbx1"],
    'Somitic mesoderm': ["Tbx6", "Dll1"],
    'Pharngyeal mesoderm': ["Tcf21", "Isl1"],
    'Cardiomyocytes': ["Tnnt2", "Myl4"],
    'Allantois': ["Tbx4", "Hoxa11"],
    'Mesenchyme': ["Krt18", "Pmp22"],
    'Hemandothelial prog.': ["Kdr", "Etv2"],
    'Endothelium': ["Pecam1", "Anxa5"],
    'Blood prog.': ["Runx1", "Lmo2"],
    'Erythroid': ["Gata1", "Gypa"],
    'Neuromesoderml prog.': ["Cdx4", "Epha5"],
    'Neurectoderm': ["Six3", "Irx3"],
    'Neural crest': ["Dlx2", "Sox10"],
    'Brain': ["En1", "Pax2"],
    'Spinal cord': ["Sox2", "Pax2"],
    'Surface ectoderm': ["Trp63", "Grhl2"],
    'Visceral endoderm': ["Dkk1", "Amot"],
    'Exe endoderm': ["Ttr", "Apoa2"],
    'Exe ectoderm': ["Tfap2c", "Elf5"],
    'Parietal endoderm': ["Sparc", "Plat"],
    'others': ['Fgf5', 'Lefty2'],
    }

# Summary plots across all clusters of the 'leiden_r0.5' grouping.
sc.pl.stacked_violin(adata, marker_genes_dict, groupby='leiden_r0.5', rotation=90, dendrogram=True, show=False, save='markers.pdf')
sc.pl.dotplot(adata, marker_genes_dict, groupby='leiden_r0.5', dot_max=0.5, dendrogram=True, standard_scale='var', show=False, save='markers.pdf')
sc.pl.heatmap(adata, marker_genes_dict, groupby='leiden_r0.5', vmax=3, show=False, save='markers.pdf')

# One t-SNE and one UMAP figure per cell type, coloured by its marker genes.
for k in marker_genes_dict:
    sc.pl.tsne(adata, color=marker_genes_dict[k], size=10, legend_loc='on data', vmax=3, show=False, save='markers-{0}.pdf'.format(k))
    sc.pl.umap(adata, color=marker_genes_dict[k], color_map='plasma', size=10, vmax=3, legend_loc='on data', show=False, save='markers-{0}.pdf'.format(k))
66 | 
67 | 


--------------------------------------------------------------------------------
/example/Figure3/TE_genes_id.mm10.txt.gz:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/JiekaiLab/scTE/566f6ab3baaf76cd006ab965edc08e4576eb73c9/example/Figure3/TE_genes_id.mm10.txt.gz


--------------------------------------------------------------------------------
/scTE/__init__.py:
--------------------------------------------------------------------------------
1 | 
__version__ = "1.0"

# The miniglbase re-exports below are currently disabled; import them with
# `from scTE.miniglbase import genelist, glload, location` instead.
# from .miniglbase import genelist, location, glload
# import .miniglbase

# __all__ may only list names that actually exist on this module, otherwise
# `from scTE import *` raises AttributeError. Add the miniglbase names back
# here if the import above is ever re-enabled.
__all__ = ["__version__"]
8 | 


--------------------------------------------------------------------------------
/scTE/annotation.py:
--------------------------------------------------------------------------------
  1 | import os,sys,gzip,time
  2 | import numpy as np
  3 | from scTE.miniglbase import genelist, glload, location
  4 | 
  5 | form ={'force_tsv': True, 'loc': 'location(chr=column[0], left=column[1], right=column[2])', 'annot': 3}
  6 | 
  7 | def cleanexon(filename, genefilename, exons):
  8 |     if not os.path.exists('%s_scTEtmp/index'%filename):
  9 |         os.system('mkdir -p %s_scTEtmp/index'%filename)
 10 | 
 11 |     oh=gzip.open('%s_scTEtmp/index/%s.bed.gz'%(filename,genefilename),'wt')
 12 |     for k in sorted(exons):
 13 |         E=[]
 14 |         for it in exons[k]:
 15 |             E+=list(range(it[1],it[2]))
 16 |         E=sorted(set(E))
 17 | 
 18 |         s=0
 19 |         tmp=[]
 20 |         for id in range(0,len(E)-1):
 21 |             if E[id+1]-E[id] >1:
 22 |                 en=id
 23 |                 tmp.append([E[s],E[en]])
 24 |                 s=en+1
 25 |         tmp.append([E[s],E[id+1]])
 26 | 
 27 |         for item in tmp:
 28 |             oh.write('%s\t%s\t%s\t%s\n'%(it[0],item[0],item[1],k))
 29 |     oh.close()
 30 | 
 31 | def annoGtf(filename, genefile, tefile, mode):
 32 | 
 33 |     genefilename = genefile.split('/')[-1:][0].replace('.gtf','').replace('.gz','')
 34 |     tefilename = tefile.split('/')[-1:][0].replace('.bed','').replace('.gz','')
 35 | 
 36 |     raw = {}
 37 |     clean = {}
 38 |     if '.gz' in genefile:
 39 |         o = gzip.open(genefile,'rb')
 40 |     else:
 41 |         o=open(genefile,'rU')
 42 |     for l in o:
 43 |         if '.gz' in genefile:
 44 |             l=l.decode('ascii')
 45 |         if l.startswith('#'):
 46 |             continue
 47 |         t=l.strip().split('\t')
 48 |         if t[2]=='exon' or t[2]=='UTR':
 49 |             chr = t[0].replace('chr','')
 50 |             left = int(t[3])
 51 |             riht =  int(t[4])
 52 |             name=t[8].split('gene_name "')[1].split('";')[0]
 53 | 
 54 |             if name not in raw:
 55 |                 raw[name] = []
 56 |             raw[name].append([chr,left,riht])
 57 | 
 58 |             if 'protein_coding' not in l and 'lincRNA' not in l:
 59 |                 continue
 60 |             if name not in clean:
 61 |                 clean[name] = []
 62 |             clean[name].append([chr,left,riht])
 63 |     o.close()
 64 | 
 65 |     cleanexon(filename,'%s.raw'%genefilename,raw)
 66 |     cleanexon(filename,'%s.clean'%genefilename,clean)
 67 | 
 68 |     if mode == 'exclusive':
 69 |         gene ={}
 70 |         o = gzip.open('%s_scTEtmp/index/%s.clean.bed.gz'%(filename,genefilename),'rb')
 71 |         for l in o:
 72 |             t = l.decode('ascii').strip().split('\t')
 73 |             chr = t[0].replace('chr','')
 74 |             left = int(t[1])
 75 |             rite = int(t[2])
 76 | 
 77 |             left_buck = int((left-1)/10000) * 10000
 78 |             right_buck = int((rite)/10000) * 10000
 79 |             buckets_reqd = range(left_buck, right_buck+10000, 10000)
 80 | 
 81 |             if chr not in gene:
 82 |                 gene[chr] = {}
 83 | 
 84 |             if buckets_reqd:
 85 |                 for buck in buckets_reqd:
 86 |                     if buck not in gene[chr]:
 87 |                         gene[chr][buck] = []
 88 |                     gene[chr][buck].append([left, rite])
 89 |         o.close()
 90 | 
 91 |         noverlap = []
 92 |         if '.gz' in tefile:
 93 |             o = gzip.open(tefile,'rb')
 94 |         else:
 95 |             o = open(tefile,'rU')
 96 |         for n,l in enumerate(o):
 97 |             if '.gz' in tefile:
 98 |                 l = l.decode('ascii')
 99 |             t = l.strip().split('\t')
100 |             chr = t[0]
101 |             left = int(t[1])
102 |             rite = int(t[2])
103 |             
104 |             if chr not in gene:
105 |                 noverlap.append('%s\t%s\t%s\t%s\n'%(chr,left,rite,t[3]))
106 |                 continue
107 |             
108 |             left_buck = int((left-1)/10000) * 10000
109 |             right_buck = int((rite)/10000) * 10000
110 |             buckets_reqd = range(left_buck, right_buck+10000, 10000)
111 | 
112 |             if buckets_reqd:
113 |                 i = 1
114 |                 for buck in buckets_reqd:
115 |                     if buck not in gene[chr]:
116 |                         pass
117 |                     else:
118 |                         for k in gene[chr][buck]:
119 |                             if left < k[1] and rite > k[0]:
120 |                                 i = 0
121 |                                 break
122 |                         if i == 0:
123 |                             break
124 |                 if i == 1:
125 |                     noverlap.append('%s\t%s\t%s\t%s\n'%(chr,left,rite,t[3]))
126 | 
127 |         oh = gzip.open('%s_scTEtmp/index/%s.exclusive.gz'%(filename, tefilename),'wt')
128 |         for k in noverlap:
129 |             oh.write(k)
130 |         oh.close()
131 | 
132 |         genes = genelist('%s_scTEtmp/index/%s.raw.bed.gz'%(filename, genefilename), format=form, gzip=True)
133 |         TEs = genelist('%s_scTEtmp/index/%s.exclusive.gz'%(filename, tefilename), format=form, gzip=True)
134 |         print(genes)
135 |         print(TEs)
136 |         
137 |         all_annot = genes + TEs
138 |         all_annot.save('%s_scTEtmp/index/custome.exclusive.glb'%filename)
139 |         annot = '%s_scTEtmp/index/custome.exclusive.glb'%filename
140 | 
141 |     elif mode == 'inclusive':
142 |         genes = genelist('%s_scTEtmp/index/%s.raw.bed.gz'%(filename,genefilename), format=form, gzip=True)
143 |         if tefilename.endswith('.gz'):
144 |             TEs = genelist(tefile, format=form, gzip=True)
145 |         else:
146 |             TEs = genelist(tefile, format=form)
147 | 
148 |         all_annot = genes + TEs
149 |         all_annot.save('%s_scTEtmp/index/custome.inclusive.glb'%filename)
150 |         annot = '%s_scTEtmp/index/custome.inclusive.glb'%filename
151 | 
152 |     return annot
153 | 
154 | 
155 | 
156 | 
157 | 
158 | 


--------------------------------------------------------------------------------
/scTE/base.py:
--------------------------------------------------------------------------------
  1 | import pandas as pd
  2 | import multiprocessing
  3 | import argparse
  4 | from functools import partial
  5 | import logging
  6 | import os, sys, glob, datetime, time, gzip
  7 | import collections
  8 | from collections import defaultdict
  9 | from math import log
 10 | from scTE.miniglbase import genelist, glload, location
 11 | from scTE.annotation import annoGtf
 12 | import subprocess
 13 | 
 14 | import numpy as np
 15 | import scipy
 16 | import anndata as ad
 17 | 
 18 | def read_opts(parser):
 19 |     args = parser.parse_args()
 20 |     if args.format == "BAM" :
 21 |         args.parser = "BAM"
 22 |     elif args.format == "SAM" :
 23 |         args.parser = "SAM"
 24 |     else :
 25 |         logging.error("The input file must be SAM/BAM format: %s !\n" % (args.format))
 26 |         sys.exit(1)
 27 |     
 28 |     args.error = logging.critical
 29 |     args.warn = logging.warning
 30 |     args.debug = logging.debug
 31 |     args.info = logging.info
 32 |     
 33 |     args.argtxt ="\n".join(("Parameter list:", \
 34 |                 "Sample = %s" % (args.out), \
 35 | #                 "Genome = %s" % (args.genome), \
 36 |                 "Reference annotation index = %s" %(args.annoglb[0]), \
 37 |                 "Minimum number of genes required = %s" % (args.genenumber), \
 38 |                 "Minimum number of counts required = %s"% (args.countnumber),\
 39 |                 "Number of threads = %s " % (args.thread),\
 40 |     ))
 41 |     return args
 42 | 
 43 | # def getanno(filename, genefile, tefile, genome, mode):
 44 | #     form ={'force_tsv': True, 'loc': 'location(chr=column[0], left=column[1], right=column[2])', 'annot': 3}
 45 | # 
 46 | #     if genefile == 'default' and tefile == 'default':
 47 | #         if genome == 'mm10':
 48 | #             chr_list = ['chr'+ str(i) for i in range(1,20) ] + [ 'chrX','chrY', 'chrM' ]
 49 | #             if mode == 'exclusive':
 50 | #                 if not os.path.exists('mm10.exclusive.glb'):
 51 | #                     logging.error("Did not find the annotation index mm10.exclusive.glb, you can download it from scTE github (www....) or either give the annotation with -te and -gene option \n" )
 52 | #                     sys.exit(1)
 53 | #                 all_annot = 'mm10.exclusive.glb'
 54 | #                 allelement = set(glload(all_annot)['annot'])
 55 | # 
 56 | #             elif mode == 'inclusive':
 57 | #                 if not os.path.exists('mm10.inclusive.glb'):
 58 | #                     logging.error("Did not find the annotation index mm10.inclusive.glb, you can download it from scTE github (www....) or either give the annotation with -te and -gene option \n" )
 59 | #                     sys.exit(1)
 60 | #                 all_annot = 'mm10.inclusive.glb'
 61 | #                 allelement = set(glload(all_annot)['annot'])
 62 | # 
 63 | #         elif genome == 'hg38':
 64 | #             chr_list = ['chr'+ str(i) for i in range(1,23) ] + [ 'chrX','chrY', 'chrM' ]
 65 | #             if mode == 'exclusive':
 66 | #                 if not os.path.exists('hg38.exclusive.glb'):
 67 | #                     logging.error("Did not find the annotation index hg38.exclusive.glb, you can download it from scTE github (www....) or either give the annotation with -te and -gene option \n" )
 68 | #                     sys.exit(1)
 69 | #                 all_annot = 'hg38.exclusive.glb'
 70 | #                 allelement = set(glload(all_annot)['annot'])
 71 | # 
 72 | #             elif mode == 'inclusive':
 73 | #                 if not os.path.exists('hg38.inclusive.glb'):
 74 | #                     logging.error("Did not find the annotation index hg38.inclusive.glb, you can download it from scTE github (www....) or either give the annotation with -te and -gene option \n")
 75 | #                     sys.exit(1)
 76 | #                 all_annot = 'hg38.inclusive.glb'
 77 | #                 allelement = set(glload(all_annot)['annot'])
 78 | #     else:
 79 | #         if genome in ['hg38']:
 80 | #             chr_list = ['chr'+ str(i) for i in range(1,23) ] + [ 'chrX','chrY', 'chrM' ]
 81 | # 
 82 | #         elif genome in ['mm10']:
 83 | #             chr_list = ['chr'+ str(i) for i in range(1,20) ] + [ 'chrX','chrY', 'chrM' ]
 84 | # 
 85 | #         if not os.path.isfile(tefile) :
 86 | #             logging.error("No such file: %s !\n" %(tefile))
 87 | #             sys.exit(1)
 88 | # 
 89 | #         if not os.path.isfile(genefile) :
 90 | #             logging.error("No such file: %s !\n" % (genefile))
 91 | #             sys.exit(1)
 92 | # 
 93 | #         all_annot = annoGtf(filename, genefile=genefile, tefile=tefile, mode=mode)
 94 | #         allelement = set(glload(all_annot)['annot'])
 95 | # 
 96 | #     return(allelement,chr_list,all_annot)
 97 | 
 98 | def Readanno(filename, annoglb): #genome
 99 |     glannot = glload(annoglb)
100 |     allelement = set(glannot['annot'])
101 | #     if genome in ['mm10']:
102 | #         chr_list = ['chr'+ str(i) for i in range(1,20) ] + [ 'chrX','chrY', 'chrM' ]
103 | #     elif genome in ['hg38']:
104 | #         chr_list = ['chr'+ str(i) for i in range(1,23) ] + [ 'chrX','chrY', 'chrM' ]
105 |     
106 |     chr_list = list(set([ k['chr'] for k in glannot['loc']])) #this is useful for costume chromsome
107 |     return(allelement, chr_list, annoglb, glannot)
108 | 
def _count_tag_in_head(filename, out, tag, tmpname):
    """Count how many of the first 100 alignments carry '<tag>:Z:'.

    The count is written by the shell pipeline to
    '<out>_scTEtmp/o1/<tmpname>' and read back from there.
    """
    tmp_path = '%s_scTEtmp/o1/%s' % (out, tmpname)
    # subprocess.run() only returns once the whole pipeline has finished, so
    # the result file is complete by the time it is opened.
    subprocess.run('samtools view %s | head -100| grep "%s:Z:" | wc -l > %s' % (filename, tag, tmp_path), shell=True)
    with open(tmp_path, 'r') as fh:
        return [int(line.strip()) for line in fh]


def checkCBUMI(filename,out,CB,UMI):
    """Check that the input carries the requested barcode and UMI tags.

    **Arguments**
        filename
            Alignment file handed to `samtools view`.

        out
            Output prefix; '<out>_scTEtmp/o1' must already exist.

        CB
            'CR' or 'CB' to check for that cell barcode tag; any other value
            skips the check.

        UMI
            'UR' or 'UB' to check for that UMI tag; any other value skips the
            check.

    Logs an error and exits with status 1 unless the tag is found on all of
    the first 100 alignments.
    """
    if CB in ('CR', 'CB'):
        counts = _count_tag_in_head(filename, out, CB, 'testCR.txt')
        if any(count < 100 for count in counts):
            logging.error("The input file %s has no cell barcodes information, plese make sure the aligner have add the cell barcode key, or set CB to False"%filename)
            sys.exit(1)

    if UMI in ('UR', 'UB'):
        counts = _count_tag_in_head(filename, out, UMI, 'testUMI.txt')
        if any(count < 100 for count in counts):
            logging.error("The input file %s has no %s:Z information, plese make sure the aligner have add the UMI key, or set UMI to False" % (filename, UMI))
            sys.exit(1)
147 | 
def Bam2bed(filename, CB, UMI, out, num_threads):
    """Convert an alignment file into '<out>_scTEtmp/o1/<out>.bed.gz'.

    Each alignment becomes one line: reference name (leading 'chr' removed),
    position, position + 100 and a barcode column, plus a UMI column when a
    UMI tag is requested. With a UMI tag only the first line per barcode+UMI
    pair is kept.

    CB is 'CR', 'CB' or 'False'; UMI is 'UR', 'UB' or 'False'. With
    CB == 'False' and UMI == 'False' the barcode column holds `out`. Any
    other combination runs no conversion at all.
    """
    workdir = '%s_scTEtmp/o1' % out
    if not os.path.exists(workdir):
        os.system('mkdir -p %s' % workdir)

    # Mac OSX has BSD sed, which spells the extended-regex switch differently.
    sed_flag = '-E' if sys.platform == 'darwin' else '-r'

    has_barcode = CB in ('CR', 'CB')
    has_umi = UMI in ('UR', 'UB')
    values = {
        'threads': num_threads,
        'bam': filename,
        'sed': sed_flag,
        'out': out,
        'cb': CB,
        'umi': UMI,
    }

    if UMI == 'False' and CB == 'False':
        # Put the sample name in the barcode slot
        template = (
            'samtools view -@ %(threads)s %(bam)s'
            ' | awk \'{OFS="\t"}{print $3,$4,$4+100,"%(out)s"}\''
            ' | sed %(sed)s \'s/^chr//g\''
            '| gzip -c > %(out)s_scTEtmp/o1/%(out)s.bed.gz'
        )
    elif UMI == 'False' and has_barcode:
        template = (
            'samtools view -@ %(threads)s %(bam)s'
            ' | awk \'{OFS="\t"}{for(i=12;i<=NF;i++)if($i~/%(cb)s:Z:/)n=i}{print $3,$4,$4+100,$n}\''
            ' | sed %(sed)s \'s/%(cb)s:Z://g\''
            ' | sed %(sed)s \'s/^chr//g\''
            ' | gzip -c > %(out)s_scTEtmp/o1/%(out)s.bed.gz'
        )
    elif has_umi and has_barcode:
        template = (
            'samtools view -@ %(threads)s %(bam)s'
            ' | awk \'{OFS="\t"}{for(i=12;i<=NF;i++)if($i~/%(cb)s:Z:/)n=i}'
            '{for(i=12;i<=NF;i++)if($i~/%(umi)s:Z:/)m=i}{print $3,$4,$4+100,$n,$m}\''
            ' | sed %(sed)s \'s/%(cb)s:Z://g\''
            ' | sed %(sed)s \'s/%(umi)s:Z://g\''
            '| sed %(sed)s \'s/^chr//g\''
            ' | awk \'!x[$4$5]++\''
            ' | gzip -c > %(out)s_scTEtmp/o1/%(out)s.bed.gz'
        )
    else:
        return

    os.system(template % values)
176 | 
def Para_bam2bed(filename, CB, UMI, out):
    """Convert one alignment file into '<out>_scTEtmp/o0/<sample>.bed.gz'.

    <sample> is the base name of `filename` with '.bam' removed. Each
    alignment becomes one line: reference name (leading 'chr' removed),
    position, position + 100 and a barcode column, plus a UMI column when a
    UMI tag is requested. With a UMI tag only the first line per barcode+UMI
    pair is kept.

    CB is 'CR', 'CB' or 'False'; UMI is 'UR', 'UB' or 'False'. With
    CB == 'False' and UMI == 'False' the barcode column holds <sample>. Any
    other combination runs no conversion at all.
    """
    os.makedirs('%s_scTEtmp/o0' % out, exist_ok=True)

    sample = filename.split('/')[-1].replace('.bam', '')

    # Mac OSX has BSD sed, which spells the extended-regex switch differently.
    switch = '-E' if sys.platform == 'darwin' else '-r'

    has_barcode = CB in ('CR', 'CB')
    has_umi = UMI in ('UR', 'UB')
    values = {
        'bam': filename,
        'sed': switch,
        'out': out,
        'sample': sample,
        'cb': CB,
        'umi': UMI,
    }

    if UMI == 'False' and CB == 'False':
        template = (
            'samtools view %(bam)s'
            ' | awk \'{OFS="\t"}{print $3,$4,$4+100,"%(sample)s"}\''
            ' | sed %(sed)s \'s/^chr//g\''
            ' | gzip > %(out)s_scTEtmp/o0/%(sample)s.bed.gz'
        )
    elif UMI == 'False' and has_barcode:
        # Only the barcode column is printed here. No UMI tag is looked up in
        # this branch, so awk's `m` would be unset and `$m` would expand to
        # $0, appending the whole SAM record to every line.
        template = (
            'samtools view %(bam)s'
            ' | awk \'{OFS="\t"}{for(i=12;i<=NF;i++)if($i~/%(cb)s:Z:/)n=i}{print $3,$4,$4+100,$n}\''
            ' | sed %(sed)s \'s/%(cb)s:Z://g\''
            ' | sed %(sed)s \'s/^chr//g\''
            ' | gzip > %(out)s_scTEtmp/o0/%(sample)s.bed.gz'
        )
    elif has_umi and has_barcode:
        template = (
            'samtools view %(bam)s'
            ' | awk \'{OFS="\t"}{for(i=12;i<=NF;i++)if($i~/%(cb)s:Z:/)n=i}'
            '{for(i=12;i<=NF;i++)if($i~/%(umi)s:Z:/)m=i}{print $3,$4,$4+100,$n,$m}\''
            ' | sed %(sed)s \'s/%(cb)s:Z://g\''
            ' | sed %(sed)s \'s/%(umi)s:Z://g\''
            ' | sed %(sed)s \'s/^chr//g\''
            ' | awk \'!x[$4$5]++\''
            ' | gzip > %(out)s_scTEtmp/o0/%(sample)s.bed.gz'
        )
    else:
        return

    os.system(template % values)
205 | 
def splitAllChrs(chromosome_list, filename, genenumber, countnumber, UMI=True):
    '''
    **Purpose**
        Split the data into separate beds, and count up all the times each barcode appears

        This variant uses more memory, but does it all at the same time and gets the filtered whitelist for free

    **Arguments**
        chromosome_list
            List of chromosome names (with or without a leading 'chr')

        filename (Required)
            filename stub to use for tmp files; the input is read from
            '<filename>_scTEtmp/o1/<filename>.bed.gz' and one
            '<filename>_scTEtmp/o2/<filename>.chr<chrom>.bed.gz' is written
            per chromosome

        genenumber (Required)
            Minimum number of genes expressed required for a cell to pass filtering.
            Only used when countnumber is not set, in which case the count
            threshold is 2 * genenumber

        countnumber (Required)
            Minimum number of counts required for a cell to pass filtering.

        UMI (optional, default=True)
            use the UMI: identical input lines on the same chromosome are
            written and counted only once

    **Returns**
        The barcode whitelist: barcodes (column 4) seen at least the
        threshold number of times

    Lines on chromosomes that are not in chromosome_list are skipped. 'MT' is
    treated as 'M' when only 'M' is in chromosome_list, and is skipped as well
    when neither name was requested.
    '''
    os.makedirs('%s_scTEtmp/o2' % filename, exist_ok=True)

    chromosome_list = set(c.replace('chr', '') for c in chromosome_list)

    CRs = defaultdict(int)
    uniques = {chrom: set() for chrom in chromosome_list} if UMI else None
    file_handles_out = {}

    with gzip.open('%s_scTEtmp/o1/%s.bed.gz' % (filename, filename), 'rt') as file_handle_in:
        try:
            for chrom in chromosome_list:
                file_handles_out[chrom] = gzip.open('%s_scTEtmp/o2/%s.chr%s.bed.gz' % (filename, filename, chrom), 'wt')

            # Make a BED for each chromosome
            for line in file_handle_in:
                t = line.strip().split('\t')
                if len(t) < 4:  # blank or truncated line: no barcode column
                    continue
                chrom = t[0].replace('chr', '') # strip chr

                # Force chrMT -> chrM
                if chrom == 'MT' and chrom not in chromosome_list:
                    chrom = 'M'
                if chrom not in chromosome_list: # remove the unusual chromosomes
                    continue

                if UMI:
                    if line in uniques[chrom]:
                        continue
                    uniques[chrom].add(line)
                CRs[t[3]] += 1

                file_handles_out[chrom].write(line)
        finally:
            # Close every output, even if reading or writing failed part-way.
            for handle in file_handles_out.values():
                handle.close()

    mincounts = countnumber if countnumber else 2 * genenumber

    return [barcode for barcode, count in CRs.items() if count >= mincounts]
279 | 
def filterCRs(filename, genenumber, countnumber):
    """Build the barcode whitelist from the per-chromosome count files.

    Reads every '<filename>_scTEtmp/o2/<filename>*.count.gz' file (tab
    separated: barcode, count), sums the counts per barcode and returns the
    list of barcodes whose total reaches the threshold. The threshold is
    `countnumber` when that is set, otherwise 2 * `genenumber`.
    """
    CRs = defaultdict(int)
    for f in sorted(glob.glob('%s_scTEtmp/o2/%s*.count.gz'%(filename,filename))):
        logging.info('Reading %s '%os.path.split(f)[1])
        with gzip.open(f,'rt') as o:
            for l in o:
                t = l.strip().split('\t')
                CRs[t[0]] += int(t[1])

    if not countnumber:
        mincounts = 2* genenumber
    else:
        mincounts = countnumber

    logging.info('Before filter %s'%len(CRs))
    CRs = {k: v for k, v in CRs.items() if v >= mincounts}
    logging.info('After filter %s'%len(CRs))

    return list(CRs.keys())
300 | 
301 | def splitChr(chr, filename, CB, UMI):
302 |     if not os.path.exists('%s_scTEtmp/o2'%filename):
303 |         os.system('mkdir -p %s_scTEtmp/o2'%filename)
304 | 
305 |     chr=chr.replace('chr','')
306 |     if CB == 'CR' or CB == 'CB': CB = True
307 |     else: CB = False
308 |     if UMI == 'UR' or UMI == 'UB': UMI = True
309 |     else: UMI= False
310 |     
311 |     if not CB: # C1-style data is a cell per BAM, so no barcode;
312 |         if not UMI:
313 |             if chr == '1':
314 |                 os.system('gunzip -c -f %s_scTEtmp/o1/%s.bed.gz | grep -v ^1\'[0-9]\' | grep ^%s | gzip -c > %s_scTEtmp/o2/%s.chr%s.bed.gz'%(filename,filename,chr,filename,filename,chr))
315 |             elif chr == '2':
316 |                 os.system('gunzip -c -f %s_scTEtmp/o1/%s.bed.gz | grep -v ^2\'[0-9]\' | grep ^%s | gzip -c > %s_scTEtmp/o2/%s.chr%s.bed.gz'%(filename,filename,chr,filename,filename,chr))
317 |             elif chr == '3':
318 |                 os.system('gunzip -c -f %s_scTEtmp/o1/%s.bed.gz | grep -v ^3\'[0-9]\' | grep ^%s | gzip -c > %s_scTEtmp/o2/%s.chr%s.bed.gz'%(filename,filename,chr,filename,filename,chr))
319 |             else:
320 |                 os.system('gunzip -c -f %s_scTEtmp/o1/%s.bed.gz | grep ^%s | gzip -c > %s_scTEtmp/o2/%s.chr%s.bed.gz'%(filename,filename,chr,filename,filename,chr))
321 |         else:
322 |             if chr == '1':
323 |                 os.system('gunzip -c -f %s_scTEtmp/o1/%s.bed.gz | grep -v ^1\'[0-9]\' | grep ^%s | gzip -c > %s_scTEtmp/o2/%s.chr%s.bed.gz'%(filename,filename,chr,filename,filename,chr))
324 |             elif chr == '2':
325 |                 os.system('gunzip -c -f %s_scTEtmp/o1/%s.bed.gz | grep -v ^2\'[0-9]\' | grep ^%s | gzip -c > %s_scTEtmp/o2/%s.chr%s.bed.gz'%(filename,filename,chr,filename,filename,chr))
326 |             elif chr == '3':
327 |                 os.system('gunzip -c -f %s_scTEtmp/o1/%s.bed.gz | grep -v ^3\'[0-9]\' | grep ^%s | gzip -c > %s_scTEtmp/o2/%s.chr%s.bed.gz'%(filename,filename,chr,filename,filename,chr))
328 |             else:
329 |                 os.system('gunzip -c -f %s_scTEtmp/o1/%s.bed.gz | grep ^%s | gzip -c > %s_scTEtmp/o2/%s.chr%s.bed.gz'%(filename,filename,chr,filename,filename,chr))
330 |     else:
331 |         if not UMI: # did not remove the potential PCR duplicates for scRNA-seq
332 |             if chr == '1':
333 |                 os.system('gunzip -c -f %s_scTEtmp/o1/%s.bed.gz | grep -v ^1\'[0-9]\' | grep ^%s | gzip -c > %s_scTEtmp/o2/%s.chr%s.bed.gz'%(filename,filename,chr,filename,filename,chr))
334 |             elif chr == '2':
335 |                 os.system('gunzip -c -f %s_scTEtmp/o1/%s.bed.gz | grep -v ^2\'[0-9]\' | grep ^%s  | gzip -c > %s_scTEtmp/o2/%s.chr%s.bed.gz'%(filename,filename,chr,filename,filename,chr))
336 |             elif chr == '3':
337 |                 os.system('gunzip -c -f %s_scTEtmp/o1/%s.bed.gz | grep -v ^3\'[0-9]\' | grep ^%s  | gzip -c > %s_scTEtmp/o2/%s.chr%s.bed.gz'%(filename,filename,chr,filename,filename,chr))
338 |             else:
339 |                 os.system('gunzip -c -f %s_scTEtmp/o1/%s.bed.gz | grep ^%s | gzip -c > %s_scTEtmp/o2/%s.chr%s.bed.gz'%(filename,filename,chr,filename,filename,chr))
340 |         else:
341 |             if chr == '1':
342 |                 os.system('gunzip -c -f %s_scTEtmp/o1/%s.bed.gz | grep -v ^1\'[0-9]\' | grep ^%s | gzip -c > %s_scTEtmp/o2/%s.chr%s.bed.gz'%(filename,filename,chr,filename,filename,chr))
343 |             elif chr == '2':
344 |                 os.system('gunzip -c -f %s_scTEtmp/o1/%s.bed.gz | grep -v ^2\'[0-9]\' | grep ^%s | gzip -c > %s_scTEtmp/o2/%s.chr%s.bed.gz'%(filename,filename,chr,filename,filename,chr))
345 |             elif chr == '3':
346 |                 os.system('gunzip -c -f %s_scTEtmp/o1/%s.bed.gz | grep -v ^3\'[0-9]\' | grep ^%s | gzip -c > %s_scTEtmp/o2/%s.chr%s.bed.gz'%(filename,filename,chr,filename,filename,chr))
347 |             else:
348 |                 os.system('gunzip -c -f %s_scTEtmp/o1/%s.bed.gz | grep ^%s | gzip -c > %s_scTEtmp/o2/%s.chr%s.bed.gz'%(filename,filename,chr,filename,filename,chr))
349 | 
350 |     CRs = defaultdict(int)
351 |     o = gzip.open('%s_scTEtmp/o2/%s.chr%s.bed.gz'%(filename,filename,chr),'rt')
352 |     for l in o:
353 |         t = l.strip().split('\t')
354 |         CRs[t[3]] += 1
355 |     o.close()
356 | 
357 |     o = gzip.open('%s_scTEtmp/o2/%s.chr%s.count.gz'%(filename,filename,chr),'wt')
358 |     for k in CRs:
359 |         o.write('%s\t%s\n'%(k,CRs[k]))
360 |     o.close()
361 | 
def align(chr, filename, all_annot, glannot, whitelist): #CB
    '''
    **Purpose**
        For each read, align it to the index and assign a TE, gene.

    This is the speed critical part.

    **Arguments**
        chr
            Chromosome name without the 'chr' prefix (e.g. '1'). The prefix is
            added here to build the per-chromosome file names.

        filename
            Sample prefix. Reads are taken from
            <filename>_scTEtmp/o2/<filename>.chr<chr>.bed.gz and the result is
            written to <filename>_scTEtmp/o3/<filename>.chr<chr>.bed.gz

        all_annot
            Passed to glload() when glannot is not supplied.

        glannot
            A pre-loaded annotation index (must expose .buckets and
            .linearData), or a false value to load the index here.

        whitelist
            Container of accepted barcodes (only tested with 'in').

    **Returns**
        None. Writes one tab-separated 'barcode, annotation, count' line per
        (barcode, annotation) pair, sorted by barcode and then annotation.
    '''
    chr = 'chr' + chr

    # makedirs(exist_ok=True) needs no shell, copes with any characters in the
    # name, and is safe when several workers create the folder at once.
    os.makedirs('%s_scTEtmp/o3' % filename, exist_ok=True)

    if not glannot: # Load separately for the multicore pipeline, share the index for the single core pipeline
        glannot = glload(all_annot)

    # Only keep the glbase parts we need.
    buckets = glannot.buckets[chr.replace('chr', '')]
    all_annot = glannot.linearData

    res = {}
    with gzip.open('%s_scTEtmp/o2/%s.%s.bed.gz' % (filename, filename, chr), 'rt') as oh:
        for line in oh:
            t = line.strip().split('\t')
            barcode = t[3]
            if barcode not in whitelist:
                continue
            if barcode not in res:
                res[barcode] = defaultdict(int)
            counts = res[barcode]

            # The chromosome column is not needed: each file is already split by chromosome.
            left = int(t[1])
            rite = int(t[2])

            # The index is bucketed in 10 kb windows; collect every window the read touches.
            left_buck = ((left-1)//10000) * 10000
            right_buck = ((rite)//10000) * 10000

            loc_ids = set()
            for buck in range(left_buck, right_buck+10000, 10000):
                if buck in buckets:
                    loc_ids.update(buckets[buck])

            # Sharing a bucket is not enough, keep only the true (inclusive) overlaps.
            for index in loc_ids:
                item = all_annot[index]
                span = item['loc'].loc
                if rite >= span['left'] and left <= span['right']:
                    counts[item['annot']] += 1

    with gzip.open('%s_scTEtmp/o3/%s.%s.bed.gz' % (filename, filename, chr), 'wt') as oh:
        for bc in sorted(res):
            for gene in sorted(res[bc]):
                oh.write('%s\t%s\t%s\n' % (bc, gene, res[bc][gene]))
422 | 
def Countexpression(filename, allelement, genenumber, cellnumber, hdf5):
    '''
    **Purpose**
        Build the final barcode x feature count table from the merged
        per-chromosome counts in <filename>_scTEtmp/o4/<filename>.bed.gz
        (tab-separated: barcode, feature, count).

    **Arguments**
        filename
            Sample prefix, used for the input path and for the output name.

        allelement
            Iterable of every feature name; becomes the (sorted) column order.

        genenumber
            Minimum number of records a barcode must have in the o4 file to be kept.

        cellnumber
            Maximum number of barcodes to keep (those with most records first).

        hdf5
            False writes '<filename>.csv', True writes '<filename>.h5ad'.

    **Returns**
        (number of barcodes kept, genenumber, filename)
    '''
    o4_path = '%s_scTEtmp/o4/%s.bed.gz' % (filename, filename)

    # Pass 1: number of records per barcode, used to rank and filter the cells.
    whitelist = {}
    with gzip.open(o4_path, 'rt') as o:
        for l in o:
            bc = l.strip().split('\t')[0]
            whitelist[bc] = whitelist.get(bc, 0) + 1

    CRlist = set()
    sortcb = sorted(whitelist.items(), key=lambda item:item[1], reverse=True)
    for n, (bc, nrecords) in enumerate(sortcb):
        if nrecords < genenumber or n >= cellnumber:
            break
        CRlist.add(bc)

    # Pass 2: sum the counts for the retained barcodes only. Two passes keep the
    # memory bounded by the retained cells rather than by every barcode seen.
    res = {}
    with gzip.open(o4_path, 'rt') as genes_oh:
        for l in genes_oh:
            t = l.strip().split('\t')
            if t[0] not in CRlist:
                continue
            cell = res.setdefault(t[0], {})
            cell[t[1]] = cell.get(t[1], 0) + int(t[2])

    gene_seen = sorted(allelement) # Do the sort once;

    #==== save results =====
    if not hdf5: # save as csv
        with open('%s.csv'%filename, 'w') as res_oh:
            res_oh.write('barcodes,')
            res_oh.write('%s\n' % (','.join([str(i) for i in gene_seen])))

            for k in sorted(res):
                cell = res[k]
                l = ["0"] * len(gene_seen) # Avoid all the appends
                for idx, gene in enumerate(gene_seen):
                    if gene in cell:
                        l[idx] = str(cell[gene])
                res_oh.write('%s,%s\n' % (k, ','.join(l)))

    else: # save as hdf5
        # Collect (row, column, value) triplets and build the sparse matrix
        # directly, rather than a dense cells x features table of strings that
        # is cast to int afterwards. The explicit shape also keeps the matrix
        # well-formed when no barcode passes the filters.
        CBs = sorted(res)
        rows, cols, vals = [], [], []
        for row, k in enumerate(CBs):
            cell = res[k]
            for idx, gene in enumerate(gene_seen):
                if gene in cell:
                    rows.append(row)
                    cols.append(idx)
                    vals.append(cell[gene])

        X = scipy.sparse.csr_matrix((vals, (rows, cols)), shape=(len(CBs), len(gene_seen)), dtype=int)
        X.eliminate_zeros() # a dense->sparse conversion would not store explicit zeros either

        obs = pd.DataFrame(index = CBs)
        var = pd.DataFrame(index = gene_seen)
        adata = ad.AnnData(X, var = var, obs = obs)
        adata.X = scipy.sparse.csr_matrix(adata.X) # make sure the stored matrix is CSR
        adata.write('%s.h5ad'%filename)

    #========================

    return len(res), genenumber, filename
501 | 
def timediff(timestart, timestop):
    """
    Format the elapsed time between two datetime-like values.

    The difference (timestop - timestart) must expose .days, .seconds and
    .microseconds. Fractions of a second are dropped.

    Returns a string of the form "<days>d <hours>h <minutes>m <seconds>s".
    """
    delta = timestop - timestart
    whole_seconds = int(delta.seconds + delta.microseconds / 1000000)
    hours, leftover = divmod(whole_seconds, 3600)
    minutes, seconds = divmod(leftover, 60)
    return "%dd %dh %dm %ds" % (delta.days, hours, minutes, seconds)
513 | 


--------------------------------------------------------------------------------
/scTE/miniglbase/README.md:
--------------------------------------------------------------------------------
 1 | # README #
 2 | 
 3 | ### What is glbase3? ###
 4 | 
 5 | This is a staged mini version of glbase.
 6 | 
 7 | You can find the full install here:
 8 | 
 9 | https://github.com/oaxiom/glbase3
10 | 
### License ###
12 | 
13 | glbase is distributed under the MIT license:
14 | {{{
15 |     Copyright (C) 2009-2019 Andrew Hutchins
16 |     
17 |     Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions:
18 |     
19 |     The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software.
20 |     
21 |     Except as contained in this notice, the name(s) of the above copyright holders shall not be used in advertising or otherwise to promote the sale, use or other dealings in this Software without prior written authorization.
22 |     
23 |     THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
24 | }}}
25 | 
26 | 


--------------------------------------------------------------------------------
/scTE/miniglbase/__init__.py:
--------------------------------------------------------------------------------
 1 | """
 2 | 
 3 | Initialise glbase, import all the libraries, set up the environment etc.
 4 | 
 5 | Requires:
 6 | * numpy
 7 | * matplotlib
 8 | * scipy
 9 | * sklearn
10 | * h5py
11 | * networkx
12 | """
13 | 
import sys, os

#-----------------------------------------------------------------------
# Load all of the global configuration options.
try:
    from . import config
except Exception: # not a bare except: Ctrl-C and SystemExit must still propagate
    print("Error: Fatal - glbase3 is not installed correctly, cannot find my own libraries")
    print("       Is the python 'sys.path' correct?")
    # NOTE(review): sys.exit() with no argument reports exit status 0 even
    # though this is a fatal error - confirm whether callers rely on that.
    sys.exit() # no raise if I can't get errors, it's surely a fatal installation problem.

# ----------------------------------------------------------------------
# Test for availability of the core non-standard libs.
# These need to be available as the subsequent load/checking is weak/non-existent.
# ImportError is raised on failure: the LibraryNotFoundError used previously is
# not defined anywhere in this module, so the intended message was replaced
# by a NameError.

try:
    import numpy
    config.NUMPY_AVAIL = True
except Exception as err:
    raise ImportError("Fatal - Numpy is not available or not installed") from err

try:
    import scipy
    config.SCIPY_AVAIL = True
except Exception as err:
    raise ImportError("Fatal - Scipy is not available or not installed") from err

# ----------------------------------------------------------------------
# Now import the rest of my libraries - assumes here they are available.
# If I can get config and errors then these are probably available too.

from .utils import glload
from .location import location
from .genelist import genelist

# export all of the libraries, methods and helpers.
__all__ = ["genelist",
            'config',
            "location",
            "glload",
            ]
55 | 


--------------------------------------------------------------------------------
/scTE/miniglbase/base_genelist.py:
--------------------------------------------------------------------------------
  1 | 
  2 | import copy, pickle, re
  3 | from shlex import split as shlexsplit
  4 | 
  5 | from . import config
  6 | from .location import location
  7 | 
  8 | class _base_genelist:
  9 |     def __init__(self):
 10 |         """
 11 |         (Internal)
 12 |         This is the base derived class for all genelists.
 13 |         It contains methods available to all implementations of genelist.
 14 |         """
 15 |         self.name = None
 16 |         self.linearData = None
 17 | 
 18 |     def __repr__(self):
 19 |         return("<base genelist class>")
 20 | 
 21 |     def __in__(self, key):
 22 |         """
 23 |         (Override)
 24 | 
 25 |         Confer:
 26 |         if "key" in genelist:
 27 |         """
 28 |         return(key in list(self.keys()))
 29 | 
 30 |     def __bool__(self):
 31 |         """
 32 |         Fixes:
 33 |         if genelist: # contains something
 34 |             True
 35 | 
 36 |         and fixes:
 37 | 
 38 |         len(genelist) = 0
 39 |         if genelist: # Would pass even if the genelist is empty
 40 |             False
 41 | 
 42 |         """
 43 |         return(len(self) > 0)
 44 | 
 45 |     def __shallowcopy__(self):
 46 |         raise Exception("__shallowcopy__() is NOT supposrted for genelists, use gl.deepcopy() or gl.shallowcopy()")
 47 | 
 48 |     def __deepcopy__(self, fake_arg):
 49 |         raise Exception("__deepcopy__() is NOT supported for genelists, use gl.deepcopy() or gl.shallowcopy()")
 50 | 
 51 |     def deepcopy(self):
 52 |         """
 53 |         Confer copy to mean a deepcopy as opposed to a shallowcopy.
 54 | 
 55 |         This is required as genelists are compound lists.
 56 |         """
 57 |         return(pickle.loads(pickle.dumps(self, -1))) # This is 2-3x faster and presumably uses less memory
 58 | 
 59 |     def shallowcopy(self):
 60 |         """
 61 |         (New)
 62 | 
 63 |         Some weird behaviour here, I know, this is so I can still get access to
 64 |         the shallow copy mechanism even though 90% of the operations are copies.
 65 |         """
 66 |         return(copy.copy(self)) # But doesnt this just call __copy__() anyway?
 67 | 
 68 |     def __len__(self):
 69 |         """
 70 |         (Override)
 71 |         get the length of the list
 72 |         """
 73 |         return(len(self.linearData))
 74 | 
 75 |     def __int__(self):
 76 |         """
 77 |         (Override)
 78 |         get the length of the list
 79 |         NOTE: It's possible this is a bug/feature.
 80 |         I don't remove it at the moment as I'm not sure if it is used anywhere.
 81 | 
 82 |         """
 83 |         return(len(self.linearData))
 84 | 
 85 |     def __iter__(self):
 86 |         """
 87 |         (Override)
 88 |         make the geneList behave like a normal iterator (list)
 89 |         """
 90 |         for n in self.linearData:
 91 |             yield n
 92 | 
 93 |     def __getitem__(self, index):
 94 |         """
 95 |         (Override)
 96 |         confers a = geneList[0] behaviour
 97 | 
 98 |         This is a very slow way to access the data, and may be a little inconsistent in the things
 99 |         it returns.
100 | 
101 |         NOTE:
102 |         a = genelist[0] # returns a single dict
103 |         a = genelist[0:10] # returns a new 10 item normal python list.
104 |         a = genelist["name"] returns a python list containing a vertical slice of all of the "name" keys
105 | 
106 |         """
107 |         newl = False
108 |         if isinstance(index, int):
109 |             # this should return a single dictionary.
110 |             return(self.linearData[index])
111 |         elif isinstance(index, str):
112 |             # returns all labels with that item.
113 |             return(self._findAllLabelsByKey(index))
114 |         elif isinstance(index, slice):
115 |             # returns a new genelist corresponding to the slice.
116 |             newl = self.shallowcopy()
117 |             newl.linearData = utils.qdeepcopy(self.linearData[index]) # separate the data so it can be modified.
118 |             newl._optimiseData()
119 |         return(newl) # deep copy the slice.
120 | 
121 |     def __setitem__(self, index, *args):
122 |         """
123 |         (Override)
124 |         Block key editing.
125 |         """
126 |         raise AssertionError
127 | 
128 |     def __hash__(self):
129 |         """
130 |         (Override)
131 | 
132 |         compute a sensible hash value
133 |         """
134 |         try:
135 |             return(hash(self.name + str(self[0]) + str(self[-1]) + str(len(self)))) # hash data for comparison.
136 |         except Exception:
137 |             try:
138 |                 return(hash(self.name + str(self[0]) + str(self[-1]))) # len() probably not available (delayedlist?).
139 |             except Exception: # I bet the list is empty.
140 |                 return(hash(self.name))
141 | 
142 |     def __add__(self, gene_list):
143 |         """
144 |         (Override)
145 |         confer append like behaviour: c = a + b
146 |         keeps duplicates (just concatenate's lists)
147 |         """
148 |         mkeys = self._collectIdenticalKeys(gene_list)
149 |         if not mkeys: # unable to match.
150 |             config.log.warning("No matching keys, the resulting list would be meaningless")
151 |             return(False)
152 |         newl = self.deepcopy()
153 |         newl.linearData.extend(copy.deepcopy(gene_list.linearData))
154 |         newl._optimiseData()
155 |         return(newl)
156 | 
157 |     def __eq__(self, gene_list):
158 |         """
159 |         (Internal)
160 |         Are the lists equivalent?
161 |         lists now, must only have one identical key.
162 | 
163 |         This is just testing the keys...
164 |         Wrong...
165 |         """
166 |         # check the hash's first to see if they are identical.
167 |         # This is diabled as it can be very slow.
168 |         #if self.__hash__() == gene_list.__hash__():
169 |         #    return(True)
170 | 
171 |         for key in self.linearData[0]:
172 |             if key in gene_list.linearData[0]:
173 |                 return(True) # just one key in common required.
174 |         return(False)
175 | 
176 |     def __ne__(self, gene_list):
177 |         """
178 |         (Internal)
179 |         Are the lists equivalent?
180 |         ie do they have the same keys?
181 |         """
182 |         return(not self.__eq__(gene_list))
183 | 
184 |     def keys(self):
185 |         """
186 |         return a list of all the valid keys for this geneList
187 |         """
188 |         return([key for key in self.linearData[0]]) # Not exhaustive
189 | 
190 |     def _guessDataType(self, value):
191 |         """
192 |         (Internal)
193 | 
194 |         Take a guess at the most reasonable datatype to store value as.
195 |         returns the resulting data type based on a list of logical cooercions
196 |         (explain as I fail each cooercion).
197 |         Used internally in _loadCSV()
198 |         I expect this will get larger and larger with new datatypes, so it's here as
199 |         as a separate function.
200 | 
201 |         Datatype coercion preference:
202 |         float > list > int > location > string
203 |         """
204 | 
205 |         try: # see if the element is a float()
206 |             if "." in value: # if no decimal point, prefer to save as a int.
207 |                 return(float(value))
208 |             else:
209 |                 raise ValueError
210 |         except ValueError:
211 |             try:
212 |                 # Potential error here if it is a list of strings?
213 |                 if '[' in value and ']' in value and ',' in value and '.' in value: # Probably a Python list of floats
214 |                     return([float(i) for i in value.strip(']').strip('[').split(',')])
215 |                 elif '[' in value and ']' in value and ',' in value: # Probably a Python list of ints
216 |                     return([int(i) for i in value.strip(']').strip('[').split(',')])
217 |                 else:
218 |                     raise ValueError
219 |             except ValueError:
220 |                 try: # see if it's actually an int?
221 |                     return(int(value))
222 |                 except ValueError:
223 |                     try: # see if I can cooerce it into a location:
224 |                         return(location(loc=value))
225 |                     except (TypeError, IndexError, AttributeError, AssertionError, ValueError): # this is not working, just store it as a string
226 |                         return(str(value).strip())
227 |         return("") # return an empty datatype.
228 |         # I think it is possible to get here. If the exception at int() or float() returns something other than a
229 |         # ValueError (Unlikely, Impossible?)
230 | 
231 |     def _processKey(self, format, column):
232 |         """
233 |         (Internal)
234 |         the inner part of _loadCSV() to determine what to do with the key.
235 |         Better in here too for security.
236 |         """
237 | 
238 |         d = {}
239 |         for key in format:
240 |             if isinstance(format[key], str) and "location" in format[key]:
241 |                 # locations are very common, add support for them out of the box:
242 |                 d[key] = eval(format[key])
243 |             else:
244 |                 d[key] = self._guessDataType(column[format[key]])
245 | 
246 |         return(d)
247 | 
248 |     def save(self, filename=None, compressed=False):
249 |         """
250 |         **Purpose**
251 | 
252 |             Save the genelist as a binary representation.
253 |             This is guaranteed to be available for all geneList representations, with
254 |             the only exception being the delayedlists. As that wouldn't
255 |             make any sense as delayedlists are not copied into memory.
256 | 
257 |             You can use this method to cache the file. It's particularly useful for large files
258 |             that get processed once but are then used a lot.
259 | 
260 |             loading the list back into memory is relatively quick.
261 | 
262 |             list = glload("path/to/filename.glb")
263 | 
264 |             I generally used extension is glb. Although you can use
265 |             whatever you like.
266 | 
267 |         **Arguments**
268 | 
269 |             filename
270 |                 filename (and path, if you like) to save the file to
271 | 
272 |             compressed (Optional, default=False)
273 |                 use compression (not currently implemented)
274 | 
275 |         **Result**
276 | 
277 |             returns None
278 |             Saves a binary representation of the geneList
279 | 
280 |         """
281 |         assert filename, "no filename specified"
282 | 
283 |         oh = open(filename, "wb")
284 |         if compressed:
285 |             config.log.warning("compression not currently implemented, saving anyway")
286 |             pickle.dump(self, oh, -1)
287 |         else:
288 |             pickle.dump(self, oh, -1)
289 |         oh.close()
290 |         config.log.info("Saved binary version of list: '%s'" % filename)
291 | 
292 |     def from_pandas(self, pandas_data_frame):
293 |         """
294 |         **Purpose**
295 | 
296 |             Convert a pandas dataFrame to a genelist
297 | 
298 |             NOTE: This is an INPLACE method that will REPLACE any exisiting data
299 |             in the
300 | 
301 |         **Arguments**
302 | 
303 |             pandas_data_frame (Required)
304 |                 The pandas data frame to convert
305 | 
306 |         **Result**
307 |             None
308 |             The object is populated by
309 | 
310 |         """
311 |         if len(self) > 0:
312 |             config.log.warning('genelist.from_pandas() will overwrite the existing data in the genelist')
313 | 
314 |         newl = []
315 |         key_names = pandas_data_frame.columns
316 |         for index, row in pandas_data_frame.iterrows():
317 |             newitem = {}
318 |             for k, item in zip(key_names, row):
319 |                 newitem[k] = item
320 |             newl.append(newitem)
321 |         self.linearData = newl
322 |         self._optimiseData()
323 | 
324 |         config.log.info("genelist.from_pandas() imported dataFrame")
325 | 


--------------------------------------------------------------------------------
/scTE/miniglbase/config.py:
--------------------------------------------------------------------------------
 1 | """
 2 | config.py
 3 | 
 4 | config must be imported before any other glbase library.
 5 | 
 6 | """
 7 | 
 8 | import logging
 9 | 
10 | # -------------- Versioning data
11 | GLBASE_VERSION = "1.1105"
12 | 
13 | # -------------- General options
14 | 
15 | SILENT = False # set this to True to silence all glbase output. Only works at startup
16 | DEBUG = True
17 | do_logging = True
18 | 
19 | # flags for the availability of libraries
20 | MATPLOTLIB_AVAIL = False # required
21 | NUMPY_AVAIL = False # required
22 | SCIPY_AVAIL = False # required
23 | SKLEARN_AVAIL = False # required
24 | H5PY_AVAIL = False # Optional.
25 | NETWORKX_AVAIL = False # optional
26 | PYDOT_AVAIL = False # optional
27 | NUMEXPR_AVAIL = False # Optional
28 | PYGRAPHVIZ_AVAIL = False # Optional
29 | 
30 | # Some simple options for printing genelists
31 | NUM_ITEMS_TO_PRINT = 3 # number of items to print by default.
32 | PRINT_LAST_ITEM = True
33 | 
34 | # size of buckets for collide() and overlap()
35 | # If this is changed then glload will not work correctly.
36 | bucket_size = 10000 # in bp - tested, seems a reasonable choice.
37 | 
38 | # -------------- set up the logger here.
39 | logging.basicConfig(level=logging.DEBUG,
40 |                     format='%(levelname)-8s: %(message)s',
41 |                     datefmt='%m-%d %H:%M'),
42 | 
43 | 
44 | log = logging.getLogger('glbase3')
45 | log.setLevel(logging.INFO)
46 | 


--------------------------------------------------------------------------------
/scTE/miniglbase/location.py:
--------------------------------------------------------------------------------
  1 | """
  2 | 
  3 | location.py
  4 | 
  5 | part of glbase.
  6 | 
  7 | This class is an internal class that implements a more convenient way to manipulate
genomic coordinates.
  9 | 
 10 | TODO:
 11 | . add a 'in' code clause e.g.:
 12 |     if 1000 in location: (see if 1000 > left & < right)
    if a_location in b_location: (execute a collide())
 14 | 
 15 | """
 16 | 
 17 | import copy, pickle
 18 | 
 19 | class location:
 20 |     def __init__(self, loc=None, chr=None, left=None, right=None):
 21 |         if isinstance(loc, location):
 22 |             # It's actually already a loc.
 23 |             # I want to copy it and leave.
 24 |             self.loc = copy.copy(loc.loc)
 25 |         else:
 26 |             if loc:
 27 |                 s = loc.lower().replace(",", "") # ucsc includes commas, remove them so you can cut and paste
 28 |                 t = s.split(":")
 29 |                 self.loc = {"chr": t[0].strip("chr").rstrip().upper(), "left":int(t[1].split("-")[0]), "right":int(t[1].split("-")[1])}
 30 |             else:
 31 |                 self.loc = {"chr": str(chr).strip("chr").rstrip().upper(), "left": int(left), "right": int(right)}
 32 |         self.__update() # make sure the locstring is valid:
 33 | 
 34 |     def __eq__(self, other):
 35 |         if other:
 36 |             if isinstance(other, str):
 37 |                 return(str(self) == str(other.replace(",", ""))) # use string comparison.
 38 | 
 39 |             # use a faster ? dict comparison, or throw an exception, as this item probably not a <location>
 40 |             if self.loc["chr"] == other.loc["chr"]:
 41 |                 if self.loc["left"] == other.loc["left"]:
 42 |                     if self.loc["right"] == other.loc["right"]:
 43 |                         return(True)
 44 |         return(False)
 45 | 
 46 |     def __lt__(self, other): # deprecated in Python3
 47 |         # Make locations sortable
 48 |         if self.loc['chr'] < other.loc['chr']:
 49 |             return True
 50 |         elif self.loc['chr'] == other.loc['chr']:
 51 |             if self.loc['left'] < other.loc['left']:
 52 |                 return True
 53 |             elif self.loc['left'] == other.loc['left']: # For ties
 54 |                 return False
 55 |             return False
 56 |         #self.loc['chr'] > other.loc['chr']:
 57 |         return False
 58 | 
 59 |     def __hash__(self):
 60 |         return(hash(self._loc_string))
 61 |     
 62 |     def __deepcopy__(self, memo):
 63 |         return(pickle.loads(pickle.dumps(self, -1))) # This is 2-3x faster and presumably uses less memory
 64 |     
 65 |     def __bool__(self):
 66 |         return(True)
 67 | 
 68 |     def __repr__(self):
 69 |         return("<location %s>" % (self._loc_string))
 70 | 
 71 |     def __len__(self):
 72 |         # work out the span.
 73 |         return(max([0, self.loc["right"] - self.loc["left"]]))
 74 | 
 75 |     def split(self, value=None):
 76 |         # ignores the 'value' argument completely and returns a three-ple
 77 |         return( (self.loc["chr"], self.loc["left"], self.loc["right"]) )
 78 | 
 79 |     def __update(self):
 80 |         self._loc_string = None
 81 |         try:
 82 |             self._loc_string = "chr%s:%s-%s" % (self.loc["chr"].strip("chr"), self.loc["left"], self.loc["right"])
 83 |         except Exception: # chr possibly sets of strings ... etc.
 84 |             self._loc_string = "chr%s:%s-%s" % (self.loc["chr"], self.loc["left"], self.loc["right"])
 85 |             # I can't import my bunch of errors, as location is used in that module. So I spoof an assert
 86 |             if not self._loc_string: # failed to make a valid string...
 87 |                 raise "Bad location formatting"
 88 | 
 89 |     def __getitem__(self, key):
 90 |         if key == "string":
 91 |             self.__update() # only update when accessed.
 92 |             return(self._loc_string)
 93 |         elif key == "dict":
 94 |             return(self.loc)
 95 |         return(self.loc[key])
 96 | 
 97 |     def __setitem__(self, key, value):
 98 |         self.loc[key] = value
 99 |         self.__update()
100 | 
101 |     def __str__(self):
102 |         return(self._loc_string)
103 | 
104 |     """
105 |     these methods below should copy the location and send a modified version back.
106 |     """
107 |     def expand(self, base_pairs):
108 |         new = copy.deepcopy(self)
109 |         new.loc["left"] -= base_pairs
110 |         new.loc["right"] += base_pairs
111 |         new.__update()
112 |         return(new)
113 | 
114 |     def expandLeft(self, base_pairs):
115 |         new = copy.deepcopy(self)
116 |         new.loc["left"] -= base_pairs
117 |         new.__update()
118 |         return(new)
119 | 
120 |     def expandRight(self, base_pairs):
121 |         new = copy.deepcopy(self)
122 |         new.loc["right"] += base_pairs
123 |         new.__update()
124 |         return(new)
125 | 
126 |     def shrink(self, base_pairs):
127 |         new = copy.deepcopy(self)
128 |         new.loc["left"] += base_pairs
129 |         new.loc["right"] -= base_pairs
130 |         new.__update()
131 |         return(new)
132 | 
133 |     def shrinkLeft(self, base_pairs):
134 |         new = copy.deepcopy(self)
135 |         new.loc["left"] += base_pairs
136 |         new.__update()
137 |         return(new)
138 | 
139 |     def shrinkRight(self, base_pairs):
140 |         new = copy.deepcopy(self)
141 |         new.loc["right"] -= base_pairs
142 |         new.__update()
143 |         return(new)
144 | 
145 |     def pointLeft(self):
146 |         """
147 |         get a new location at the exact left of the coordinate
148 |         """
149 |         new = copy.deepcopy(self)
150 |         new.loc["right"] = new.loc["left"]
151 |         new.__update()
152 |         return(new)
153 |         
154 |     def pointRight(self):
155 |         """
156 |         get a new location at the exact right of the coordinate
157 |         """
158 |         new = copy.deepcopy(self)
159 |         new.loc["left"] = new.loc["right"]
160 |         new.__update()
161 |         return(new)
162 | 
163 |     def pointify(self):
164 |         new = copy.deepcopy(self)
165 |         centre = (self.loc["left"] + self.loc["right"]) // 2
166 |         new.loc = {"chr": self.loc["chr"], "left": centre, "right": centre}
167 |         new.__update()
168 |         return(new)
169 | 
170 |     def collide(self, loc):
171 |         if loc["chr"] != self["chr"]:
172 |             return(False)
173 |         return(self.loc["right"] >= loc.loc["left"] and self.loc["left"] <= loc.loc["right"])
174 | 
175 |     def qcollide(self, loc):
176 |         """
177 |         **Purpose**
178 |             perform a collision with another location object.
179 |             This assumes you have already checked the locations are on the same chromosome.
180 | 
181 |         **Returns**
182 |             True or False
183 |         """
184 |         return(self.loc["right"] >= loc.loc["left"] and self.loc["left"] <= loc.loc["right"]) # nice one-liner
185 | 
186 |     def distance(self, loc):
187 |         """
188 |         **Purpose**
189 |             calculate the distance between two locations.
190 | 
191 |         **Returns**
192 |             an integer indicating the distance, note that
193 |             the chromosomes should be the same or it will raise an
194 |             exception. distance() should not be used as a test for
195 |             overlap. use collide() for that.
196 |         """
197 |         assert self["chr"] == loc["chr"], "chromosomes are not the same, %s vs %s" % (self, loc)
198 |         return(self.qdistance(loc))
199 | 
200 |     def qdistance(self, loc):
201 |         """
202 |         (Internal)
203 |         ignore the assert.
204 |         """
205 |         centreA = (self.loc["left"] + self.loc["right"]) // 2
206 |         centreB = (loc["left"] + loc["right"]) // 2
207 |         return(centreA - centreB)
208 | 
209 |     def __sub__(self, loc):
210 |         """
211 |         **Purpose**
212 |             Allow things like:
213 |                 
214 |             distance = locA - locB
215 |         """
216 |         return(self.distance(loc))
217 | 
218 |     def offset(self, base_pairs):
219 |         """
220 |         get a new location offset from the 5' end by n base pairs
221 |         returns a point location.
222 |         """
223 |         new = copy.deepcopy(self)
224 |         new.loc["left"] += base_pairs
225 |         new.loc["right"] = new.loc["left"]
226 |         new.__update()
227 |         return(new)
228 | 
229 |     def keys(self):
230 |         """
231 |         Get the keys
232 |         """
233 |         return([i for i in self.loc])
234 |         
if __name__ == "__main__":
    import timeit

    # Quick benchmark: time building a location from a string and pointifying it,
    # then report the average cost of one pass in microseconds.
    s = "a = location(loc='chr1:1000-2000').pointify()"
    t = timeit.Timer(stmt=s, setup="from location import location")
    elapsed = t.timeit(number=100000)
    print("%.2f usec/pass" % (1000000 * elapsed/100000))


--------------------------------------------------------------------------------
/scTE/miniglbase/utils.py:
--------------------------------------------------------------------------------
 1 | """
 2 | Utilities
 3 | 
 4 | Various utilities to support the genome scanning scripts.
 5 | 
 6 | Many of these predate glbase3, but are a little tricky to remove as I am not sure where
 7 | they are used (if at all).
 8 | 
 9 | So excuse the terrible code in places. I will deprecate occasional functions from this.
10 | 
11 | R=[AG], Y=[CT], K=[GT], M=[AC], S=[GC], W=[AT], and the four-fold
12 | degenerate character N=[ATCG]
13 | 3-fold degenerate motifs are not used like the Lander paper.
14 | 
15 | """
16 | 
17 | import sys, os, pickle
18 | 
19 | from . import config
20 | 
def glload(filename):
    """
    **Purpose**
        Load a glbase binary file
        (Actually a Python pickle)

        NOTE(security): unpickling can execute arbitrary code, so only load
        files that come from a trusted source.

    **Arguments**
        filename (Required)
            the filename of the glbase binary file to load.

    **Returns**
        The glbase object previously saved as a binary file

    **Raises**
        AssertionError if the file does not exist.
        pickle.UnpicklingError if the file is not a valid pickle.
    """
    assert os.path.exists(os.path.realpath(filename)), "File '%s' not found" % filename

    try:
        # 'with' guarantees the handle is closed even when unpickling fails
        with open(os.path.realpath(filename), "rb") as oh:
            newl = pickle.load(oh)
    except pickle.UnpicklingError as err:
        # This module never defined BadBinaryFileFormatError, so the old
        # 'raise BadBinaryFileFormatError(...)' surfaced as a NameError.
        # Re-raise the real error with the offending filename instead.
        raise pickle.UnpicklingError("File '%s' is not a valid glbase binary file" % filename) from err

    # Recalculate the _optimiseData for old lists, and new features
    try:
        # Attribute probes: a missing attribute raises and triggers the rebuild below
        if newl.qkeyfind:
            pass
        if "loc" in list(newl.keys()) or "tss_loc" in list(newl.keys()): # buckets are only present if a loc key is available.
            if newl.buckets: # added in 0.381, only in objects with tss_loc or loc key.
                pass
    except Exception:
        config.log.warning("Old glb format, will rebuild buckets and/or qkeyfind, consider resaving")
        newl._optimiseData()

    try:
        cons = len(newl._conditions) # expression-like object
        config.log.info("Loaded '%s' binary file with %s items, %s conditions" % (filename, len(newl), cons))
    except AttributeError:
        config.log.info("Loaded '%s' binary file with %s items" % (filename, len(newl)))
    return(newl)
60 | 


--------------------------------------------------------------------------------
/scTE/scatacseq.py:
--------------------------------------------------------------------------------
  1 | '''
  2 | 
  3 | The scATAC-seq data comes as three files, P1, P2 and the barcode, and there is no UMI
  4 | 
  5 | You can just align P1 and P2 with your favourite aligner (we prefer STAR with these settings):
  6 | 
  7 | ****
  8 | teopts=' --outFilterMultimapNmax 100 --winAnchorMultimapNmax 100 --outSAMmultNmax 1 --outSAMtype BAM SortedByCoordinate --twopassMode Basic --outWigType wiggle --outWigNorm RPM'
  9 | opts='--runRNGseed 42 --runThreadN 12 --readFilesCommand zcat '
 10 | 
 11 | genome_mm10='--genomeDir mm10_gencode_vM21_starsolo/SAindex'
 12 | genome_hg38='--genomeDir hg38_gencode_v30_starsolo/SAindex'
 13 | 
 14 | # p1 = read
 15 | # p2 = barcode and UMI
 16 | # Make sure you set the correct genome index;
 17 | STAR $opts $teopts $genome_hg38 --outFileNamePrefix ss.${out} --readFilesIn ${p1} ${p2}
 18 | ****
 19 | 
 20 | This script will then reprocess the BAM file, and put the BARCODE into CR SAM tag and spoof a UMI
 21 | 
 22 | The UMI is generated by incrementing the sequence, so, each UMI is up to 4^14 (about 268 million).
 23 | I guess there remains a chance of a clash, but it should be so rare as to be basically impossible.
 24 | 
 25 | Require pysam
 26 | 
 27 | 
 28 | See also: bin/pack_scatacseq
 29 | 
 30 | '''
 31 | 
 32 | import sys,os
 33 | import gzip
 34 | import argparse
 35 | import logging
 36 | import dbm
 37 | import time
 38 | import random
 39 | 
 40 | try:
 41 |     import pysam
 42 | except ImportError:
 43 |     pass # fail silently
 44 | 
 45 | def generate_mismatches(seq):
 46 |     """
 47 |     **Purpose**
 48 |         Generate all 1 bp mismatches for the sequence
 49 |     """
 50 |     newseqs = []
 51 | 
 52 |     for pos in range(len(seq)):
 53 |         newseqs += list(library([[i] for i in seq[0:pos]] + ["ACGT"] + [[i] for i in seq[pos:-1]]))
 54 | 
 55 |     return set(newseqs)
 56 | 
 57 | def fastq(file_handle):
 58 |     """
 59 |     Generator object to parse a FASTQ file
 60 | 
 61 |     """
 62 |     name = "dummy"
 63 |     while name != "":
 64 |         name = file_handle.readline().strip()
 65 |         seq = file_handle.readline().strip()
 66 |         strand = file_handle.readline().strip()
 67 |         qual = file_handle.readline().strip()
 68 | 
 69 |         yield {"name": name, "strand": strand, "seq": seq, "qual": qual}
 70 |     return
 71 | 
 72 | def library(args):
 73 |     """
 74 |     Sequence generator iterator
 75 | 
 76 |     """
 77 |     if not args:
 78 |         yield ""
 79 |         return
 80 |     for i in args[0]:
 81 |         for tmp in library(args[1:]):
 82 |             yield i + tmp
 83 |     return
 84 | 
 85 | def atacBam2bed(filename, out, CB, UMI, noDup, num_threads):
 86 | 
 87 |     sample=filename.split('/')[-1].replace('.bam','')
 88 | 
 89 |     if sys.platform == 'darwin': # Mac OSX has BSD sed
 90 |         switch = '-E'
 91 |     else:
 92 |         switch = '-r'
 93 | 
 94 |     if not CB:
 95 |         # Put the sample name in the barcode slot
 96 |         if noDup:
 97 |             os.system('bamToBed -i %s -bedpe | awk -F ["\t":] \'{OFS="\t"}{print $1,$2,$6,"%s"}\' | sed %s \'s/^chr//g\' | awk \'!x[$0]++\' | gzip -c > %s_scTEtmp/o1/%s.bed.gz' % (filename, sample,switch, out, out))
 98 |         else:
 99 |             os.system('bamToBed -i %s -bedpe | awk -F ["\t":] \'{OFS="\t"}{print $1,$2,$6,"%s"}\' | sed %s \'s/^chr//g\' | gzip -c > %s_scTEtmp/o1/%s.bed.gz' % (filename, sample,switch, out, out))
100 |     else:
101 |         if noDup:
102 |             os.system('bamToBed -i %s -bedpe | awk -F ["\t":] \'{OFS="\t"}{print $1,$2,$6,$7}\'  | sed %s \'s/^chr//g\' | awk \'!x[$0]++\' | gzip -c > %s_scTEtmp/o1/%s.bed.gz' % (filename, switch, out, out))
103 | #             os.system('bamToBed -i %s  -bedpe | awk -F ["\t":] \'{OFS="\t"}{print $1,$2,$3,$4}\'  | sed %s \'s/^chr//g\' | awk \'!x[$0]++\' | gzip -c > %s_scTEtmp/o1/%s.bed.gz' % (filename, switch, out, out))
104 |         else:
105 |             os.system('bamToBed -i %s -bedpe | awk -F ["\t":] \'{OFS="\t"}{print $1,$2,$6,$7}\'  | sed %s \'s/^chr//g\' | gzip -c > %s_scTEtmp/o1/%s.bed.gz' % (filename, switch, out, out))
106 | 
def para_atacBam2bed(filename, CB, out, noDup):
    """
    Per-file variant of the BAM to BED conversion: makes sure
    <out>_scTEtmp/o0 exists and then runs a bamToBed | awk | sed | gzip
    shell pipeline through os.system.

    filename : BAM file handed to bamToBed
    CB       : when false, the sample name (BAM basename without '.bam') is
               printed as the 4th column and the output is named after the sample
    out      : output prefix used to build the <out>_scTEtmp/... paths
    noDup    : when true, an extra awk stage drops repeated lines
    """
    # The old existence test looked for '%ss_scTEtmp/o0' (stray 's'), so it never
    # matched the directory that was then created. exist_ok makes the call
    # safe to repeat, also when several workers get here at the same time.
    os.makedirs('%s_scTEtmp/o0' % out, exist_ok=True)

    sample=filename.split('/')[-1].replace('.bam','')

    if sys.platform == 'darwin': # Mac OSX has BSD sed
        switch = '-E'
    else:
        switch = '-r'

    if not CB:
        if noDup:
            os.system('bamToBed -i %s -bedpe | awk -F ["\t":] \'{OFS="\t"}{print $1,$2,$6,"%s"}\' | sed %s \'s/^chr//g\' | awk \'!x[$0]++\' | gzip -c > %s_scTEtmp/o0/%s.bed.gz' %(filename, sample, switch, out, sample))
        else:
            os.system('bamToBed -i %s -bedpe | awk -F ["\t":] \'{OFS="\t"}{print $1,$2,$6,"%s"}\' | sed %s \'s/^chr//g\' | gzip -c > %s_scTEtmp/o0/%s.bed.gz' %(filename, sample, switch, out, sample))
    else:
        if noDup:
            # NOTE(review): unlike the other branches this one runs bamToBed without
            # -bedpe, prints fields 1-4 and writes under o1/ (which this function
            # does not create) instead of o0/. Left unchanged -- confirm it is intended.
            os.system('bamToBed -i %s | awk -F ["\t":] \'{OFS="\t"}{print $1,$2,$3,$4}\'  | sed %s \'s/^chr//g\' | awk \'!x[$0]++\' | gzip -c > %s_scTEtmp/o1/%s.bed.gz' % (filename, switch, out, out))
        else:
            os.system('bamToBed -i %s -bedpe | awk -F ["\t":] \'{OFS="\t"}{print $1,$2,$6,$7}\' | sed %s \'s/^chr//g\' | gzip -c > %s_scTEtmp/o0/%s.bed.gz' % (filename, switch, out, out))
129 | 
def load_expected_whitelist(filename, logger):
    """
    **Purpose**
        Load the expected whitelist and output a set

    **Arguments**
        filename (Required)
            text file with one barcode per line.

        logger (Required)
            object with an info() method, used to report the barcode count.

    **Returns**
        A set of the barcodes, stripped of surrounding whitespace.
        Blank lines are ignored so that an empty string never counts as
        an expected barcode.
    """
    # 'with' closes the file even if reading fails part-way through
    with open(filename, 'rt') as oh:
        stripped = (line.strip() for line in oh)
        expected_whitelist = {barcode for barcode in stripped if barcode}

    logger.info('Found {0:,} expected barcodes'.format(len(expected_whitelist)))

    return expected_whitelist
147 | 
def build_barcode_dict(barcode_filename, save_whitelist=False, expected_whitelist=False,
    gzip_file=True, logger=False, ondisk=True):
    '''
    **Purpose**
        The BAM and the FASTQ are not guaranteed to be in the same order, so I need to make a look up for
        the read ID and the barcode

    **Arguments**
        barcode_filename (Required)
            FASTQ file holding the barcode reads.

        save_whitelist (Optional, default=False)
            filename to save the whitelist of barcodes to (i.e. the ones actually observed,
            after any correction against expected_whitelist)

        expected_whitelist (Optional, default=False)
            collection of expected barcodes. Barcodes not in it are corrected to a
            whitelist entry that is one substitution away, or discarded if there is none.

        gzip_file (Optional, default=True)
            open barcode_filename with gzip.

        logger (Optional, default=False)
            object with info()/warning() methods; a logging logger named after
            this module is used when none is given.

        ondisk (Optional, default=True)
            keep the lookup in a dbm file instead of an in-memory dict.

    **Returns**
        A tuple (barcode_lookup, expected_whitelist, tmpfilename):
        barcode_lookup maps <readid>: <barcode> (a dict, or an open dbm object when ondisk),
        expected_whitelist is passed back unchanged, and tmpfilename is the dbm
        filename (None when not ondisk).
    '''
    assert barcode_filename, 'barcode_filename is required'

    if not logger:
        # The default is False, which would make the first log call fail
        logger = logging.getLogger(__name__)

    if expected_whitelist:
        logger.info('Checking against the expected whitelist and correcting barcodes')
    else:
        logger.warning('Not checking the barcodes against an expected whitelist, barcodes will not be corrected')

    bad_barcodes = 0
    rescued_barcodes = 0
    total_reads = 0
    # Kept in memory on purpose: gdbm/ndbm objects have no values() method and
    # hand back bytes, so the barcodes cannot be recovered portably from the dbm.
    observed_barcodes = set()

    if ondisk:
        # Only the basename goes into the temp name; with a directory in
        # barcode_filename the old name pointed into a non-existent './tpm_<dir>/...'
        tmpfilename = './tpm_{0:}_{1:}_{2:}.dbm'.format(os.path.basename(barcode_filename), time.time(), random.randint(0, 10000))
        barcode_lookup = dbm.open(tmpfilename, 'n')
    else:
        tmpfilename = None
        barcode_lookup = {}

    opener = gzip.open if gzip_file else open

    with opener(barcode_filename, 'rt') as oh:
        for fq in fastq(oh):
            if not fq['name']:
                # Empty record (end of file): neither a read nor a barcode
                continue

            total_reads += 1

            barcode = fq['seq']
            if 'N' in barcode: # Discard this barcode
                bad_barcodes += 1
                continue

            if expected_whitelist and barcode not in expected_whitelist:
                # barcode not in the whitelist
                # see if we can rescue it:
                rescued = False
                # sorted() so the chosen correction does not depend on set ordering
                for mm in sorted(generate_mismatches(barcode)):
                    if mm in expected_whitelist:
                        barcode = mm # Corrected
                        rescued_barcodes += 1
                        rescued = True
                        break
                if not rescued:
                    bad_barcodes += 1 # unrecoverable
                    continue

            name = fq['name'].split(' ')[0].lstrip('@') # Any other types seen?
            barcode_lookup[name] = barcode
            observed_barcodes.add(barcode)

            if total_reads % 10000000 == 0:
                logger.info('Processed: {:,} barcode reads'.format(total_reads))

    logger.info('Processed: {:,} barcode reads from the FASTQ'.format(total_reads))
    logger.info('Bad reads with no barcode {:,} reads'.format(bad_barcodes))
    logger.info('Rescued {:,} reads'.format(rescued_barcodes))
    logger.info('Found {:,} valid reads'.format(len(barcode_lookup)))
    logger.info('Found {:,} valid barcodes'.format(len(observed_barcodes)))

    if save_whitelist:
        with open(save_whitelist, 'wt') as whitelist_out:
            for k in sorted(observed_barcodes):
                whitelist_out.write('%s\n' % (k))
        logger.info('Saved whitelist: {0}'.format(save_whitelist))

    return barcode_lookup, expected_whitelist, tmpfilename
230 | 
def parse_bam(infile, barcode_lookup, outfile, barcode_corrector, logger):
    """
    **Purpose**
        Parse the BAM file and insert the CR: tag holding the barcode.
        Reads are only written once both reads sharing a query name have
        been seen; the pair is then written together.

    **Arguments**
        infile (Required)
            sequence whose first item is the input BAM filename.

        barcode_lookup (Required)
            mapping <readid>: <barcode>.

        outfile (Required)
            filename of the BAM file to write.

        barcode_corrector (Required)
            not used by this function.

        logger (Required)
            object with an info() method.

    **Returns**
        None
    """
    inbam = pysam.AlignmentFile(infile[0], 'rb')
    outbam = pysam.AlignmentFile(outfile, 'wb', template=inbam)

    total_reads = 0
    not_paired = 0 # unpaired ATAC
    no_matching_barcode = 0 # No matching read:barcode pair
    matched = 0 # reads that were given a barcode
    pairs_too_far_apart = 0

    # query name -> first read seen for that name, waiting for its mate
    quick_lookup = {}

    try:
        for read in inbam:
            total_reads += 1
            if total_reads % 10000000 == 0:
                logger.info('Processed: {:,} reads'.format(total_reads))

            if not read.is_paired:
                not_paired += 1
                continue

            # NOTE(review): this tests the aligned length of the read itself; if the
            # intent is the distance between mates, template_length may be what is
            # wanted -- confirm before changing.
            if read.query_alignment_length > 1000:
                pairs_too_far_apart += 1
                continue

            # Add the barcode:
            # See if the read is in the lookup:
            if read.query_name in barcode_lookup:
                # NOTE(review): presumably set_tags() replaces the read's existing tags,
                # and 'CR:Z' is not a plain two-letter tag name -- verify against pysam.
                read.set_tags([('CR:Z', barcode_lookup[read.query_name]),])
                matched += 1
            else:
                no_matching_barcode += 1
                continue

            # The BAM file is not guaranteed to be in order, but the pairs should be pretty close,
            # so I just need to check for the other pair on a simple lookup list
            # and only write out the pairs once I got two
            if read.query_name in quick_lookup: # I found its pair
                outbam.write(read)
                outbam.write(quick_lookup[read.query_name])
                del quick_lookup[read.query_name]
            else:
                # no pair, store it for later
                quick_lookup[read.query_name] = read
    finally:
        # Close both files even if a read could not be processed
        inbam.close()
        outbam.close()

    # Guard the percentage against an empty BAM
    percent_matched = (matched / total_reads * 100.0) if total_reads else 0.0

    logger.info('Processed {:,} reads from the BAM'.format(total_reads))
    logger.info('{:,} reads were unpaired'.format(not_paired))
    logger.info('{:,} read pairs were too far apart'.format(pairs_too_far_apart))
    logger.info('Matched {0:,} ({1:.1f}%) reads to a barcode'.format(matched, percent_matched))
    logger.info('{:,} reads were not written as no mate was found'.format(len(quick_lookup)))
    logger.info('Save BAM ouput file: {0}'.format(outfile))
    return
294 | 


--------------------------------------------------------------------------------
/setup.py:
--------------------------------------------------------------------------------
 1 | from setuptools import setup
 2 | import glob,os
 3 | 
 4 | def readme():
 5 |       with open('README.md',encoding="utf-8") as f:
 6 |         return f.read()
 7 | 
 8 | setup(name='scTE',
 9 |         version='1.0',
10 |         description='Tool for estimating differential enrichment of Transposable Elements and other highly repetitive regions in single-cell data',
11 |         long_description=readme(),
12 |         classifiers=[
13 |         'Programming Language :: Python :: 3',
14 |         'Programming Language :: Python :: 3.6',
15 |         ],
16 |         python_requires=">=3.6",
17 |         keywords='..',
18 |         url='..',
19 |         author='..',
20 |         author_email='he_jiangping@grmh-gdl.cn; andrewh@sustech.edu.cn',
21 |         license='..',
22 |         packages=[
23 |           'scTE',
24 |           'scTE.miniglbase',
25 |         ],
26 |         platforms=[
27 |           'Linux',
28 |           'MacOS'
29 |         ],
30 |         install_requires=[
31 |           'argparse','scipy','pandas',
32 |           'numpy','anndata',
33 |         ],
34 |         include_package_data=True,
35 |         zip_safe=False,
36 |         scripts=[
37 |           'bin/scTE',
38 |           'bin/scTE_build',
39 |           'bin/scTEATAC_build',
40 |           'bin/scTEATAC',
41 |         ]
42 |         )
43 | 


--------------------------------------------------------------------------------
/test.sh:
--------------------------------------------------------------------------------
#!/bin/sh
# Smoke test: build the exclusive-mode index from the bundled annotation,
# then run scTE on the bundled BAM with that index.

# Stop at the first failing command (and on unset variables). Without this a
# failed index build would go unnoticed and scTE would run against whatever
# Data/test.exclusive.idx already exists.
set -eu

scTE_build -g mm10 -te Data/TE.bed -gene Data/Gene.gtf -o Data/test -m exclusive

scTE -i Data/test.bam -p 12 --min_genes 1 -o out --genome mm10 -x  Data/test.exclusive.idx

# Alternative index mode, disabled:
#scTE_build -g mm10 -te Data/TE.bed -gene Data/Gene.gtf -o Data/test -m nointron
7 | 
8 | 


--------------------------------------------------------------------------------