├── Instruction_for_old_versions
│   ├── Install_by_miniconda.md
│   ├── Instruction_of_example_projects.md
│   ├── README.md
│   ├── TSNE_by_APEC.jpg
│   ├── cell_quality.jpg
│   ├── louvain_cluster_by_APEC.jpg
│   ├── mergeAll.RefSeqTSS.jpg
│   └── motif_FOS_on_TSNE_by_APEC.jpg
├── License.txt
├── README.md
├── Updates.md
├── code_v1.0.5
│   ├── APEC_prepare_steps.sh
│   ├── Bias_corrected_deviation.py
│   ├── Bias_corrected_deviation.pyc
│   ├── cluster_byAccesson.py
│   ├── cluster_byMotif.py
│   ├── cluster_comparison.py
│   ├── generate_UCSCtrack.py
│   ├── generate_differential_Accesson.py
│   ├── generate_differential_markers.py
│   ├── generate_markers_on_plots.py
│   ├── generate_superEnhancer.py
│   ├── generate_trajectory.py
│   ├── prepare_countMatrix.py
│   ├── prepare_geneScore.py
│   ├── prepare_mapping.py
│   ├── prepare_peakCalling.py
│   ├── prepare_premappedMatrix.py
│   ├── prepare_qualityControl.py
│   ├── prepare_trimming.py
│   ├── run_monocle.R
│   ├── subroutines.py
│   └── subroutines.pyc
├── code_v1.0.6
│   ├── APEC_prepare_steps.sh
│   ├── Bias_corrected_deviation.py
│   ├── cluster_byAccesson.py
│   ├── cluster_byMotif.py
│   ├── cluster_comparison.py
│   ├── generate_UCSCtrack.py
│   ├── generate_differential_Accesson.py
│   ├── generate_differential_markers.py
│   ├── generate_gene_score_by_accesson.py
│   ├── generate_markers_on_plots.py
│   ├── generate_superEnhancer.py
│   ├── generate_trajectory.py
│   ├── generate_umap.py
│   ├── prepare_countMatrix.py
│   ├── prepare_geneScore.py
│   ├── prepare_mapping.py
│   ├── prepare_peakCalling.py
│   ├── prepare_premappedMatrix.py
│   ├── prepare_qualityControl.py
│   ├── prepare_trimming.py
│   └── subroutines.py
├── code_v1.1.0
│   ├── APEC_prepare_steps.sh
│   ├── generate_UCSCtrack.py
│   ├── prepare_countMatrix.py
│   ├── prepare_mapping.py
│   ├── prepare_peakCalling.py
│   ├── prepare_qualityControl.py
│   ├── prepare_trimming.py
│   ├── subroutines.py
│   └── subroutines.pyc
├── code_v1.2
│   ├── APEC_prepare_steps.sh
│   ├── generate_UCSCtrack.py
│   ├── prepare_countMatrix.py
│   ├── prepare_mapping.py
│   ├── prepare_peakCalling.py
│   ├── prepare_qualityControl.py
│   ├── prepare_trimming.py
│   ├── subroutines.py
│   └── subroutines.pyc
├── examples
│   ├── .gitattributes
│   ├── README.md
│   ├── project01.tar.gz
│   ├── project02.tar.gz
│   ├── project03.tar.gz
│   ├── script_for_project01.py
│   ├── script_for_project02.py
│   ├── script_for_project03.py
│   ├── script_python_for_Figure_2c_2d.py
│   └── script_python_for_Figure_2f_2g.py
├── images
│   ├── TSNE_by_APEC_with_cluster_label.jpg
│   ├── TSNE_by_APEC_with_notes_label.jpg
│   ├── motif_GATA1_on_trajectory_by_APEC.jpg
│   ├── pseudotime_trajectory_with_notes_label.jpg
│   └── workflow.jpg
└── reference
    └── README.md

/Instruction_for_old_versions/Install_by_miniconda.md:
--------------------------------------------------------------------------------
1 | 
2 | #### download miniconda
3 | 
4 | wget https://repo.anaconda.com/miniconda/Miniconda2-latest-Linux-x86_64.sh
5 | 
6 | #### install miniconda
7 | 
8 | bash Miniconda2-latest-Linux-x86_64.sh
9 | 
10 | #### set channels; they must be added in the following order
11 | 
12 | conda config --add channels bioconda
13 | conda config --add channels r
14 | conda config --add channels defaults
15 | conda config --add channels conda-forge
16 | 
17 | #### set a new environment for APEC, i.e. apec_env
18 | 
19 | conda create -n apec_env python=2.7
20 | 
21 | #### install python packages
22 | 
23 | conda install -n apec_env bowtie2 samtools bedtools macs2 meme=4.11.2 ucsc-bedgraphtobigwig homer
24 | 
25 | conda install -n apec_env numpy scipy=1.0.0 pandas numba pysam matplotlib seaborn setuptools networkx python-louvain=0.11
26 | conda install -n apec_env python-Levenshtein scikit-learn=0.20.0 multicore-tsne umap-learn rpy2=2.8.6
27 | conda install -n apec_env -c auto multiprocessing
28 | 
29 | conda install -n apec_env bioconductor-monocle=2.4.0 (It's not recommended to install monocle with conda)
30 | conda install -n apec_env libiconv r-cluster r-stringr=1.2.0 (required if you use conda to install monocle)
31 | 
32 | **Note**: We found some problems with the R environment installed by conda, so we recommend that users do not use conda to install the R environment and Monocle.
33 | 
34 | **Note**: We also found some problems when installing non-python software via conda/bioconda, so we recommend that users do not use conda/bioconda to install bowtie2, samtools, bedtools, macs2, meme (4.11.2) and bedGraphToBigWig.
35 | 
36 | #### activate apec_env
37 | 
38 | conda activate apec_env
39 | 
40 | #### install genome reference for Homer
41 | 
42 | perl /path-to-homer/configureHomer.pl -install hg19
43 | perl /path-to-homer/configureHomer.pl -install mm10
44 | 
--------------------------------------------------------------------------------
/Instruction_for_old_versions/TSNE_by_APEC.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/QuKunLab/APEC/e15d6c42e05b2deb9c6b89f678905c6ef9878e2b/Instruction_for_old_versions/TSNE_by_APEC.jpg
--------------------------------------------------------------------------------
/Instruction_for_old_versions/cell_quality.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/QuKunLab/APEC/e15d6c42e05b2deb9c6b89f678905c6ef9878e2b/Instruction_for_old_versions/cell_quality.jpg
--------------------------------------------------------------------------------
/Instruction_for_old_versions/louvain_cluster_by_APEC.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/QuKunLab/APEC/e15d6c42e05b2deb9c6b89f678905c6ef9878e2b/Instruction_for_old_versions/louvain_cluster_by_APEC.jpg
--------------------------------------------------------------------------------
/Instruction_for_old_versions/mergeAll.RefSeqTSS.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/QuKunLab/APEC/e15d6c42e05b2deb9c6b89f678905c6ef9878e2b/Instruction_for_old_versions/mergeAll.RefSeqTSS.jpg
--------------------------------------------------------------------------------
/Instruction_for_old_versions/motif_FOS_on_TSNE_by_APEC.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/QuKunLab/APEC/e15d6c42e05b2deb9c6b89f678905c6ef9878e2b/Instruction_for_old_versions/motif_FOS_on_TSNE_by_APEC.jpg
--------------------------------------------------------------------------------
/License.txt:
--------------------------------------------------------------------------------
1 | Copyright 2019-2020 Qu Lab at USTC
2 | 
3 | Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met:
4 | 
5 | 1. Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer.
6 | 
7 | 2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution.
8 | 
9 | THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
--------------------------------------------------------------------------------
/Updates.md:
--------------------------------------------------------------------------------
1 | **updates on 2020-03-25**
2 | 
3 | APEC on pypi updated to 1.2.2.
4 | 
5 | **updates on 2020-03-25**
6 | 
7 | APEC on pypi updated to 1.2.0.
8 | 
9 | **updates on 2019-11-18**
10 | 
11 | APEC on pypi updated to 1.1.0.11, debugged algorithm for generate.differential_feature().
12 | 
13 | **updates on 2019-11-08**
14 | 
15 | APEC on pypi updated to 1.1.0.10.
16 | APEC uses p-value to filter potential super enhancers.
17 | 
18 | **updates on 2019-11-03**
19 | 
20 | APEC on pypi updated to 1.1.0.9.
21 | APEC doesn't filter accessons before clustering.
22 | 
23 | **Updates on 2019-07-17**
24 | 
25 | APEC on pypi updated to 1.1.0.8, optimized algorithm for cell-cell correlation heatmap.
26 | 
27 | **Updates on 2019-07-02**
28 | 
29 | APEC on pypi updated to 1.1.0.7, optimized algorithm for trajectory construction and differential gene search.
30 | 
31 | **Updates on 2019-06-23**
32 | 
33 | APEC on pypi updated to 1.1.0.6, debugged algorithm for clustering.cluster_comparison().
34 | 
35 | **Updates on 2019-06-19**
36 | 
37 | APEC on pypi updated to 1.1.0.5, debugged algorithm for plot.correlation().
38 | 
39 | **Updates on 2019-06-17**
40 | 
41 | Fixed several bugs, APEC on pypi updated to 1.1.0.4.
42 | 
43 | **Updates on 2019-06-16**
44 | 
45 | Fixed several bugs, APEC on pypi updated to 1.1.0.3.
46 | 
47 | **Updates on 2019-06-14**
48 | 
49 | APEC updated to v1.1.0: all important parts (cell clustering, trajectory construction, feature analysis, etc.) are packaged and uploaded to PyPI. Users can install APEC with "pip install APEC". If users already have the fragment count matrix (for example, the CellRanger result of 10X data), please use the APEC functions directly in IPython, Jupyter Notebook or a Python script. If users want to get the fragment count matrix from raw fastq files, please run "bash APEC_prepare_steps.sh" with proper parameters.
50 | 
51 | **Updates on 2019-06-12**
52 | 
53 | Using a new algorithm to estimate gene scores from relevant accessons.
54 | 
55 | **Updates on 2019-06-09**
56 | 
57 | Cluster_byAccesson.py reuses the KNN graph to build the accesson matrix, due to the memory requirement for large datasets.
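A minimal sketch of the idea behind this change, not APEC's exact implementation — the function name `accesson_matrix`, the use of `AgglomerativeClustering`, and the accesson count of 600 are illustrative assumptions. Peaks are grouped into accessons under a KNN-graph connectivity constraint (so only neighbor-pair distances are ever evaluated, which keeps memory bounded), and the reads of each peak group are then summed:

```python
import numpy
from sklearn.neighbors import kneighbors_graph
from sklearn.cluster import AgglomerativeClustering

def accesson_matrix(reads, n_accessons=600, n_neighbors=20):
    # reads: cell-by-peak count matrix (dense numpy array)
    # build a KNN graph of peaks and use it as a connectivity constraint,
    # so the agglomeration never needs the full peak-by-peak distance matrix
    knn_graph = kneighbors_graph(reads.T, n_neighbors=n_neighbors, include_self=False)
    groups = AgglomerativeClustering(n_clusters=n_accessons,
                                     connectivity=knn_graph).fit_predict(reads.T)
    # sum the reads of all peaks assigned to the same accesson
    onehot = numpy.zeros((reads.shape[1], n_accessons))
    onehot[numpy.arange(reads.shape[1]), groups] = 1
    return reads.dot(onehot)    # cell-by-accesson matrix
```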
58 | 
59 | **Updates on 2019-05-30**
60 | 
61 | APEC was updated to version 1.0.6:
62 | The fragment count matrix is now stored in an mtx file instead of a csv file.
63 | When building the accesson matrix, cluster_byAccesson.py no longer uses the KNN graph, which makes the clustering results more stable.
64 | README.md file was updated.
65 | 
66 | **Updates on 2019-05-28**
67 | 
68 | Fixed the problem that cluster_byMotif.py could not be used on a count matrix in mtx format (i.e. filtered_reads.mtx).
69 | Also, in cluster_byMotif.py we adopted MulticoreTSNE instead of sklearn.manifold.TSNE.
70 | 
71 | **Updates on 2019-05-11**
72 | 
73 | Updated generate_differential_markers.py:
74 | If '--motif' is set to 'no', 'project/result/deviation_chromVAR.csv' is not required.
75 | 
76 | **Updates on 2019-04-27**
77 | 
78 | We corrected the error on line 388 of subroutines.py.
79 | 
80 | **Updates on 2019-04-25**
81 | 
82 | We corrected the error on line 123 of cluster_byAccesson.py.
83 | 
84 | **Updates on 2019-04-21**
85 | 
86 | There is a problem with the TSNE package in scikit-learn when analyzing a large number of cells.
87 | Now we use multicore-tsne in cluster_byAccesson.py to analyze datasets containing more than 10,000 cells.
88 | 
--------------------------------------------------------------------------------
/code_v1.0.5/APEC_prepare_steps.sh:
--------------------------------------------------------------------------------
1 | #!/usr/bin/bash
2 | #
3 | #### input parameters defined by users #############################################
4 | #
5 | ARGS=`getopt -o hs:g:n:l:p:f: -l help,project:,genome:,np:,logq:,pfrag:,frag: -- "$@"`
6 | if [ $? != 0 ] ; then echo "Terminating..." >&2 ; exit 1 ; fi
7 | eval set -- "$ARGS"
8 | while true ; do
9 | case "$1" in
10 | -h|--help)
11 | echo "
12 | bash APEC_prepare_steps.sh -s project -g genome_index -n nCPUs -l logq -p pfrag -f frag
13 | -s/--project: The project path, which should contain the data folder before running APEC.
14 | -g/--genome: hg19 or mm10.
15 | -n/--np: Number of CPU cores.
16 | -l/--logq: Threshold for the -log(Q-value) of peaks, used to filter peaks.
17 | -p/--pfrag: Threshold of the percentage of fragments in peaks, used to filter cells.
18 | -f/--frag: Threshold of the fragment number of each cell, used to filter cells."
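# An example invocation might look like the following (the threshold values
# here are illustrative only and should be tuned to the dataset):
#   bash APEC_prepare_steps.sh -s ./project01 -g hg19 -n 4 -l 3 -p 0.2 -f 2000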
19 | exit 1 ;; 20 | -s|--project) project="$2" ; shift 2;; 21 | -g|--genome) genome="$2" ; shift 2;; 22 | -n|--np) np="$2" ; shift 2;; 23 | -l|--logq) logq="$2" ; shift 2;; 24 | -p|--pfrag) pfrag="$2" ; shift 2;; 25 | -f|--frag) frag="$2" ; shift 2;; 26 | --) shift; break ;; 27 | *) echo "unknown parameter: {$1}" ; exit 1 ;; 28 | esac 29 | done 30 | # 31 | picard=../reference/picard.jar 32 | ref=$genome 33 | fa="../reference/"$genome"_chr.fa" 34 | index="../reference/"$genome 35 | tss="../reference/"$genome"_refseq_genes_TSS.txt" 36 | if [[ $genome == "hg19" ]]; then 37 | blist=../reference/hg19_blacklist.JDB.bed 38 | elif [[ $genome == "mm10" ]]; then 39 | blist=../reference/mm10_blacklist.BIN.bed 40 | fi 41 | gtf="../reference/"$genome"_RefSeq_genes.gtf" 42 | np=$np 43 | logq=$logq 44 | pfrag=$pfrag 45 | frag=$frag 46 | # 47 | # 48 | # 49 | #### processes to prepare raw data ########### 50 | # 51 | python prepare_trimming.py -s $project --np $np 52 | # 53 | python prepare_mapping.py -s $project --index $index --picard $picard --tss $tss --np $np 54 | # 55 | python prepare_peakCalling.py -s $project --blist $blist --fa $fa --tss $tss --ref $ref --logq $logq 56 | # 57 | python prepare_countMatrix.py -s $project --fa $fa --np $np 58 | # 59 | python prepare_qualityControl.py -s $project --pfrag $pfrag --lib $frag 60 | # 61 | python prepare_geneScore.py -s $project --gtf $gtf 62 | # 63 | # 64 | # 65 | # 66 | -------------------------------------------------------------------------------- /code_v1.0.5/Bias_corrected_deviation.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/python 2 | # 3 | import numpy 4 | import pandas 5 | #import numba 6 | from multiprocessing import Pool 7 | import scipy.sparse 8 | # 9 | # 10 | global GC_bias, peak_reads, nStep, sd 11 | # 12 | # 13 | def mahalanobis_transform(aMatrix): 14 | (nSample, nFeature) = aMatrix.shape 15 | am_sum = aMatrix.sum(axis=0) 16 | ajaMatrix = numpy.zeros((nFeature, nFeature)) 17 | for i in range(0, nFeature): 18 | for j in range(0, nFeature): 19 | ajaMatrix[i,j] = am_sum[i] * am_sum[j] 20 | # print ajaMatrix.sum(), aMatrix.sum() 21 | SaMatrix = numpy.dot(aMatrix.T, aMatrix) / float(nSample) - ajaMatrix / float(nSample**2) 22 | # print SaMatrix.sum(), nSample 23 | value, vector = numpy.linalg.eig(SaMatrix) 24 | value_inverseRoot = numpy.diag(-abs(value)**0.5) 25 | Sa_inverseRoot = numpy.dot(numpy.dot(vector, value_inverseRoot), vector.T) 26 | aMatrix_ave = aMatrix.mean(axis=0) 27 | aMatrix_ave = numpy.array([aMatrix_ave for i in range(0, nSample)]) 28 | zMatrix = numpy.dot(Sa_inverseRoot, (aMatrix.T - aMatrix_ave.T)) 29 | return zMatrix.T 30 | # 31 | # 32 | #@numba.jit() 33 | def single_sampling(par): 34 | GC_bias, peak_reads, nStep, sd, iIter = par[0], par[1], par[2], par[3], par[4] 35 | numpy.random.seed(12345+iIter) 36 | bias_step = (GC_bias.max() - GC_bias.min()) / float(nStep) 37 | read_step = (peak_reads.max() - peak_reads.min()) / float(nStep) 38 | sample = numpy.zeros(len(GC_bias),dtype=numpy.int) 39 | for ibias,bias_i in enumerate(GC_bias): 40 | bias_iIndex = int((bias_i - GC_bias.min()) // bias_step) 41 | read_iIndex = int((peak_reads[ibias] - peak_reads.min()) // read_step) 42 | bias_iIndex = min(nStep-1, max(0, bias_iIndex)) 43 | read_iIndex = min(nStep-1, max(0, read_iIndex)) 44 | peaks_inGrid = numpy.array([]) 45 | ncount = 0 46 | while (len(peaks_inGrid)<=0) & (ncount<1000): 47 | ncount += 1 48 | bias_jIndex = bias_iIndex + int(numpy.rint(numpy.random.randn()*sd)) 49 | 
read_jIndex = read_iIndex + int(numpy.rint(numpy.random.randn()*sd)) 50 | while (bias_jIndex<0)|(bias_jIndex>=nStep): 51 | bias_jIndex = bias_iIndex + int(numpy.rint(numpy.random.randn()*sd)) 52 | while (read_jIndex<0)|(read_jIndex>=nStep): 53 | read_jIndex = read_iIndex + int(numpy.rint(numpy.random.randn()*sd)) 54 | bias_jStart = GC_bias.min() + bias_jIndex * bias_step 55 | read_jStart = peak_reads.min() + read_jIndex * read_step 56 | bias_jBin = numpy.where((bias_jStart<=GC_bias)&(GC_bias 0]) 62 | TFmotif[numpy.where(TFmotif > 0)] = 1 63 | TFnames = [x for i,x in enumerate(TFnames) if TFmotif_origin[i, :].sum() > 0] 64 | return TFmotif, reads, TFnames, cell_names, GC_bias 65 | # 66 | # 67 | def permuted_sampling(options): 68 | peak_reads = numpy.log10(reads.sum(axis=0)+1.0) 69 | ngrid, std = 50, 1 70 | print GC_bias.shape, peak_reads.shape 71 | samples = Bias_corrected_deviation.batch_sampling(GC_bias, peak_reads, ngrid, std, int(options.ns), int(options.np)) 72 | print 'permuted sampling done!' 73 | return samples 74 | # 75 | # 76 | def raw_deviation(options): 77 | expected = Bias_corrected_deviation.expected_matrix(reads, TFmotif, GC_bias) 78 | raw_dev = Bias_corrected_deviation.raw_deviation(TFmotif, reads, expected) 79 | numpy.savetxt(options.s+'/result/raw_deviation.txt', raw_dev) 80 | print 'raw deviation done!' 81 | return expected, raw_dev 82 | # 83 | # 84 | def background_deviation(iIter): 85 | numpy.random.seed(12345+iIter) 86 | (nCell, nPeak) = reads.shape 87 | (nTF, nPeak) = TFmotif.shape 88 | background_dev = numpy.zeros((nTF, nCell)) 89 | B_matrix = numpy.zeros((nPeak, nPeak)) 90 | for iPeak in range(0, nPeak): 91 | B_matrix[iPeak, int(samples[iIter, iPeak])] = 1 92 | background_dev = Bias_corrected_deviation.deviation(TFmotif, B_matrix, reads, expected) 93 | print 'background deviation for sample '+str(iIter+1)+' done!' 94 | return background_dev 95 | # 96 | # 97 | def corrected_deviation(options): 98 | kIterations = numpy.arange(0, int(options.ns), 1, dtype=int) 99 | pool = Pool(int(options.np)) 100 | bg_dev = pool.map(background_deviation, kIterations) 101 | pool.close() 102 | pool.join() 103 | bg_dev = numpy.array(bg_dev) 104 | print 'background deviations done!' 
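    # chromVAR-style bias correction: z-score each TF's raw deviation against
    # the mean and standard deviation of the deviations obtained from the
    # GC/depth-matched background samples, i.e. (raw - bg_mean) / bg_std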
105 | bg_dev_mean = bg_dev.mean(axis=0)
106 | bg_dev_std = bg_dev.std(axis=0)
107 | raw_dev = numpy.loadtxt(options.s+'/result/raw_deviation.txt')
108 | corrected_dev = (raw_dev - bg_dev_mean) / bg_dev_std
109 | dev_df = pandas.DataFrame(corrected_dev, index=TFnames, columns=cell_names)
110 | dev_df.to_csv(options.s+'/result/deviation_chromVAR.csv', sep=',')
111 | return
112 | #
113 | #
114 | def cell_cluster(options):
115 | reads_df = pandas.read_csv(options.s+'/result/deviation_chromVAR.csv', sep=',', index_col=0,
116 | engine='c', na_filter=False, low_memory=False)
117 | matrix = reads_df.T
118 | connect = kneighbors_graph(matrix, n_neighbors=20, include_self=False)
119 | connectivity = 0.5*(connect + connect.T)
120 | if int(options.nc)==0:
121 | n_clust, clusters = subroutines.predict_cluster(matrix, connectivity.todense())
122 | print "predicted number of cell-clusters: ", n_clust
123 | clusters.to_csv(options.s+'/result/louvain_cluster_by_chromVAR.csv', sep='\t')
124 | tsne_result = McTSNE(n_components=2, random_state=int(options.rs)).fit_transform(matrix.values)
125 | subroutines.plot_cluster(options, clusters, n_clust, tsne_result, 'louvain_cluster_by_chromVAR.pdf')
126 | else:
127 | n_clust = int(options.nc)
128 | clusters = subroutines.knn_cluster(options, matrix, n_clust, connectivity, "KNN_cluster_by_chromVAR.csv")
129 | tsne_result = McTSNE(n_components=2, random_state=int(options.rs)).fit_transform(matrix.values)
130 | subroutines.plot_cluster(options, clusters, n_clust, tsne_result, 'KNN_cluster_by_chromVAR.pdf')
131 | #
132 | subroutines.plot_tSNE(options, matrix, tsne_result, "TSNE_by_chromVAR.pdf")
133 | tsne_df = pandas.DataFrame(tsne_result, index=matrix.index, columns=['TSNE1', 'TSNE2'])
134 | tsne_df.to_csv(options.s+'/result/TSNE_by_chromVAR.csv', sep='\t')
135 | #
136 | if options.hc=='yes':
137 | subroutines.hierarchy_cluster(options, reads_df.corr(), n_clust, "cell_cell_correlation_by_chromVAR.png",
138 | "Hierarchical_cluster_by_chromVAR.csv")
139 | return
140 | #
141 | #
142 | #t1=time.time()
143 | TFmotif, reads, TFnames, cell_names, GC_bias = initiation(options)
144 | samples = permuted_sampling(options)
145 | expected, raw_dev = raw_deviation(options)
146 | corrected_deviation(options)
147 | cell_cluster(options)
148 | #print time.time()-t1
149 | #
150 | #
151 | #
152 | #
153 | #
154 | 
--------------------------------------------------------------------------------
/code_v1.0.5/cluster_comparison.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/python
2 | import warnings
3 | warnings.filterwarnings("ignore")
4 | #
5 | import numpy
6 | import pandas
7 | from optparse import OptionParser
8 | import matplotlib
9 | matplotlib.use('Agg')
10 | import matplotlib.pyplot as plt
11 | import scipy.special
12 | #
13 | #
14 | opts = OptionParser()
15 | usage = "Compare clustering method\nusage: %prog --c1 cell_info.csv --c2 KNN_cluster.csv"
16 | opts = OptionParser(usage=usage, version="%prog 1.0.5")
17 | opts.add_option("--c1", help="cell_info.csv in data folder, or cluster_by_XXX.csv in result folder")
18 | opts.add_option("--c2", help="cell cluster file different from c1")
19 | options, arguments = opts.parse_args()
20 | #
21 | #
22 | cluster_df1 = pandas.read_csv(options.c1, sep='\t', index_col=0)
23 | cluster_df2 = pandas.read_csv(options.c2, sep='\t', index_col=0)
24 | if 'notes' in cluster_df1.columns.values: cluster_df1['cluster']=cluster_df1['notes']
25 | if 'notes' in cluster_df2.columns.values: 
cluster_df2['cluster']=cluster_df2['notes'] 26 | clusters1 = list(set(cluster_df1['cluster'].values)) 27 | clusters2 = list(set(cluster_df2['cluster'].values)) 28 | contingency = numpy.zeros((len(clusters1), len(clusters2)), dtype=int) 29 | for i,clust_i in enumerate(clusters1): 30 | index_i = numpy.where(cluster_df1['cluster'].values==clust_i)[0] 31 | cells_i = cluster_df1.index.values[index_i] 32 | for j,clust_j in enumerate(clusters2): 33 | index_j = numpy.where(cluster_df2['cluster'].values==clust_j)[0] 34 | cells_j = cluster_df2.index.values[index_j] 35 | overlap = list(set(cells_i).intersection(set(cells_j))) 36 | contingency[i,j] = len(overlap) 37 | contingency_df = pandas.DataFrame(contingency, index=clusters1, columns=clusters2) 38 | print contingency_df 39 | # 40 | sum_ai = 0 41 | for ai in contingency.sum(axis=1): 42 | sum_ai += scipy.special.binom(ai, 2) 43 | sum_bj = 0 44 | for bj in contingency.sum(axis=0): 45 | sum_bj += scipy.special.binom(bj, 2) 46 | sum_nij = 0 47 | for row in contingency: 48 | for nij in row: 49 | sum_nij += scipy.special.binom(nij, 2) 50 | n_binom = scipy.special.binom(contingency.sum(), 2) 51 | ari = (sum_nij - sum_ai*sum_bj/n_binom) / (0.5*(sum_ai+sum_bj) 52 | - sum_ai*sum_bj/n_binom) 53 | print 'ARI=', ari 54 | # 55 | # 56 | # 57 | # 58 | # 59 | -------------------------------------------------------------------------------- /code_v1.0.5/generate_UCSCtrack.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/python 2 | import warnings 3 | warnings.filterwarnings("ignore") 4 | # 5 | import os 6 | import numpy 7 | import pandas 8 | from optparse import OptionParser 9 | # 10 | # 11 | opts = OptionParser() 12 | usage = "Merge counts of cells to track files\nusage: %prog -s project --cfile cluster.csv --gsize chrom.sizes" 13 | opts = OptionParser(usage=usage, version="%prog 1.0") 14 | opts.add_option("-s", help="The project folder.") 15 | opts.add_option("--cfile", help="cluster.csv file, e.g. 
louvain_cluster_by_Accesson.csv in folder") 16 | opts.add_option("--gsize", default='../reference/hg19.chrom.sizes', help="chrom.size files, default=../reference/hg19.chrom.sizes") 17 | options, arguments = opts.parse_args() 18 | # 19 | # 20 | def merge_bam(options): 21 | bam_folder = [x for x in os.listdir(options.s+'/work')] 22 | bam_folder.sort() 23 | cell_df = pandas.read_csv(options.cfile, sep='\t', index_col=0) 24 | if 'cell_info' in options.cfile: cell_df['cluster'] = cell_df['notes'] 25 | cell_types = cell_df['cluster'].values 26 | cell_types = list(set(cell_types)) 27 | for cell_type in cell_types: 28 | marked_bam, select = [], [] 29 | for folder in bam_folder: 30 | path = options.s + '/work/' + folder + '/' 31 | if folder in cell_df.index.values: 32 | if cell_df.ix[folder, 'cluster']==cell_type: 33 | select.append(folder) 34 | marked_bam.extend([path + x for x in os.listdir(path) if x[-10:]=='marked.bam']) 35 | # print select 36 | marked_bam = ' '.join(marked_bam) 37 | merged_bam = options.s + '/result/track/' + str(cell_type) + '.bam' 38 | os.popen('samtools merge -f ' + merged_bam + ' ' + marked_bam) 39 | os.popen('samtools index ' + merged_bam) 40 | return 41 | # 42 | # 43 | def bam2bw(options): 44 | cell_df = pandas.read_csv(options.cfile, sep='\t', index_col=0) 45 | if 'cell_info' in options.cfile: cell_df['cluster'] = cell_df['notes'] 46 | cell_types = cell_df['cluster'].values 47 | cell_types = list(set(cell_types)) 48 | for cell_type in cell_types: 49 | name = options.s+'/result/track/'+str(cell_type) 50 | cells = cell_df.loc[cell_df['cluster']==cell_type] 51 | cells = cells.index.values 52 | os.popen('bedtools genomecov -bg -ibam '+name+'.bam -g '+options.gsize+' > '+name+'.bedgraph') 53 | os.popen('bedtools sort -i '+name+'.bedgraph > '+name+'.sorted.bedgraph') 54 | counts = numpy.array([int(x.split()[3]) for x in open(name+'.sorted.bedgraph').readlines()]) 55 | total = counts.sum() 56 | with open(name+'.sorted.bedgraph') as infile, open(name+'.norm.bedgraph', 'w') as outfile: 57 | for line in infile: 58 | words = line.split() 59 | # words[3] = str(round(float(words[3]) * 1.0e7 / total)) 60 | words[3] = str(round(float(words[3]) * 100.0 / len(cells))) 61 | print >> outfile, '\t'.join(words) 62 | os.popen('bedGraphToBigWig '+name+'.norm.bedgraph '+options.gsize+' '+name+'.bw') 63 | return 64 | # 65 | # 66 | # 67 | os.popen('mkdir ' + options.s + '/result/track') 68 | merge_bam(options) 69 | bam2bw(options) 70 | # 71 | # 72 | # 73 | -------------------------------------------------------------------------------- /code_v1.0.5/generate_differential_Accesson.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/python 2 | import warnings 3 | warnings.filterwarnings("ignore") 4 | # 5 | import numpy 6 | import pandas 7 | from optparse import OptionParser 8 | from scipy import stats 9 | import os 10 | import sys 11 | from multiprocessing import Pool 12 | import subroutines 13 | # 14 | # 15 | opts = OptionParser() 16 | usage = "Enriched accessons of a cluster (batch) \nusage: %prog -s project --cfile cluster.csv --cluster 1 --vs 2,3" 17 | opts = OptionParser(usage=usage, version="%prog 1.0") 18 | opts.add_option("-s", help="The project folder.") 19 | opts.add_option("--cfile", help="cluster.csv file of a clustering method, e.g. 
louvain_cluster_by_Accesson.csv in result folder") 20 | opts.add_option("--cluster", help="the cluster(s) for specific-TF analysis, can be {0, 1, ..., nCluster}, or a batch of clusters like 0,2,3") 21 | opts.add_option("--vs", default='all', help="vs which cluster(s) to search specific TF for target cluster(s), e.g. 1,4,2, default=all") 22 | opts.add_option("--pvalue", default=0.001, help='P-value threshold for specific peaks, default=0.001') 23 | opts.add_option("--fold", default=2, help='Fold change cutoff of specific peaks, default=2') 24 | options, arguments = opts.parse_args() 25 | # 26 | # 27 | subname = '_'.join(options.cluster.split(',')) 28 | if options.vs!='all': 29 | subname += '_VS_' + '_'.join(options.vs.split(',')) 30 | subroutines.specific_accesson(options, subname) 31 | # 32 | # -------------------------------------------------------------------------------- /code_v1.0.5/generate_differential_markers.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/python 2 | import warnings 3 | warnings.filterwarnings("ignore") 4 | # 5 | import numpy 6 | import pandas 7 | from optparse import OptionParser 8 | from scipy import stats 9 | import os 10 | import sys 11 | from multiprocessing import Pool 12 | import subroutines 13 | # 14 | # 15 | opts = OptionParser() 16 | usage = "Enriched motifs/genes/peaks of a cluster (batch)\nusage: %prog -s project --cfile cluster.csv --cluster 1 --vs 2,3" 17 | opts = OptionParser(usage=usage, version="%prog 1.0") 18 | opts.add_option("-s", help="The project folder.") 19 | opts.add_option("--cfile", help="cluster.csv file of a clustering method, e.g. louvain_cluster_by_Accesson.csv in result folder") 20 | opts.add_option("--cluster", help="The cluster for differential markers analysis, can be {0, 1, ..., nCluster}, or a batch of clusters like 0,2,3") 21 | opts.add_option("--vs", default='all', help="vs which clusters to search differential markers for target clusters, e.g. 
1,4,2, default=all") 22 | opts.add_option("--pvalue", default=0.001, help='P-value threshold for differential markers, default=0.001') 23 | opts.add_option("--fold", default=2, help='Fold change cutoff of differential markers, default=2') 24 | opts.add_option("--motif", default='no', help='Whether to search differential motifs for target cluster, default=no.') 25 | opts.add_option("--gene", default='no', help='Whether to search differential genes for target cluster, default=no.') 26 | options, arguments = opts.parse_args() 27 | # 28 | # 29 | def group_cells(options): 30 | # deviation = pandas.read_csv(options.s+'/result/deviation_chromVAR.csv', sep=',', index_col=0, 31 | # engine='c', na_filter=False, low_memory=False) 32 | cells_df = pandas.read_csv(options.s+'/matrix/filtered_cells.csv', sep='\t', index_col=0, 33 | engine='c', na_filter=False, low_memory=False) 34 | cluster_df = pandas.read_csv(options.cfile, sep='\t', index_col=0) 35 | kCluster = options.cluster.split(',') 36 | if options.vs!='all': vsCluster = options.vs.split(',') 37 | if 'cluster' not in cluster_df.columns.values: 38 | cluster_df['cluster'] = cluster_df['notes'] 39 | else: 40 | kCluster = map(int, kCluster) 41 | if options.vs!='all': vsCluster = map(int, vsCluster) 42 | if options.vs=='all': vsCluster = list(set(cluster_df['cluster'].values)-set(kCluster)) 43 | cluster_df = cluster_df.loc[cells_df.index.values] 44 | cell_inCluster = cluster_df.loc[cluster_df['cluster'].isin(kCluster)].index.values 45 | cell_outCluster = cluster_df.loc[cluster_df['cluster'].isin(vsCluster)].index.values 46 | print len(cell_inCluster), len(cell_outCluster) 47 | return cell_inCluster, cell_outCluster 48 | # 49 | # 50 | def get_diff_motifs(options, subname, cell_inCluster, cell_outCluster): 51 | deviation = pandas.read_csv(options.s+'/result/deviation_chromVAR.csv', sep=',', index_col=0, 52 | engine='c', na_filter=False, low_memory=False) 53 | dev_in, dev_out = deviation[cell_inCluster].values, deviation[cell_outCluster].values 54 | mean_in, mean_out = dev_in.mean(axis=1), dev_out.mean(axis=1) 55 | delta = mean_in - mean_out 56 | ttest, pvalues = stats.ttest_ind(dev_in.T, dev_out.T, equal_var=False) 57 | matrix = numpy.array([mean_in, mean_out, delta, pvalues]).T 58 | columns = ['dev_inCluster', 'dev_outCluster', 'd_dev', 'P-value'] 59 | compare_df = pandas.DataFrame(matrix, index=deviation.index, columns=columns) 60 | compare_df = compare_df.loc[compare_df['d_dev']>=1] 61 | compare_df = compare_df.loc[compare_df['P-value']<=float(options.pvalue)] 62 | compare_df = compare_df.sort_values(by=['P-value']) 63 | compare_df.to_csv(options.s+'/result/motifs_of_cluster_'+subname+'.csv', sep='\t') 64 | return 65 | # 66 | # 67 | def get_diff_genes(options, subname, cell_inCluster, cell_outCluster): 68 | expr = pandas.read_csv(options.s+'/matrix/genes_scored_by_peaks.csv', sep=',', index_col=0, 69 | engine='c', na_filter=False, low_memory=False) 70 | dev_in, dev_out = expr[cell_inCluster].values, expr[cell_outCluster].values 71 | mean_in, mean_out = dev_in.mean(axis=1), dev_out.mean(axis=1) 72 | delta = (mean_in + 1e-4) / (mean_out + 1e-4) 73 | ttest, pvalues = stats.ttest_ind(dev_in.T, dev_out.T, equal_var=False) 74 | matrix = numpy.array([mean_in, mean_out, delta, pvalues]).T 75 | columns = ['expr_inCluster', 'expr_outCluster', 'fold', 'P-value'] 76 | compare_df = pandas.DataFrame(matrix, index=expr.index, columns=columns) 77 | compare_df = compare_df.loc[compare_df['fold']>=float(options.fold)] 78 | compare_df = 
compare_df.loc[compare_df['P-value']<=float(options.pvalue)] 79 | compare_df = compare_df.sort_values(by=['P-value']) 80 | compare_df.to_csv(options.s+'/result/genes_of_cluster_'+subname+'.csv', sep='\t') 81 | return 82 | # 83 | # 84 | subname = '_'.join(options.cluster.split(',')) 85 | if options.vs!='all': 86 | subname += '_VS_' + '_'.join(options.vs.split(',')) 87 | subroutines.specific_peak(options, subname) 88 | cells_in, cells_out = group_cells(options) 89 | if options.motif=='yes': 90 | get_diff_motifs(options, subname, cells_in, cells_out) 91 | if options.gene=='yes': 92 | get_diff_genes(options, subname, cells_in, cells_out) 93 | # 94 | # 95 | # 96 | # 97 | -------------------------------------------------------------------------------- /code_v1.0.5/generate_markers_on_plots.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/python 2 | import warnings 3 | warnings.filterwarnings("ignore") 4 | # 5 | import numpy 6 | import pandas 7 | from optparse import OptionParser 8 | import matplotlib 9 | matplotlib.use('Agg') 10 | import matplotlib.pyplot as plt 11 | from mpl_toolkits.mplot3d import Axes3D 12 | import sys 13 | import scipy.stats 14 | # 15 | # 16 | opts = OptionParser() 17 | usage = "Render marker on tSNE or trajectory plot\nusage: %prog -s project --cfile TSNE_by_Accesson.csv --type motif --name RELA" 18 | opts = OptionParser(usage=usage, version="%prog 1.0") 19 | opts.add_option("-s", help="The project folder.") 20 | opts.add_option("--cfile", help="TSNE_by_Accesson.csv or monocle_reduced_dimension.csv file to use for rendering") 21 | opts.add_option("--type", default='motif', help="type of marker to plot, can be motif, gene, or accesson") 22 | opts.add_option("--name", help="name of marker to plot") 23 | opts.add_option("--angle", default='30,30', help='Angles to rotate the 3D trajectory, default=30,30') 24 | opts.add_option("--sharp", default='0', help='Cutoff range for deviation or expression, default=0, i.e. no sharpening') 25 | options, arguments = opts.parse_args() 26 | # 27 | # 28 | def draw_marker(options): 29 | if 'monocle_reduced' in options.cfile: 30 | tsne_df = pandas.read_csv(options.cfile, sep=',', index_col=0).T 31 | else: 32 | tsne_df = pandas.read_csv(options.cfile, sep='\t', index_col=0) 33 | tsne, cells = tsne_df.values, tsne_df.index.values 34 | if options.type=='motif': 35 | reads_df = pandas.read_csv(options.s+'/result/deviation_chromVAR.csv', sep=',', index_col=0, 36 | engine='c', na_filter=False, low_memory=False).T 37 | motifs = [] 38 | for tf in reads_df.columns.values: 39 | names = tf.split('-')[:-1] 40 | if options.name in names: motifs.append(tf) 41 | print motifs 42 | if len(motifs)==0: 43 | print "No corresponding marker!" 44 | sys.exit() 45 | else: 46 | reads = reads_df.ix[cells, motifs].values.sum(axis=1) 47 | elif options.type=='gene': 48 | reads_df = pandas.read_csv(options.s+'/matrix/genes_scored_by_peaks.csv', sep=',', index_col=0, 49 | engine='c', na_filter=False, low_memory=False).T 50 | gene = list(set([options.name]).intersection(set(reads_df.columns.values))) 51 | if len(gene)==0: 52 | print "No corresponding marker!" 
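            # descriptive note: bail out if the requested gene is absent from
            # the gene score matrix; otherwise the matching column(s) are
            # summed below to give one score per cell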
53 | sys.exit() 54 | else: 55 | reads = reads_df.loc[cells, gene].values.sum(axis=1) 56 | elif options.type=='accesson': 57 | reads_df = pandas.read_csv(options.s+'/matrix/Accesson_reads.csv', sep=',', index_col=0, 58 | engine='c', na_filter=False, low_memory=False) 59 | normal = numpy.array([x/x.sum()*10000 for x in reads_df.values]) 60 | reads_df = pandas.DataFrame(normal, index=reads_df.index, columns=reads_df.columns) 61 | reads = reads_df.loc[cells, options.name].values 62 | order = numpy.argsort(reads) 63 | if str(options.sharp)!='0': 64 | up, down = int(options.sharp.split(',')[-1]), int(options.sharp.split(',')[0]) 65 | reads = numpy.clip(reads, down, up) 66 | # 67 | if 'monocle_reduced' in options.cfile: 68 | clist = ['blue', 'silver', 'red'] 69 | cmap = matplotlib.colors.LinearSegmentedColormap.from_list('mylist', clist, N=256) 70 | fig1 = plt.figure(1, figsize=(12,10)) 71 | ax = fig1.add_subplot(111, projection='3d') 72 | im = ax.scatter(tsne[order,0], tsne[order,1], tsne[order,2], cmap=cmap, c=reads[order], edgecolors='none', s=10) 73 | beta1, beta2 = int(options.angle.split(',')[0]), int(options.angle.split(',')[1]) 74 | ax.view_init(beta1, beta2) 75 | cbar = plt.colorbar(im, shrink=0.15, ticks=[reads.min(), reads.max()], aspect=8) 76 | cbar.ax.set_yticklabels([round(reads.min(),2), round(reads.max(),2)]) 77 | width, height, rad = tsne[:,0].max()-tsne[:,0].min(), tsne[:,1].max()-tsne[:,1].min(), tsne[:,2].max()-tsne[:,2].min() 78 | ax.set_xlim((tsne[:,0].min()-0.01*width, tsne[:,0].max()+0.01*width)) 79 | ax.set_ylim((tsne[:,1].min()-0.01*height, tsne[:,1].max()+0.01*height)) 80 | ax.set_zlim((tsne[:,2].min()-0.01*rad, tsne[:,2].max()+0.01*rad)) 81 | ax.set_zticks([]) 82 | else: 83 | fig1 = plt.figure(1, figsize=(6,5)) 84 | ax = fig1.add_subplot(111) 85 | im = ax.scatter(tsne[order,0], tsne[order,1], cmap='Spectral_r', c=reads[order], edgecolors='none', s=20) 86 | cbar = plt.colorbar(im, shrink=0.15, ticks=[reads.min(), reads.max()], aspect=8) 87 | cbar.ax.set_yticklabels([round(reads.min(),2), round(reads.max(),2)]) 88 | # width, height = tsne[:,0].max()-tsne[:,0].min(), tsne[:,1].max()-tsne[:,1].min() 89 | # ax.set_xlim((tsne[:,0].min()-0.01*width, tsne[:,0].max()+0.01*width)) 90 | # ax.set_ylim((tsne[:,1].min()-0.01*height, tsne[:,1].max()+0.01*height)) 91 | ax.grid(False) 92 | ax.set_xticks([]) 93 | ax.set_yticks([]) 94 | ax.set_title(options.name) 95 | fig1.savefig(options.s+'/figure/'+options.type+'_'+options.name+'_on_'+options.cfile.split('/')[-1].split('.')[0]+'.pdf', 96 | bbox_inches='tight') 97 | return 98 | # 99 | # 100 | draw_marker(options) 101 | # 102 | # 103 | # 104 | # 105 | -------------------------------------------------------------------------------- /code_v1.0.5/generate_superEnhancer.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/python 2 | import warnings 3 | warnings.filterwarnings("ignore") 4 | # 5 | import numpy 6 | import pandas 7 | from optparse import OptionParser 8 | import matplotlib 9 | matplotlib.use('Agg') 10 | import matplotlib.pyplot as plt 11 | import sys 12 | import scipy.stats 13 | # 14 | # 15 | opts = OptionParser() 16 | usage = "Search super enhancers\nusage: %prog -s project" 17 | opts = OptionParser(usage=usage, version="%prog 1.0") 18 | opts.add_option("-s", help="The project folder.") 19 | options, arguments = opts.parse_args() 20 | # 21 | def search_supper(options): 22 | supper_range = 1e6 23 | peaks = numpy.array([x.split()[:3] for x in 
open(options.s+'/peak/top_peaks.bed').readlines()]) 24 | chroms = list(set(peaks[:, 0])) 25 | chroms.sort() 26 | accesson_df = pandas.DataFrame.from_csv(options.s+'/matrix/Accesson_peaks.csv', sep='\t') 27 | accessons = list(set(accesson_df['group'].values)) 28 | all_suppers, all_locate, all_base = [], [], [] 29 | for access in accessons: 30 | peaks_group = accesson_df.loc[accesson_df['group']==access].index.values 31 | peaks_index = [int(x[4:]) for x in peaks_group] 32 | peaks_info = peaks[peaks_index, :] 33 | peaks_dict = {x:[] for x in chroms} 34 | peaks_label = {x:[] for x in chroms} 35 | for ip,pp in enumerate(peaks_info): 36 | peaks_dict[pp[0]].append((int(pp[1])+int(pp[2]))/2) 37 | peaks_label[pp[0]].append(peaks_index[ip]) 38 | peaks_label = {x:numpy.array(peaks_label[x]) for x in peaks_label.keys()} 39 | suppers, locate, base = [], [], [] 40 | for chrom in peaks_dict.keys(): 41 | if len(peaks_dict[chrom])>=2: 42 | position = numpy.array(peaks_dict[chrom]) 43 | supper = [numpy.where(abs(position-x)<=supper_range)[0] for x in position if 44 | len(numpy.where(abs(position-x)<=supper_range)[0])>1] 45 | supper = ['-'.join(map(str, x)) for x in supper] 46 | supper = list(set(supper)) 47 | supper = [list(map(int, x.split('-'))) for x in supper] 48 | supper_peaks = numpy.array(['-'.join(map(str, peaks_label[chrom][x])) for x in supper]) 49 | if len(supper_peaks)>0: 50 | for ii,ss in enumerate(supper_peaks): 51 | peaks_in = map(int, ss.split('-')) 52 | start = peaks[peaks_in, 1:].astype(int).min() 53 | end = peaks[peaks_in, 1:].astype(int).max() 54 | delta = numpy.array([abs(peaks_in[i+1]-x) for i,x in enumerate(peaks_in[:-1])]) 55 | close = numpy.where(delta<=2)[0] 56 | percent = len(close)/float(len(delta)) 57 | if (len(delta)>=2) & (percent>0.5) : 58 | suppers.append(ss) 59 | locate.append(access) 60 | base.append(chrom+':'+str(start)+'-'+str(end)) 61 | all_suppers.extend(suppers) 62 | all_locate.extend(locate) 63 | all_base.extend(base) 64 | supper_df = pandas.DataFrame(numpy.array([all_locate, all_base]).T, index=all_suppers, columns=['peaks', 'position']) 65 | supper_df.to_csv(options.s+'/result/potential_super_enhancer.csv', sep='\t') 66 | return 67 | # 68 | # 69 | # 70 | # 71 | search_supper(options) 72 | # 73 | # 74 | # 75 | # 76 | -------------------------------------------------------------------------------- /code_v1.0.5/prepare_countMatrix.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/python 2 | import warnings 3 | warnings.filterwarnings("ignore") 4 | import os 5 | from optparse import OptionParser 6 | import subroutines 7 | # 8 | opts = OptionParser() 9 | usage = "Align reads to build matrix\nusage: %prog -s project --fa chr.fa --bg bg --meme motif.meme --np 4" 10 | opts = OptionParser(usage=usage, version="%prog 1.0") 11 | opts.add_option("-s", help="The project folder.") 12 | opts.add_option("--fa", default='../reference/hg19_chr.fa', help="Genome fasta file, default=../reference/hg19_chr.fa") 13 | opts.add_option("--bg", default='../reference/tier1_markov1.norc.txt', 14 | help="Background file, default=../reference/tier1_markov1.norc.txt") 15 | opts.add_option("--pvalue", default=0.00005, help="P-value threshold for FIMO, default=0.00005") 16 | opts.add_option("--meme", default='../reference/JASPAR2018_CORE_vertebrates_redundant_pfms_meme.txt', 17 | help="Motif file, default=../reference/JASPAR2018_CORE_vertebrates_redundant_pfms_meme.txt") 18 | opts.add_option("--np", default=1, help="Number of CPU cores used for 
motif searching, default=1") 19 | options, arguments = opts.parse_args() 20 | # 21 | matrix_folder = options.s + '/matrix/' 22 | peaks_folder = options.s + '/peak/' 23 | work_folder = options.s + '/work/' 24 | motif_folder = matrix_folder + '/motif' 25 | peaks_file = peaks_folder + '/top_peaks.bed' 26 | cell_info = options.s + '/data/cell_info.csv' 27 | # 28 | if not os.path.exists(matrix_folder): os.popen('mkdir ' + matrix_folder) 29 | if os.path.exists(motif_folder): os.popen('rm -rf ' + motif_folder) 30 | os.popen('mkdir ' + motif_folder) 31 | # 32 | #### run FIMO for motif-site searching 33 | motifFasta = matrix_folder + '/motif.fasta' 34 | os.popen('bedtools getfasta -fi ' + options.fa + ' -bed ' + peaks_file + ' -fo ' + motifFasta) 35 | subroutines.batch_fimo(options.bg, options.pvalue, options.meme, motifFasta, motif_folder, int(options.np)) 36 | # 37 | #### motif annotation 38 | TFmatrix_file = matrix_folder + '/motif_TF.csv' 39 | subroutines.score_peaks(peaks_file, motif_folder, TFmatrix_file) 40 | # 41 | #### count reads for peaks 42 | bam_file = peaks_folder + "/mergeAll.bam" 43 | reads_matrix = matrix_folder + "/reads.csv" 44 | matrix_df = subroutines.counts_per_peak(bam_file, peaks_file, reads_matrix) 45 | matrix_df.to_csv(reads_matrix, sep=',') 46 | # 47 | # 48 | subroutines.QC_table(cell_info, work_folder, matrix_folder) 49 | # 50 | # 51 | -------------------------------------------------------------------------------- /code_v1.0.5/prepare_geneScore.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/python 2 | import warnings 3 | warnings.filterwarnings("ignore") 4 | import os 5 | import numpy 6 | import pandas 7 | import sys 8 | from optparse import OptionParser 9 | import time 10 | import numba 11 | # 12 | # 13 | opts = OptionParser() 14 | usage = "Evaluate gene score by TSS peaks\nusage: %prog -s project --gtf hg19.gtf --distal 20000" 15 | opts = OptionParser(usage=usage, version="%prog 1.0") 16 | opts.add_option("-s", help="The project folder.") 17 | opts.add_option("--gtf", default='../reference/hg19_RefSeq_genes.gtf', 18 | help="gtf file for genome, default=../reference/hg19_RefSeq_genes.gtf") 19 | opts.add_option("--distal", default=20000, 20 | help="distal region around TSS for peak searching, default=20000") 21 | options, arguments = opts.parse_args() 22 | # 23 | # 24 | def get_tss_region(options): 25 | mm10_df = pandas.read_csv(options.gtf, sep='\t', index_col=0) 26 | genes = list(set(mm10_df['name2'])) 27 | genes.sort() 28 | mm10_df.index = mm10_df['name'] 29 | names, tss = [], [] 30 | for symbol in genes: 31 | sub_df = mm10_df.loc[mm10_df['name2']==symbol] 32 | if len(sub_df.index.values)>=1: 33 | chrom = list(set(sub_df['chrom'].values)) 34 | strand = list(set(sub_df['strand'].values)) 35 | if len(chrom)==1: 36 | if strand[0]=='+': 37 | starts = list(set(map(str, sub_df['txStart'].values))) 38 | start = ','.join(starts) 39 | elif strand[0]=='-': 40 | starts = list(set(map(str, sub_df['txEnd'].values))) 41 | start = ','.join(starts) 42 | names.append(symbol) 43 | tss.append([chrom[0], start]) 44 | tss = numpy.array(tss) 45 | tss_df = pandas.DataFrame(tss, index=names, columns=['chrom', 'tss']) 46 | tss_df.to_csv(options.s+'/peak/genes_tss_region.csv', sep='\t') 47 | return 48 | # 49 | # 50 | def get_tss_peaks(options): 51 | peaks = [[x.split()[0], (int(x.split()[1])+int(x.split()[2]))/2] 52 | for x in open(options.s+'/peak/top_filtered_peaks.bed').readlines()] 53 | peaks_df = pandas.DataFrame(peaks, 
index=['peak'+str(x) for x in xrange(len(peaks))], 54 | columns=['chrom', 'center']) 55 | tss_df = pandas.read_csv(options.s+'/peak/genes_tss_region.csv', sep='\t', index_col=0) 56 | for gene in tss_df.index.values: 57 | chrom, tsses = tss_df.ix[gene, 'chrom'], tss_df.ix[gene, 'tss'] 58 | tsses = map(int, tsses.split(',')) 59 | chr_peaks = peaks_df.loc[peaks_df['chrom']==chrom] 60 | proxim_peaks, distal_peaks = [], [] 61 | for tss in tsses: 62 | peaks1 = chr_peaks.loc[abs(chr_peaks['center']-tss)<=2000].index.values 63 | peaks2 = chr_peaks.loc[abs(chr_peaks['center']-tss)<=int(options.distal)].index.values 64 | proxim_peaks.extend(peaks1) 65 | distal_peaks.extend(peaks2) 66 | proxim_peaks = list(set(proxim_peaks)) 67 | distal_peaks = list(set(distal_peaks)-set(proxim_peaks)) 68 | if len(proxim_peaks)==0: proxim_peaks = ['NONE'] 69 | if len(distal_peaks)==0: distal_peaks = ['NONE'] 70 | proxim_peaks = ';'.join(proxim_peaks) 71 | tss_df.ix[gene, 'proximal'] = proxim_peaks 72 | distal_peaks = ';'.join(distal_peaks) 73 | tss_df.ix[gene, 'distal'] = distal_peaks 74 | tss_df.to_csv(options.s+'/peak/genes_tss_peaks.csv', sep='\t') 75 | return 76 | # 77 | # 78 | def get_score_from_peaks(options): 79 | tss_df = pandas.read_csv(options.s+'/peak/genes_tss_peaks.csv', sep='\t', index_col=0) 80 | reads_df = pandas.read_csv(options.s+'/matrix/filtered_reads.csv', sep=',', index_col=0) 81 | all_peaks = reads_df.columns.values 82 | genes, score = [], [] 83 | for igene,gene in enumerate(tss_df.index.values): 84 | distal = tss_df.loc[gene, 'distal'].split(';') 85 | proximal = tss_df.loc[gene, 'proximal'].split(';') 86 | distal = list(set(distal).union(set(proximal))) 87 | distal = list(set(distal).intersection(set(all_peaks))) 88 | if len(distal)>0: 89 | signal = reads_df[distal].values.mean(axis=1) 90 | genes.append(gene) 91 | score.append(signal) 92 | score = numpy.array(score) 93 | score_df = pandas.DataFrame(score, index=genes, columns=reads_df.index) 94 | score_per_cell = score.sum(axis=0) 95 | R_wave = [numpy.log(x*10000.0/score_per_cell[i]+1) for i,x in enumerate(score.T)] 96 | R_wave = numpy.array(R_wave) 97 | normal_df = pandas.DataFrame(R_wave.T, index=genes, columns=reads_df.index) 98 | normal_df.to_csv(options.s+'/matrix/genes_scored_by_peaks.csv', sep=',') 99 | return 100 | # 101 | # 102 | t1 = time.time() 103 | get_tss_region(options) 104 | #print time.time()-t1 105 | get_tss_peaks(options) 106 | #print time.time()-t1 107 | get_score_from_peaks(options) 108 | print time.time()-t1 109 | # 110 | # 111 | -------------------------------------------------------------------------------- /code_v1.0.5/prepare_mapping.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/python 2 | import warnings 3 | warnings.filterwarnings("ignore") 4 | import os 5 | import sys 6 | from optparse import OptionParser 7 | import subroutines 8 | from multiprocessing import Pool 9 | # 10 | # 11 | opts = OptionParser() 12 | usage = "Map sequencing data\nusage: %prog -s project --index bowtie2-index --picard picard.jar --tss TSS.txt --np 4" 13 | opts = OptionParser(usage=usage, version="%prog 1.0") 14 | opts.add_option("-s", help="The project folder") 15 | opts.add_option("--index", default="../reference/hg19", 16 | help="The reference file path in bowtie2/indexes folder, default=../reference/hg19") 17 | opts.add_option("--picard", default="../reference/picard.jar", 18 | help="The picard.jar file path, default=../reference/picard.jar") 19 | opts.add_option("--tss", 
default="../reference/hg19_refseq_genes_TSS.txt", 20 | help="The TSS file path, can be downloaded from RefSeq, default=../reference/hg19_refseq_genes_TSS.txt") 21 | opts.add_option("--np", default=1, help="Number of CPUs used for mapping, default=1") 22 | options, arguments = opts.parse_args() 23 | # 24 | # 25 | # 26 | def mapping(par): 27 | work_dir, cell, options, input1, input2, chr_list = par[0], par[1], par[2], par[3], par[4], par[5] 28 | sam = work_dir + cell + '.sam' 29 | bam = work_dir + cell + '.bam' 30 | log = work_dir + cell + '.map.log' 31 | sorted_bam = work_dir + cell + '.sorted.bam' 32 | filtered_bam = work_dir + cell + '.filtered.bam' 33 | marked_bam = work_dir + cell + '.marked.bam' 34 | removed_duplicate = work_dir + cell + '.dups.log' 35 | quality_state = work_dir + cell + '.stats.log' 36 | hist_log = work_dir + cell + '.hist.log' 37 | hist_pdf = work_dir + cell + '.hist.pdf' 38 | refSeqTSS = work_dir + cell + '.RefSeqTSS' 39 | print cell, work_dir 40 | # 41 | # os.popen('bowtie2 -X2000 -p ' + options.np + ' --rg-id ' + cell + ' -x ' + options.index 42 | # + ' -1 ' + input1 + ' -2 ' + input2 + ' -S ' + sam + ' 2> ' + log) 43 | os.popen('bowtie2 -X2000 -p 1 --rg-id ' + cell + ' -x ' + options.index 44 | + ' -1 ' + input1 + ' -2 ' + input2 + ' -S ' + sam + ' 2> ' + log) 45 | os.popen('samtools view -bS ' + sam + ' -o ' + bam) 46 | os.popen('rm ' + sam) 47 | os.popen('java -XX:+UseSerialGC -Xmx1g -jar ' + options.picard + ' SortSam SO=coordinate VALIDATION_STRINGENCY=SILENT I=' 48 | + bam + ' O=' + sorted_bam) 49 | os.popen('samtools index ' + sorted_bam) 50 | os.popen('samtools view -b -q 30 ' + sorted_bam + ' -o ' + filtered_bam + ' ' + chr_list) 51 | os.popen('java -XX:+UseSerialGC -Xmx1g -jar ' + options.picard + ' MarkDuplicates INPUT=' + filtered_bam +' OUTPUT=' 52 | + marked_bam + ' METRICS_FILE=' + removed_duplicate 53 | + ' REMOVE_DUPLICATES=true VALIDATION_STRINGENCY=SILENT') 54 | os.popen('samtools index ' + marked_bam) 55 | os.popen('echo -e "Chromosome\tLength\tProperPairs\tBadPairs:Raw" >> ' + quality_state) 56 | os.popen('samtools idxstats ' + sorted_bam + ' >> ' + quality_state) 57 | os.popen('java -XX:+UseSerialGC -Xmx1g -jar ' + options.picard + ' CollectInsertSizeMetrics VALIDATION_STRINGENCY=SILENT I=' 58 | + marked_bam + ' O=' + hist_log + ' H=' + hist_pdf + ' W=1000') 59 | # subroutines.draw_TSS_insert(options.tss, marked_bam, refSeqTSS) 60 | return 61 | # 62 | # 63 | chr_list = 'chr1 chr2 chr3 chr4 chr5 chr6 chr7 chr8 chr9 chr10 chr11 chr12 chr13 chr14 chr15 chr16 chr17 chr18 chr19 chr20 chr21 chr22 chrX chrY' 64 | # 65 | fastq_list = [x for x in os.listdir(options.s+'/data') if x[-11:]=='.trim.fastq'] 66 | fastq_list.sort() 67 | cell_list = [x.split('_')[0] for x in fastq_list] 68 | cell_list = list(set(cell_list)) 69 | cell_list.sort() 70 | if not os.path.exists(options.s+'/work'): os.popen('mkdir ' + options.s + '/work') 71 | # 72 | parameters = [] 73 | for cell in cell_list: 74 | work_dir = options.s + '/work/' + cell + '/' 75 | if os.path.exists(work_dir): os.popen('rm -rf ' + work_dir) 76 | os.popen('mkdir ' + work_dir) 77 | input1, input2 = options.s+'/data/'+cell+'_1.trim.fastq', options.s+'/data/'+cell+'_2.trim.fastq' 78 | par = [work_dir, cell, options, input1, input2, chr_list] 79 | parameters.append(par) 80 | # mapping(par) 81 | # 82 | pool = Pool(int(options.np)) 83 | pool.map(mapping, parameters) 84 | pool.close() 85 | pool.join() 86 | # 87 | # 88 | for cell in cell_list: 89 | work_dir = options.s + '/work/' + cell + '/' 90 | marked_bam 
= work_dir + cell + '.marked.bam' 91 | refSeqTSS = work_dir + cell + '.RefSeqTSS' 92 | subroutines.draw_TSS_insert(options.tss, marked_bam, refSeqTSS) 93 | # 94 | # 95 | -------------------------------------------------------------------------------- /code_v1.0.5/prepare_peakCalling.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/python 2 | # 3 | import warnings 4 | warnings.filterwarnings("ignore") 5 | # 6 | import os 7 | import numpy 8 | import sys 9 | from optparse import OptionParser 10 | import subroutines 11 | # 12 | # 13 | opts = OptionParser() 14 | usage = "Call peaks\nusage: %prog -s project --blist blacklist.bed --fa genome_chr.fa --tss tssFile --logq 3" 15 | opts = OptionParser(usage=usage, version="%prog 1.0") 16 | opts.add_option("-s", help="The project folder.") 17 | opts.add_option("--picard", default="../reference/picard.jar", 18 | help="The picard.jar file path, default=../reference/picard.jar") 19 | opts.add_option("--blist", default='../reference/hg19_blacklist.JDB.bed', 20 | help="Blacklist.bed, default=../reference/hg19_blacklist.JDB.bed") 21 | opts.add_option("--fa", default='../reference/hg19_chr.fa', 22 | help="Genome_chr.fa, default=../reference/hg19_chr.fa") 23 | opts.add_option('--tss', default='../reference/hg19_refseq_genes_TSS.txt', 24 | help='TSS file, default=../reference/hg19_refseq_genes_TSS.txt') 25 | opts.add_option('--ref', default='hg19', help='Name of genome reference, default=hg19') 26 | opts.add_option('--logq', default='3', 27 | help='Threshold of -log(p-value) for top peaks, default=3.') 28 | options, arguments = opts.parse_args() 29 | # 30 | workspace_folder = options.s + '/work/' 31 | peak_folder = options.s + '/peak/' 32 | genome_fasta = options.fa 33 | tssFile = options.tss 34 | os.popen('mkdir ' + peak_folder) 35 | # 36 | # 37 | print '!!!!!! merge all marked bam files !!!!!!' 38 | bam_folder = [x for x in os.listdir(workspace_folder)] 39 | bam_folder.sort() 40 | print 'cells number:', len(bam_folder) 41 | marked_bam = [] 42 | #merged_raw = peak_folder + 'mergeAll.raw.bam' 43 | merged_bam = peak_folder + 'mergeAll.bam' 44 | for folder in bam_folder: 45 | path = workspace_folder + folder + '/' 46 | if len(folder.split('.'))<=1: 47 | marked_bam.extend([path + x for x in os.listdir(path) if x[-10:]=='marked.bam']) 48 | if len(marked_bam)<=1000: 49 | marked_bam = ' '.join(marked_bam) 50 | os.popen('samtools merge -f ' + merged_bam + ' ' + marked_bam) 51 | else: 52 | n_batch = len(marked_bam)//1000 + 1 53 | temps = [] 54 | for i_batch in range(0, n_batch): 55 | temp_bam = peak_folder+'temp_'+str(i_batch)+'.bam' 56 | temps.append(temp_bam) 57 | start, end = i_batch*1000, min((i_batch+1)*1000, len(marked_bam)) 58 | marked = ' '.join(marked_bam[start:end]) 59 | os.popen('samtools merge -f ' + temp_bam + ' ' + marked) 60 | os.popen('samtools index ' + temp_bam) 61 | all_temp = ' '.join(temps) 62 | os.popen('samtools merge -f ' + merged_bam + ' ' + all_temp) 63 | # 64 | os.popen('samtools index ' + merged_bam) 65 | print '!!!!!! merge done !!!!!!' 66 | print 67 | # 68 | hist_log = peak_folder + 'mergeAll.hist.log' 69 | hist_pdf = peak_folder + 'mergeAll.hist.pdf' 70 | os.popen('java -XX:+UseSerialGC -Xmx1g -jar '+options.picard+' CollectInsertSizeMetrics VALIDATION_STRINGENCY=SILENT I=' 71 | + merged_bam + ' O=' + hist_log + ' H=' + hist_pdf + ' W=1000') 72 | # 73 | refSeqTSS = peak_folder + 'mergeAll.RefSeqTSS' 74 | subroutines.draw_TSS_insert(tssFile, merged_bam, refSeqTSS) 75 | # 76 | print '!!!!!! 
call peak by macs2 !!!!!!' 77 | peak_file = peak_folder + 'peaks' 78 | os.popen('macs2 callpeak --nomodel -t ' + merged_bam + ' -n ' 79 | + peak_file + ' --nolambda --keep-dup all --call-summits') 80 | print '!!!!!! call peak done !!!!!!' 81 | print 82 | # 83 | summit = peak_folder + 'peaks_summits.bed' 84 | filtered_peak = peak_folder + 'filtered_peaks.bed' 85 | if options.blist: 86 | print '!!!!!! filter peaks !!!!!!' 87 | os.popen('bedtools intersect -v -a ' + summit + ' -b ' + options.blist 88 | + " | sort -k5 -nr > " + filtered_peak) 89 | print '!!!!!! filter peaks done !!!!!!' 90 | print 91 | else: 92 | os.popen('sort -k5 -nr ' + summit + ' > ' + filtered_peak) 93 | 94 | print '!!!!!! get top N peaks by q-value !!!!!!' 95 | fold_rank = numpy.loadtxt(filtered_peak, 'str', delimiter='\t') 96 | fold_rank[:, 1] = numpy.array(map(int, fold_rank[:, 1])) - 249 # 250 97 | fold_rank[:, 2] = numpy.array(map(int, fold_rank[:, 2])) + 250 98 | toppeaks = peak_folder + 'temp01.bed' 99 | top_peaks = peak_folder + 'top_peaks.bed' 100 | with open(toppeaks, 'w') as output: 101 | for peak in fold_rank: 102 | if float(peak[-1])>=float(options.logq): 103 | print >> output, peak[0]+'\t'+peak[1]+'\t'+peak[2] 104 | os.popen('bedtools sort -i ' + toppeaks + ' > ' + top_peaks) 105 | print '!!!!!! get top peaks done !!!!!!' 106 | print 107 | # 108 | # 109 | print '!!!!!! get transposase bias by GC content !!!!!!' 110 | trans_bias = peak_folder + 'transposase_bias.bed' 111 | temp02_file = peak_folder + 'temp02.bed' 112 | temp03_file = peak_folder + 'temp03.bed' 113 | with open(top_peaks) as annotate_file, open(temp02_file, 'w') as temp02: 114 | for i, line in enumerate(annotate_file): 115 | words = line.split('\t') 116 | leave = words[0:3] 117 | print >> temp02, '\t'.join(leave) 118 | # use GC contents to estimate the transposase bias 119 | os.popen('bedtools nuc -fi ' + genome_fasta + ' -bed ' + temp02_file + ' > ' + temp03_file) 120 | with open(temp03_file) as temp03, open(trans_bias, 'w') as bias: 121 | for i, line in enumerate(temp03): 122 | if i>0: 123 | words = line.split('\t') 124 | leave = words[0:3] + [words[4]] 125 | print >> bias, '\t'.join(leave) 126 | print '!!!!!! get bias done !!!!!!' 127 | # 128 | # 129 | 130 | -------------------------------------------------------------------------------- /code_v1.0.5/prepare_premappedMatrix.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/python 2 | import warnings 3 | warnings.filterwarnings("ignore") 4 | import os 5 | import numpy 6 | import sys 7 | from optparse import OptionParser 8 | import subroutines 9 | # 10 | # 11 | opts = OptionParser() 12 | usage = "Organize premapped data\nusage: %prog -s project --ref ref --fa chr.fa --bg bg.txt --meme motifs.meme --np 4" 13 | opts = OptionParser(usage=usage, version="%prog 1.0.5") 14 | opts.add_option("-s", help="The project folder") 15 | opts.add_option("--motif", default='yes', help="=yes if users want to caclulate motif bias, default=yes. " 16 | +"It's necessory for cell clustering by motifs (chromVAR) and searching differential motifs.") 17 | opts.add_option("--gene", default='no', help="=yes if users want to estimate gene score, default=no. 
" 18 | +"It's necessory for searching differential genes.") 19 | opts.add_option("--ref", default='hg19', help="Genome reference, default=hg19") 20 | opts.add_option("--fa", default='../reference/hg19_chr.fa', help="Genome fasta file, default=../reference/hg19_chr.fa") 21 | opts.add_option("--bg", default='../reference/tier1_markov1.norc.txt', 22 | help="Background file, default=../reference/tier1_markov1.norc.txt") 23 | opts.add_option("--pvalue", default=0.00005, help="P-value threshold for FIMO, default=0.00005") 24 | opts.add_option("--meme", default='../reference/JASPAR2018_CORE_vertebrates_redundant_pfms_meme.txt', 25 | help="Motif file, default=../reference/JASPAR2018_CORE_vertebrates_redundant_pfms_meme.txt") 26 | opts.add_option("--distal", default=20000, 27 | help="distal region around TSS for peak searching, default=20000") 28 | opts.add_option("--np", default=1, help="Number of CPU cores to use, default=1") 29 | options, arguments = opts.parse_args() 30 | # 31 | # 32 | os.popen('cp '+options.s+'/data/cell_info.csv '+options.s+'/matrix/filtered_cells.csv') 33 | os.popen('cp '+options.s+'/peak/top_peaks.bed '+options.s+'/peak/top_filtered_peaks.bed') 34 | # 35 | if options.motif=='yes': 36 | top_peaks = options.s + '/peak/top_filtered_peaks.bed' 37 | print '!!!!!! get transposase bias by GC content !!!!!!' 38 | trans_bias = options.s + '/peak/transposase_bias_filtered.bed' 39 | temp02_file = options.s + '/peak/temp02.bed' 40 | temp03_file = options.s + '/peak/temp03.bed' 41 | with open(top_peaks) as annotate_file, open(temp02_file, 'w') as temp02: 42 | for i, line in enumerate(annotate_file): 43 | words = line.split('\t') 44 | leave = words[0:3] 45 | print >> temp02, '\t'.join(leave) 46 | os.popen('bedtools nuc -fi ' + options.fa + ' -bed ' + temp02_file + ' > ' + temp03_file) 47 | with open(temp03_file) as temp03, open(trans_bias, 'w') as bias: 48 | for i, line in enumerate(temp03): 49 | if i>0: 50 | words = line.split('\t') 51 | leave = words[0:3] + [words[4]] 52 | print >> bias, '\t'.join(leave) 53 | print '!!!!!! get bias done !!!!!!' 
54 | # 55 | motif_folder = options.s + '/matrix/motif' 56 | peaks_file = options.s + '/peak/top_filtered_peaks.bed' 57 | if os.path.exists(motif_folder): os.popen('rm -rf ' + motif_folder) 58 | os.popen('mkdir ' + motif_folder) 59 | # run FIMO for motif-site searching 60 | motifFasta = options.s + '/matrix/motif.fasta' 61 | os.popen('bedtools getfasta -fi ' + options.fa + ' -bed ' + peaks_file + ' -fo ' + motifFasta) 62 | subroutines.batch_fimo(options.bg, options.pvalue, options.meme, motifFasta, motif_folder, int(options.np)) 63 | # motif annotation 64 | TFmatrix_file = options.s + '/matrix/motif_filtered.csv' 65 | subroutines.score_peaks(peaks_file, motif_folder, TFmatrix_file) 66 | # 67 | # 68 | if options.gene=='yes': 69 | gtf = '../reference/hg19_RefSeq_genes.gtf' 70 | if options.ref=='mm10': gtf = '../reference/mm10_RefSeq_genes.gtf' 71 | os.popen('python prepare_geneScore.py -s '+options.s+' --gtf '+gtf+' --distal '+str(options.distal)) 72 | # 73 | # 74 | # 75 | -------------------------------------------------------------------------------- /code_v1.0.5/prepare_qualityControl.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/python 2 | import warnings 3 | warnings.filterwarnings("ignore") 4 | import os 5 | import numpy 6 | import scipy.sparse 7 | import scipy.io 8 | import pandas 9 | import sys 10 | import matplotlib 11 | matplotlib.use('Agg') 12 | import matplotlib.pyplot as plt 13 | from optparse import OptionParser 14 | # 15 | # 16 | opts = OptionParser() 17 | usage = "Build QC table\nusage: %prog -s project --pfrag 0.2 --lib 2000" 18 | opts = OptionParser(usage=usage, version="%prog 1.0.5") 19 | opts.add_option("-s", help="The project folder.") 20 | opts.add_option("--pfrag", default=0.2, help="Threshold for percentage of fragments in peaks, " 21 | +"default=0.2, decrease it for more filtered-samples, increase it for better quality") 22 | opts.add_option("--lib", default=2000, help="Threshold for fragment number, default=2000, " 23 | +"decrease it for more filtered-samples, increase it for better quality") 24 | options, arguments = opts.parse_args() 25 | # 26 | if not os.path.exists(options.s+'/figure'): os.popen('mkdir ' + options.s+'/figure') 27 | frag_thresh = float(options.pfrag) 28 | lib_size_thresh = int(options.lib) 29 | # 30 | reads_df = pandas.read_csv(options.s+'/matrix/reads.csv', sep=',', index_col=0, 31 | engine='c', na_filter=False, low_memory=False) 32 | reads = reads_df.values 33 | cell_info = pandas.read_csv(options.s+'/matrix/cell_info.merged.csv', sep='\t', index_col=0) 34 | cells_name = reads_df.index.values 35 | cell_info = cell_info.loc[cells_name] 36 | libSize = cell_info['final_reads'].values 37 | double = 1 38 | readsInPeaks = reads.sum(axis=1).astype(float)*double/cell_info['final_reads'].values 39 | cell_quality = numpy.vstack((libSize, readsInPeaks)) 40 | cell_quality_df = pandas.DataFrame(cell_quality.T, index=cell_info.index.values, columns=['lib_size', 'frag_in_peak']) 41 | cell_quality_df.to_csv(options.s+'/matrix/cell_quality.csv', sep='\t') 42 | # 43 | plt.scatter(libSize+1, readsInPeaks, s=10) 44 | plt.vlines(lib_size_thresh, -0.05, 1.05, linestyles='dashed') 45 | plt.hlines(frag_thresh, 1e2, 1e7, linestyles='dashed') 46 | plt.xscale('log') 47 | plt.xlabel('final mapped reads') 48 | plt.ylabel('fragments in peaks(%)') 49 | plt.xlim(1e2, 1e7) 50 | plt.ylim(-0.05, 1.05) 51 | plt.savefig(options.s + '/figure/cell_quality.pdf') 52 | # 53 | cell_quality_df = 
cell_quality_df.loc[cell_quality_df['lib_size']>lib_size_thresh] 54 | cell_quality_df = cell_quality_df.loc[cell_quality_df['frag_in_peak']>frag_thresh] 55 | cell_names = cell_quality_df.index.values 56 | filtered_cells = cell_info.loc[cell_names, 'notes'] 57 | filtered_cells = pandas.DataFrame(filtered_cells.values, index=filtered_cells.index, columns=['notes']) 58 | filtered_cells.to_csv(options.s+'/matrix/filtered_cells.csv', sep='\t') 59 | # 60 | drop_peaks = [peak for peak in reads_df.columns.values if len(numpy.where(reads_df[peak].values>0)[0])<3] 61 | reads_df = reads_df.drop(drop_peaks, axis=1) 62 | drop_index = [int(x[4:]) for x in drop_peaks] 63 | with open(options.s+'/peak/top_peaks.bed') as in_file, \ 64 | open(options.s+'/peak/top_filtered_peaks.bed', 'w') as out_file: 65 | for iline,line in enumerate(in_file): 66 | if iline not in drop_index: 67 | print >> out_file, line[:-1] 68 | # 69 | motif_df = pandas.read_csv(options.s+'/matrix/motif_TF.csv', sep=',', index_col=0) 70 | motif_df = motif_df.loc[reads_df.columns.values] 71 | peak_num = [int(x[4:]) for x in reads_df.columns.values] 72 | bias = open(options.s+'/peak/transposase_bias.bed').readlines() 73 | with open(options.s+'/peak/transposase_bias_filtered.bed', 'w') as output: 74 | for ipeak in peak_num: 75 | print >> output, bias[ipeak][:-1] 76 | reads_df = reads_df.loc[cell_names] 77 | reads_df.to_csv(options.s+'/matrix/filtered_reads.csv', sep=',') 78 | scipy.io.mmwrite(options.s+'/matrix/filtered_reads.mtx', scipy.sparse.coo_matrix(reads_df.values)) 79 | motif_df.to_csv(options.s+'/matrix/motif_filtered.csv', sep=',') 80 | print reads_df.shape, motif_df.shape 81 | # 82 | # 83 | -------------------------------------------------------------------------------- /code_v1.0.5/prepare_trimming.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/python 2 | import warnings 3 | warnings.filterwarnings("ignore") 4 | # 5 | import sys 6 | import os 7 | from optparse import OptionParser 8 | import Levenshtein 9 | from multiprocessing import Pool 10 | # 11 | # 12 | opts = OptionParser() 13 | usage = "Trim adapter\nusage: %prog -s project --np 4" 14 | opts = OptionParser(usage=usage, version="%prog 1.0") 15 | opts.add_option("-s", help="The project path, should contains folder, where the pair-end sequencing data locate. " 16 | +"If you want to use this code to build cell_info.csv file, each fastq file should be names as:" 17 | +"type1-001_1.fastq, type1-001_2.fastq, type1-002_1.fastq, type1-002_2.fastq, ... ;" 18 | +"type2-001_1.fastq, type2-001_2.fastq, type2-002_1.fastq, type2-002_2.fastq, ... ; etc. 
" 19 | +"{type1, type2, ..., typeN} can be cell-types for your samples, such as {GM, K562, ...}, " 20 | +"or you can just use any name you want, but make sure there is no underline(_) or dashline(-) in typeX.") 21 | opts.add_option("--qlen", default=20, help="Query length for adatper trimming, default=20.") 22 | opts.add_option("--aseq", default="CTGTCTCTTATACACATCTGACGCTGCCGACGA", help="Adapter sequence, " 23 | +"default=CTGTCTCTTATACACATCTGACGCTGCCGACGA.") 24 | opts.add_option("--np", default=1, help="Number of CPUs used for trimming in parallel, default=1.") 25 | options, arguments = opts.parse_args() 26 | # 27 | # 28 | global query_length, adatper_seq 29 | query_length = options.qlen 30 | adapter_seq = options.aseq 31 | # 32 | # 33 | def mismatch_align(seq1, query_length, read2_rc): 34 | for s1 in range(len(seq1)-query_length+1, -1, -1): 35 | temp_read1 = seq1[s1:(s1+query_length)] 36 | editdist = Levenshtein.distance(temp_read1, read2_rc) 37 | if editdist<2: 38 | return s1 39 | return -1 40 | # 41 | # 42 | def rev_comp_dna(read2_rc): 43 | temp_read2 = '' 44 | for i in range(len(read2_rc)-1, -1, -1): 45 | if (read2_rc[i]=='A') | (read2_rc[i]=='a') : 46 | temp_read2 += 'T' 47 | elif (read2_rc[i]=='C') | (read2_rc[i]=='c') : 48 | temp_read2 += 'G' 49 | elif (read2_rc[i]=='G') | (read2_rc[i]=='g') : 50 | temp_read2 += 'C' 51 | elif (read2_rc[i]=='T') | (read2_rc[i]=='t') : 52 | temp_read2 += 'A' 53 | elif read2_rc[i]=='N': 54 | temp_read2 += 'N' 55 | else: 56 | return 'error' 57 | return temp_read2 58 | # 59 | # 60 | def trim_adapters(fastq): 61 | cutoff = 50 62 | fastq1, fastq2 = fastq + '_1.fastq', fastq + '_2.fastq' 63 | trimed1, trimed2 = fastq + '_1.trim.fastq', fastq + '_2.trim.fastq' 64 | with open(fastq1) as fa1, open(fastq2) as fa2, open(trimed1, 'w') as out1, open(trimed2, 'w') as out2 : 65 | nReads, mm0_num_read, mm1_num_read = 0, 0, 0 66 | while 1: 67 | seq_header1, seq_header2 = fa1.readline()[:-1], fa2.readline()[:-1] 68 | seq1, seq2 = fa1.readline()[:-1], fa2.readline()[:-1] 69 | qual_header1, qual_header2 = fa1.readline()[:-1], fa2.readline()[:-1] 70 | qual1, qual2 = fa1.readline()[:-1], fa2.readline()[:-1] 71 | nReads += 1 72 | if ((not seq_header1) | (not seq_header2) | (not seq1) | (not seq2) | 73 | (not qual_header1) | (not qual_header2) | (not qual1) | (not qual2)): break 74 | read2_rc = seq2[:query_length] 75 | read2_rc = rev_comp_dna(read2_rc) 76 | s1_pos = -1 77 | s1_pos_find = seq1.rfind(read2_rc) 78 | if s1_pos_find > 0 : 79 | s1_pos = s1_pos_find 80 | mm0_num_read += 1 81 | else: 82 | s1_pos = mismatch_align(seq1, query_length, read2_rc) 83 | if s1_pos>0: mm1_num_read += 1 84 | if s1_pos >= 0 : 85 | seq_len = s1_pos + query_length 86 | trim_seq1 = seq1[seq_len:] 87 | adapter_trim_seq = adapter_seq[:len(trim_seq1)] 88 | if adapter_trim_seq==trim_seq1: 89 | seq1 = seq1[:seq_len] 90 | seq2 = seq2[:seq_len] 91 | qual1 = qual1[:seq_len] 92 | qual2 = qual2[:seq_len] 93 | print >> out1, seq_header1 94 | print >> out1, seq1[:cutoff] 95 | print >> out1, qual_header1 96 | print >> out1, qual1[:cutoff] 97 | print >> out2, seq_header2 98 | print >> out2, seq2[:cutoff] 99 | print >> out2, qual_header2 100 | print >> out2, qual2[:cutoff] 101 | return nReads, mm0_num_read, mm1_num_read 102 | # 103 | # 104 | fastqs = [x.split('_')[0] for x in os.listdir(options.s+'/data/') if (x[-6:]=='.fastq')&(x[-11:]!='.trim.fastq')] 105 | fastqs = list(set(fastqs)) 106 | fastqs.sort() 107 | pathes = [options.s+'/data/'+x for x in fastqs] 108 | pool = Pool(int(options.np)) 109 | read_info = 
pool.map(trim_adapters, pathes) 110 | pool.close() 111 | pool.join() 112 | # 113 | with open(options.s+'/data/cell_info.csv', 'w') as output: 114 | print >> output, 'name\tnotes' 115 | for fastq in fastqs: 116 | print >> output, fastq + '\t' + '-'.join(fastq.split('-')[:-1]) 117 | # 118 | # 119 | # 120 | # 121 | -------------------------------------------------------------------------------- /code_v1.0.5/run_monocle.R: -------------------------------------------------------------------------------- 1 | # 2 | library('monocle') 3 | args <- commandArgs(T) 4 | # 5 | ####### build CellDataSet type object ########## 6 | # 7 | expr_matrix <- read.csv(args[1], header=TRUE, row.names=1, sep=',', check.names=FALSE) 8 | cells <- read.delim(args[2], row.names=1) 9 | genes <- read.delim(args[3], row.names=1) 10 | pd <- new("AnnotatedDataFrame", data=cells) 11 | fd <- new("AnnotatedDataFrame", data=genes) 12 | HSMM <- newCellDataSet(as.matrix(expr_matrix), phenoData=pd, featureData=fd, expressionFamily=negbinomial.size()) 13 | #HSMM <- detectGenes(HSMM, min_expr=0.001) 14 | #expressed_genes <- row.names(subset(fData(HSMM), num_cells_expressed >= 4)) 15 | #HSMM <- HSMM[expressed_genes, ] 16 | HSMM <- estimateSizeFactors(HSMM) 17 | # 18 | # 19 | ####### reduce dimension ########## 20 | # 21 | HSMM <- reduceDimension(HSMM, max_components=3, norm_method='none', num_dim=20, reduction_method='DDRTree', verbose=T) 22 | HSMM <- orderCells(HSMM) 23 | write.csv(HSMM@reducedDimS, args[4]) 24 | write.csv(HSMM@phenoData@data, args[5]) 25 | # 26 | # 27 | # 28 | -------------------------------------------------------------------------------- /code_v1.0.5/subroutines.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/QuKunLab/APEC/e15d6c42e05b2deb9c6b89f678905c6ef9878e2b/code_v1.0.5/subroutines.pyc -------------------------------------------------------------------------------- /code_v1.0.6/APEC_prepare_steps.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/bash 2 | # 3 | #### input parameters defined by users ############################################# 4 | # 5 | ARGS=`getopt -o hs:g:n:l:p:f: -l help,project:,genome:,np:,logq:,pfrag:,frag: -- "$@"` 6 | if [ $? != 0 ] ; then echo "Terminating..." >&2 ; exit 1 ; fi 7 | eval set -- "$ARGS" 8 | while true ; do 9 | case "$1" in 10 | -h|--help) 11 | echo " 12 | bash APEC_prepare_steps.sh -s project -g genome_index -n nCPUs -l logq -p pfrag -f frag 13 | -s/--project: The project path, which should contain folder before running APEC. 14 | -g/--genome: hg19 or mm10. 15 | -n/--np: Number of CPU cores. 16 | -l/--logq: Threshold for the -log(Q-value) of peaks, used to filter peaks. 17 | -p/--pfrag: Threshold of the percentage of fragments in peaks, used to filter cells. 18 | -f/--frag: Threshold of the fragment number of each cell, used to filter cells." 
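# Example invocation (hypothetical paths; -l 3, -p 0.2 and -f 2000 mirror the
# defaults of prepare_peakCalling.py and prepare_qualityControl.py):
#   bash APEC_prepare_steps.sh -s /path/to/project01 -g hg19 -n 4 -l 3 -p 0.2 -f 2000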
19 | exit 1 ;; 20 | -s|--project) project="$2" ; shift 2;; 21 | -g|--genome) genome="$2" ; shift 2;; 22 | -n|--np) np="$2" ; shift 2;; 23 | -l|--logq) logq="$2" ; shift 2;; 24 | -p|--pfrag) pfrag="$2" ; shift 2;; 25 | -f|--frag) frag="$2" ; shift 2;; 26 | --) shift; break ;; 27 | *) echo "unknown parameter: {$1}" ; exit 1 ;; 28 | esac 29 | done 30 | # 31 | picard=../reference/picard.jar 32 | ref=$genome 33 | fa="../reference/"$genome"_chr.fa" 34 | index="../reference/"$genome 35 | tss="../reference/"$genome"_refseq_genes_TSS.txt" 36 | if [[ $genome == "hg19" ]]; then 37 | blist=../reference/hg19_blacklist.JDB.bed 38 | elif [[ $genome == "mm10" ]]; then 39 | blist=../reference/mm10_blacklist.BIN.bed 40 | fi 41 | gtf="../reference/"$genome"_RefSeq_genes.gtf" 42 | np=$np 43 | logq=$logq 44 | pfrag=$pfrag 45 | frag=$frag 46 | # 47 | # 48 | # 49 | #### processes to prepare raw data ########### 50 | # 51 | python prepare_trimming.py -s $project --np $np 52 | # 53 | python prepare_mapping.py -s $project --index $index --picard $picard --tss $tss --np $np 54 | # 55 | python prepare_peakCalling.py -s $project --blist $blist --fa $fa --tss $tss --ref $ref --logq $logq 56 | # 57 | python prepare_countMatrix.py -s $project --fa $fa --np $np 58 | # 59 | python prepare_qualityControl.py -s $project --pfrag $pfrag --lib $frag 60 | # 61 | #python prepare_geneScore.py -s $project --gtf $gtf 62 | # 63 | # 64 | # 65 | # 66 | -------------------------------------------------------------------------------- /code_v1.0.6/Bias_corrected_deviation.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/python 2 | # 3 | import numpy 4 | import pandas 5 | #import numba 6 | from multiprocessing import Pool 7 | import scipy.sparse 8 | # 9 | # 10 | global GC_bias, peak_reads, nStep, sd 11 | # 12 | # 13 | def mahalanobis_transform(aMatrix): 14 | (nSample, nFeature) = aMatrix.shape 15 | am_sum = aMatrix.sum(axis=0) 16 | ajaMatrix = numpy.zeros((nFeature, nFeature)) 17 | for i in range(0, nFeature): 18 | for j in range(0, nFeature): 19 | ajaMatrix[i,j] = am_sum[i] * am_sum[j] 20 | SaMatrix = numpy.dot(aMatrix.T, aMatrix) / float(nSample) - ajaMatrix / float(nSample**2) 21 | value, vector = numpy.linalg.eig(SaMatrix) 22 | value_inverseRoot = numpy.diag(-abs(value)**0.5) 23 | Sa_inverseRoot = numpy.dot(numpy.dot(vector, value_inverseRoot), vector.T) 24 | aMatrix_ave = aMatrix.mean(axis=0) 25 | aMatrix_ave = numpy.array([aMatrix_ave for i in range(0, nSample)]) 26 | zMatrix = numpy.dot(Sa_inverseRoot, (aMatrix.T - aMatrix_ave.T)) 27 | return zMatrix.T 28 | # 29 | # 30 | #@numba.jit() 31 | def single_sampling(par): 32 | GC_bias, peak_reads, nStep, sd, iIter = par[0], par[1], par[2], par[3], par[4] 33 | numpy.random.seed(12345+iIter) 34 | bias_step = (GC_bias.max() - GC_bias.min()) / float(nStep) 35 | read_step = (peak_reads.max() - peak_reads.min()) / float(nStep) 36 | sample = numpy.zeros(len(GC_bias),dtype=numpy.int) 37 | for ibias,bias_i in enumerate(GC_bias): 38 | bias_iIndex = int((bias_i - GC_bias.min()) // bias_step) 39 | read_iIndex = int((peak_reads[ibias] - peak_reads.min()) // read_step) 40 | bias_iIndex = min(nStep-1, max(0, bias_iIndex)) 41 | read_iIndex = min(nStep-1, max(0, read_iIndex)) 42 | peaks_inGrid = numpy.array([]) 43 | ncount = 0 44 | while (len(peaks_inGrid)<=0) & (ncount<1000): 45 | ncount += 1 46 | bias_jIndex = bias_iIndex + int(numpy.rint(numpy.random.randn()*sd)) 47 | read_jIndex = read_iIndex + int(numpy.rint(numpy.random.randn()*sd)) 48 | while 
(bias_jIndex<0)|(bias_jIndex>=nStep): 49 | bias_jIndex = bias_iIndex + int(numpy.rint(numpy.random.randn()*sd)) 50 | while (read_jIndex<0)|(read_jIndex>=nStep): 51 | read_jIndex = read_iIndex + int(numpy.rint(numpy.random.randn()*sd)) 52 | bias_jStart = GC_bias.min() + bias_jIndex * bias_step 53 | read_jStart = peak_reads.min() + read_jIndex * read_step 54 | bias_jBin = numpy.where((bias_jStart<=GC_bias)&(GC_bias 0]) 50 | TFmotif[numpy.where(TFmotif > 0)] = 1 51 | TFnames = [x for i,x in enumerate(TFnames) if TFmotif_origin[i, :].sum() > 0] 52 | return TFmotif, reads, TFnames, cell_names, GC_bias 53 | # 54 | # 55 | def permuted_sampling(options): 56 | peak_reads = numpy.log10(reads.sum(axis=0)+1.0) 57 | ngrid, std = 50, 1 58 | print(GC_bias.shape, peak_reads.shape) 59 | samples = Bias_corrected_deviation.batch_sampling(GC_bias, peak_reads, ngrid, std, int(options.ns), int(options.np)) 60 | print('permuted sampling done!') 61 | return samples 62 | # 63 | # 64 | def raw_deviation(options): 65 | expected = Bias_corrected_deviation.expected_matrix(reads, TFmotif, GC_bias) 66 | raw_dev = Bias_corrected_deviation.raw_deviation(TFmotif, reads, expected) 67 | numpy.savetxt(options.s+'/result/raw_deviation.txt', raw_dev) 68 | print('raw deviation done!') 69 | return expected, raw_dev 70 | # 71 | # 72 | def background_deviation(iIter): 73 | numpy.random.seed(12345+iIter) 74 | (nCell, nPeak) = reads.shape 75 | (nTF, nPeak) = TFmotif.shape 76 | background_dev = numpy.zeros((nTF, nCell)) 77 | B_matrix = numpy.zeros((nPeak, nPeak)) 78 | for iPeak in range(0, nPeak): 79 | B_matrix[iPeak, int(samples[iIter, iPeak])] = 1 80 | background_dev = Bias_corrected_deviation.deviation(TFmotif, B_matrix, reads, expected) 81 | print('background deviation for sample '+str(iIter+1)+' done!') 82 | return background_dev 83 | # 84 | # 85 | def corrected_deviation(options): 86 | kIterations = numpy.arange(0, int(options.ns), 1, dtype=int) 87 | pool = Pool(int(options.np)) 88 | bg_dev = pool.map(background_deviation, kIterations) 89 | pool.close() 90 | pool.join() 91 | bg_dev = numpy.array(bg_dev) 92 | print('background deviations done!') 93 | bg_dev_mean = bg_dev.mean(axis=0) 94 | bg_dev_std = bg_dev.std(axis=0) 95 | raw_dev = numpy.loadtxt(options.s+'/result/raw_deviation.txt') 96 | corrected_dev = (raw_dev - bg_dev_mean) / bg_dev_std 97 | dev_df = pandas.DataFrame(corrected_dev, index=TFnames, columns=cell_names) 98 | dev_df.to_csv(options.s+'/result/deviation_chromVAR.csv', sep=',') 99 | return 100 | # 101 | # 102 | def cell_cluster(options): 103 | reads_df = pandas.read_csv(options.s+'/result/deviation_chromVAR.csv', sep=',', index_col=0, 104 | engine='c', na_filter=False, low_memory=False) 105 | matrix = reads_df.T 106 | connect = kneighbors_graph(matrix, n_neighbors=20, include_self=False) 107 | connectivity = 0.5*(connect + connect.T) 108 | if int(options.nc)==0: 109 | n_clust, clusters = subroutines.predict_cluster(matrix, connectivity.todense()) 110 | print("predicted number of cell-clusters: ", n_clust) 111 | clusters.to_csv(options.s+'/result/louvain_cluster_by_chromVAR.csv', sep='\t') 112 | tsne_result = McTSNE(n_components=2, random_state=int(options.rs)).fit_transform(matrix.values) 113 | subroutines.plot_cluster(options, clusters, n_clust, tsne_result, 'louvain_cluster_by_chromVAR.pdf') 114 | else: 115 | n_clust = int(options.nc) 116 | clusters = subroutines.knn_cluster(options, matrix, n_clust, connectivity, "KNN_cluster_by_chromVAR.csv") 117 | tsne_result = McTSNE(n_components=2, 
random_state=int(options.rs)).fit_transform(matrix.values) 118 | subroutines.plot_cluster(options, clusters, n_clust, tsne_result, 'KNN_cluster_by_chromVAR.pdf') 119 | # 120 | subroutines.plot_tSNE(options, matrix, tsne_result, "TSNE_by_chromVAR.pdf") 121 | tsne_df = pandas.DataFrame(tsne_result, index=matrix.index, columns=['TSNE1', 'TSNE2']) 122 | tsne_df.to_csv(options.s+'/result/TSNE_by_chromVAR.csv', sep='\t') 123 | # 124 | if options.hc=='yes': 125 | subroutines.hierarchy_cluster(options, reads_df.corr(), n_clust, "cell_cell_correlation_by_chromVAR.png", 126 | "Hierarchical_cluster_by_chromVAR.csv") 127 | return 128 | # 129 | # 130 | TFmotif, reads, TFnames, cell_names, GC_bias = initiation(options) 131 | samples = permuted_sampling(options) 132 | expected, raw_dev = raw_deviation(options) 133 | corrected_deviation(options) 134 | cell_cluster(options) 135 | # 136 | # 137 | # 138 | # 139 | # 140 | -------------------------------------------------------------------------------- /code_v1.0.6/cluster_comparison.py: -------------------------------------------------------------------------------- 1 | import numpy, pandas 2 | from optparse import OptionParser 3 | import scipy.special 4 | import sklearn.metrics 5 | from sklearn.metrics.cluster import contingency_matrix 6 | # 7 | # 8 | pts = OptionParser() 9 | usage = "Compare clustering method\nusage: %prog --c1 cell_info.csv --c2 KNN_cluster.csv" 10 | opts = OptionParser(usage=usage, version="%prog 1.0.6") 11 | opts.add_option("--c1", help="filtered_cells.csv in folder, or cluster_by_XXX.csv in folder") 12 | opts.add_option("--c2", help="cell cluster file different with c1") 13 | options, arguments = opts.parse_args() 14 | # 15 | # 16 | cluster_df1 = pandas.read_csv(options.c1, sep='\t', index_col=0) 17 | cluster_df2 = pandas.read_csv(options.c2, sep='\t', index_col=0) 18 | if 'notes' in cluster_df1.columns.values: cluster_df1['cluster']=cluster_df1['notes'] 19 | if 'notes' in cluster_df2.columns.values: cluster_df2['cluster']=cluster_df2['notes'] 20 | clusters1 = cluster_df1['cluster'].values 21 | clusters2 = cluster_df2['cluster'].values 22 | ari = sklearn.metrics.adjusted_rand_score(clusters1, clusters2) 23 | nmi = sklearn.metrics.mutual_info_score(clusters1, clusters2) 24 | ami = sklearn.metrics.adjusted_mutual_info_score(clusters1, clusters2) 25 | cont_mat = contingency_matrix(clusters1, clusters2) 26 | print('ARI=', ari) 27 | print('NMI=', nmi) 28 | print('AMI=', ami) 29 | print('contingency matrix:') 30 | print(cont_mat) 31 | # 32 | # 33 | # 34 | # 35 | # 36 | -------------------------------------------------------------------------------- /code_v1.0.6/generate_UCSCtrack.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/python 2 | import warnings 3 | warnings.filterwarnings("ignore") 4 | # 5 | import os 6 | import numpy 7 | import pandas 8 | from optparse import OptionParser 9 | # 10 | # 11 | opts = OptionParser() 12 | usage = "Merge counts of cells to track files\nusage: %prog -s project --cfile cluster.csv --gsize chrom.sizes" 13 | opts = OptionParser(usage=usage, version="%prog 1.0") 14 | opts.add_option("-s", help="The project folder.") 15 | opts.add_option("--cfile", help="cluster.csv file, e.g. 
louvain_cluster_by_APEC.csv in folder") 16 | opts.add_option("--gsize", default='../reference/hg19.chrom.sizes', help="chrom.size files, default=../reference/hg19.chrom.sizes") 17 | options, arguments = opts.parse_args() 18 | # 19 | # 20 | def merge_bam(options): 21 | bam_folder = [x for x in os.listdir(options.s+'/work')] 22 | bam_folder.sort() 23 | cell_df = pandas.read_csv(options.cfile, sep='\t', index_col=0) 24 | if 'cell_info' in options.cfile: cell_df['cluster'] = cell_df['notes'] 25 | cell_types = cell_df['cluster'].values 26 | cell_types = list(set(cell_types)) 27 | for cell_type in cell_types: 28 | marked_bam, select = [], [] 29 | for folder in bam_folder: 30 | path = options.s + '/work/' + folder + '/' 31 | if folder in cell_df.index.values: 32 | if cell_df.ix[folder, 'cluster']==cell_type: 33 | select.append(folder) 34 | marked_bam.extend([path + x for x in os.listdir(path) if x[-10:]=='marked.bam']) 35 | marked_bam = ' '.join(marked_bam) 36 | merged_bam = options.s + '/result/track/' + str(cell_type) + '.bam' 37 | os.popen('samtools merge -f ' + merged_bam + ' ' + marked_bam) 38 | os.popen('samtools index ' + merged_bam) 39 | return 40 | # 41 | # 42 | def bam2bw(options): 43 | cell_df = pandas.read_csv(options.cfile, sep='\t', index_col=0) 44 | if 'cell_info' in options.cfile: cell_df['cluster'] = cell_df['notes'] 45 | cell_types = cell_df['cluster'].values 46 | cell_types = list(set(cell_types)) 47 | for cell_type in cell_types: 48 | name = options.s+'/result/track/'+str(cell_type) 49 | cells = cell_df.loc[cell_df['cluster']==cell_type] 50 | cells = cells.index.values 51 | os.popen('bedtools genomecov -bg -ibam '+name+'.bam -g '+options.gsize+' > '+name+'.bedgraph') 52 | os.popen('bedtools sort -i '+name+'.bedgraph > '+name+'.sorted.bedgraph') 53 | counts = numpy.array([int(x.split()[3]) for x in open(name+'.sorted.bedgraph').readlines()]) 54 | total = counts.sum() 55 | with open(name+'.sorted.bedgraph') as infile, open(name+'.norm.bedgraph', 'w') as outfile: 56 | for line in infile: 57 | words = line.split() 58 | # words[3] = str(round(float(words[3]) * 1.0e7 / total)) 59 | words[3] = str(round(float(words[3]) * 100.0 / len(cells))) 60 | # print >> outfile, '\t'.join(words) 61 | outfile.write('\t'.join(words)+'\n') 62 | os.popen('bedGraphToBigWig '+name+'.norm.bedgraph '+options.gsize+' '+name+'.bw') 63 | return 64 | # 65 | # 66 | # 67 | os.popen('mkdir ' + options.s + '/result/track') 68 | merge_bam(options) 69 | bam2bw(options) 70 | # 71 | # 72 | # 73 | -------------------------------------------------------------------------------- /code_v1.0.6/generate_differential_Accesson.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/python 2 | import warnings 3 | warnings.filterwarnings("ignore") 4 | # 5 | import numpy 6 | import pandas 7 | from optparse import OptionParser 8 | from scipy import stats 9 | import os 10 | import sys 11 | from multiprocessing import Pool 12 | import subroutines 13 | # 14 | # 15 | opts = OptionParser() 16 | usage = "Enriched accessons of a cluster (batch) \nusage: %prog -s project --cfile cluster.csv --cluster 1 --vs 2,3" 17 | opts = OptionParser(usage=usage, version="%prog 1.0") 18 | opts.add_option("-s", help="The project folder.") 19 | opts.add_option("--cfile", help="cluster.csv file of a clustering method, e.g. 
louvain_cluster_by_APEC.csv in result folder") 20 | opts.add_option("--cluster", help="the cluster(s) for specific-TF analysis, can be {0, 1, ..., nCluster}, or a batch of clusters like 0,2,3") 21 | opts.add_option("--vs", default='all', help="vs which cluster(s) to search specific TF for target cluster(s), e.g. 1,4,2, default=all") 22 | opts.add_option("--pvalue", default=0.001, help='P-value threshold for specific peaks, default=0.001') 23 | opts.add_option("--fold", default=2, help='Fold change cutoff of specific peaks, default=2') 24 | options, arguments = opts.parse_args() 25 | # 26 | # 27 | subname = '_'.join(options.cluster.split(',')) 28 | if options.vs!='all': 29 | subname += '_VS_' + '_'.join(options.vs.split(',')) 30 | subroutines.specific_accesson(options, subname) 31 | # 32 | # 33 | -------------------------------------------------------------------------------- /code_v1.0.6/generate_differential_markers.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/python 2 | import warnings 3 | warnings.filterwarnings("ignore") 4 | # 5 | import numpy 6 | import pandas 7 | from optparse import OptionParser 8 | from scipy import stats 9 | import os 10 | import sys 11 | from multiprocessing import Pool 12 | import subroutines 13 | # 14 | # 15 | opts = OptionParser() 16 | usage = "Enriched motifs/genes/peaks of a cluster (batch)\nusage: %prog -s project --cfile cluster.csv --cluster 1 --vs 2,3" 17 | opts = OptionParser(usage=usage, version="%prog 1.0") 18 | opts.add_option("-s", help="The project folder.") 19 | opts.add_option("--cfile", help="cluster.csv file of a clustering method, e.g. louvain_cluster_by_APEC.csv in result folder") 20 | opts.add_option("--cluster", help="The cluster for differential markers analysis, can be {0, 1, ..., nCluster}, or a batch of clusters like 0,2,3") 21 | opts.add_option("--vs", default='all', help="vs which clusters to search differential markers for target clusters, e.g. 1,4,2, default=all") 22 | opts.add_option("--pvalue", default=0.001, help='P-value threshold for differential markers, default=0.001') 23 | opts.add_option("--fold", default=2, help='Fold change cutoff of differential markers, default=2') 24 | opts.add_option("--peak", default='no', help='Whether to search differential peaks for target cluster, default=yes.') 25 | opts.add_option("--motif", default='no', help='Whether to search differential motifs for target cluster, default=no. 
' 26 | +'Users need to run cluster_byMotif.py first') 27 | opts.add_option("--gene", default='yes', help='Whether to search differential genes for target cluster, default=yes.') 28 | options, arguments = opts.parse_args() 29 | # 30 | # 31 | def group_cells(options): 32 | # deviation = pandas.read_csv(options.s+'/result/deviation_chromVAR.csv', sep=',', index_col=0, 33 | # engine='c', na_filter=False, low_memory=False) 34 | cells_df = pandas.read_csv(options.s+'/matrix/filtered_cells.csv', sep='\t', index_col=0, 35 | engine='c', na_filter=False, low_memory=False) 36 | cluster_df = pandas.read_csv(options.cfile, sep='\t', index_col=0) 37 | kCluster = options.cluster.split(',') 38 | if options.vs!='all': vsCluster = options.vs.split(',') 39 | if 'cluster' not in cluster_df.columns.values: 40 | cluster_df['cluster'] = cluster_df['notes'] 41 | else: 42 | kCluster = map(int, kCluster) 43 | if options.vs!='all': vsCluster = map(int, vsCluster) 44 | if options.vs=='all': vsCluster = list(set(cluster_df['cluster'].values)-set(kCluster)) 45 | cluster_df = cluster_df.loc[cells_df.index.values] 46 | cell_inCluster = cluster_df.loc[cluster_df['cluster'].isin(kCluster)].index.values 47 | cell_outCluster = cluster_df.loc[cluster_df['cluster'].isin(vsCluster)].index.values 48 | print(len(cell_inCluster), len(cell_outCluster)) 49 | return cell_inCluster, cell_outCluster 50 | # 51 | # 52 | def get_diff_motifs(options, subname, cell_inCluster, cell_outCluster): 53 | deviation = pandas.read_csv(options.s+'/result/deviation_chromVAR.csv', sep=',', index_col=0, 54 | engine='c', na_filter=False, low_memory=False) 55 | dev_in, dev_out = deviation[cell_inCluster].values, deviation[cell_outCluster].values 56 | mean_in, mean_out = dev_in.mean(axis=1), dev_out.mean(axis=1) 57 | delta = mean_in - mean_out 58 | ttest, pvalues = stats.ttest_ind(dev_in.T, dev_out.T, equal_var=False) 59 | matrix = numpy.array([mean_in, mean_out, delta, pvalues]).T 60 | columns = ['dev_inCluster', 'dev_outCluster', 'd_dev', 'P-value'] 61 | compare_df = pandas.DataFrame(matrix, index=deviation.index, columns=columns) 62 | compare_df = compare_df.loc[compare_df['d_dev']>=1] 63 | compare_df = compare_df.loc[compare_df['P-value']<=float(options.pvalue)] 64 | compare_df = compare_df.sort_values(by=['P-value']) 65 | compare_df.to_csv(options.s+'/result/motifs_of_cluster_'+subname+'.csv', sep='\t') 66 | return 67 | # 68 | # 69 | def get_diff_genes(options, subname, cell_inCluster, cell_outCluster): 70 | expr = pandas.read_csv(options.s+'/matrix/gene_score.csv', sep=',', index_col=0, 71 | engine='c', na_filter=False, low_memory=False).T 72 | dev_in, dev_out = expr[cell_inCluster].values, expr[cell_outCluster].values 73 | mean_in, mean_out = dev_in.mean(axis=1), dev_out.mean(axis=1) 74 | delta = (mean_in + 1e-4) / (mean_out + 1e-4) 75 | ttest, pvalues = stats.ttest_ind(dev_in.T, dev_out.T, equal_var=False) 76 | matrix = numpy.array([mean_in, mean_out, delta, pvalues]).T 77 | columns = ['expr_inCluster', 'expr_outCluster', 'fold', 'P-value'] 78 | compare_df = pandas.DataFrame(matrix, index=expr.index, columns=columns) 79 | compare_df = compare_df.loc[compare_df['fold']>=float(options.fold)] 80 | compare_df = compare_df.loc[compare_df['P-value']<=float(options.pvalue)] 81 | compare_df = compare_df.sort_values(by=['P-value']) 82 | compare_df.to_csv(options.s+'/result/genes_of_cluster_'+subname+'.csv', sep='\t') 83 | return 84 | # 85 | # 86 | subname = '_'.join(options.cluster.split(',')) 87 | if options.vs!='all': 88 | subname += '_VS_' + 
'_'.join(options.vs.split(',')) 89 | if options.peak=='yes': 90 | subroutines.specific_peak(options, subname) 91 | cells_in, cells_out = group_cells(options) 92 | if options.motif=='yes': 93 | get_diff_motifs(options, subname, cells_in, cells_out) 94 | if options.gene=='yes': 95 | get_diff_genes(options, subname, cells_in, cells_out) 96 | # 97 | # 98 | # 99 | # 100 | -------------------------------------------------------------------------------- /code_v1.0.6/generate_gene_score_by_accesson.py: -------------------------------------------------------------------------------- 1 | import os,numpy,pandas,time,numba,scipy.stats 2 | from optparse import OptionParser 3 | # 4 | # 5 | opts = OptionParser() 6 | usage = "Generate gene score from accesson\nusage: %prog -s project --genome hg19" 7 | opts = OptionParser(usage=usage, version="%prog 1.0") 8 | opts.add_option("-s", help="The project folder.") 9 | opts.add_option("--genome", default='hg19', help="Genome reference for gene annotation, can be hg19 or mm10, default=hg19") 10 | opts.add_option("--width", default=1000000, help="Width of genome window for fisher exact test, default=1000000") 11 | opts.add_option("--pvalue", default=0.01, help='P-value threshold for fisher exact test, default=0.01') 12 | options, arguments = opts.parse_args() 13 | # 14 | # 15 | def annotate_peak(project, genome='hg19'): 16 | peaks_bed = project + '/peak/top_filtered_peaks.bed' 17 | annotated = project + '/peak/top_annotated_peaks.bed' 18 | os.popen('annotatePeaks.pl '+peaks_bed+' '+genome+' -strand both -size given > '+annotated) 19 | return 20 | # 21 | # 22 | def annotate_accessons(project, width=1e6, pvalue=0.01): 23 | peak_bed = project + '/peak/top_filtered_peaks.bed' 24 | accesson_csv = project + '/matrix/Accesson_peaks.csv' 25 | annotated_bed = project + '/peak/top_annotated_peaks.bed' 26 | peak_list = [[x.split()[0], (int(x.split()[1])+int(x.split()[2]))//2] for x in open(peak_bed).readlines()] 27 | peak_index = ['peak'+str(x) for x in range(0, len(peak_list))] 28 | peak_df = pandas.DataFrame(peak_list, index=peak_index, columns=['chromosome', 'base']) 29 | annotated_df = pandas.read_csv(annotated_bed, sep='\t', index_col=0, 30 | engine='c', na_filter=False, low_memory=False) 31 | annotated_df.index = ['peak'+x.split('-')[-1] for x in annotated_df.index.values] 32 | accesson_df = pandas.read_csv(accesson_csv, sep='\t', index_col=0, 33 | engine='c', na_filter=False, low_memory=False) 34 | accessons = list(set(accesson_df['group'].values)) 35 | accessons.sort() 36 | accesson_annotate = pandas.DataFrame(columns=['genes', '-log10(P-value)']) 37 | gene_annotate = pandas.DataFrame(columns=['accessons', '-log10(P-value)']) 38 | for acc in accessons: 39 | accPeak_df = peak_df.loc[accesson_df.loc[accesson_df['group']==acc].index] 40 | genes = {} 41 | for peak in accPeak_df.index.values: 42 | chrom, base = accPeak_df.loc[peak, 'chromosome'], accPeak_df.loc[peak, 'base'] 43 | base_up, base_down = base - width, base + width 44 | sameChrom_in_acc = accPeak_df.loc[accPeak_df['chromosome']==chrom] 45 | sameChrom_overall = peak_df.loc[peak_df['chromosome']==chrom] 46 | sameRegion_in_acc = numpy.where(abs(sameChrom_in_acc['base']-base)<=width)[0] 47 | sameRegion_overall = numpy.where(abs(sameChrom_overall['base']-base)<=width)[0] 48 | matrix = numpy.array([[len(sameRegion_in_acc), len(accPeak_df)-len(sameRegion_in_acc)], 49 | [len(sameRegion_overall), len(peak_df)-len(sameRegion_overall)]]) 50 | odd, p_value = scipy.stats.fisher_exact(matrix) 51 | if p_value<=pvalue: 52 | 
gene_symbol = annotated_df.loc[peak, 'Gene Name'] 53 | log10_P = -numpy.log10(p_value) 54 | if gene_symbol not in genes.keys(): 55 | genes[gene_symbol] = log10_P 56 | elif genes[gene_symbol] < log10_P: 57 | genes[gene_symbol] = log10_P 58 | for gene in genes.keys(): 59 | if gene not in gene_annotate.index.values: 60 | gene_annotate.loc[gene, 'accessons'] = str(acc) 61 | gene_annotate.loc[gene, '-log10(P-value)'] = str(genes[gene]) 62 | else: 63 | gene_annotate.loc[gene, 'accessons'] += ';' + str(acc) 64 | gene_annotate.loc[gene, '-log10(P-value)'] += ';' + str(genes[gene]) 65 | accesson_annotate.loc[acc, 'genes'] = ';'.join(genes.keys()) 66 | accesson_annotate.loc[acc, '-log10(P-value)'] = ';'.join(list(map(str, genes.values()))) 67 | accesson_annotate.to_csv(project+'/matrix/Accesson_annotated.csv', sep='\t') 68 | gene_annotate.to_csv(project+'/matrix/gene_annotated.csv', sep='\t') 69 | return 70 | # 71 | # 72 | def get_gene_score(project): 73 | gene_annotate = pandas.read_csv(project+'/matrix/gene_annotated.csv', sep='\t', index_col=0) 74 | accesson_matrix = pandas.read_csv(project+'/matrix/Accesson_reads.csv', sep=',', index_col=0, 75 | engine='c', na_filter=False, low_memory=False) 76 | genes, matrix = [], [] 77 | for gene in gene_annotate.index.values: 78 | accessons = gene_annotate.loc[gene, 'accessons'].split(';') 79 | weight = list(map(float, gene_annotate.loc[gene, '-log10(P-value)'].split(';'))) 80 | if len(accessons)>0: 81 | sub_matrix = accesson_matrix[accessons].values 82 | expression = numpy.average(sub_matrix, axis=1, weights=weight).T 83 | matrix.append(expression) 84 | genes.append(gene) 85 | matrix = numpy.array(matrix).T 86 | expression_df = pandas.DataFrame(matrix, index=accesson_matrix.index, columns=genes) 87 | expression_df.to_csv(project+'/matrix/gene_score.csv', sep=',') 88 | return 89 | # 90 | # 91 | t1 = time.time() 92 | annotate_peak(options.s, genome=options.genome) 93 | annotate_accessons(options.s, int(options.width), float(options.pvalue)) 94 | get_gene_score(options.s) 95 | print(time.time()-t1) 96 | # 97 | # 98 | -------------------------------------------------------------------------------- /code_v1.0.6/generate_markers_on_plots.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/python 2 | import warnings 3 | warnings.filterwarnings("ignore") 4 | # 5 | import numpy 6 | import pandas 7 | from optparse import OptionParser 8 | import matplotlib 9 | matplotlib.use('Agg') 10 | import matplotlib.pyplot as plt 11 | from mpl_toolkits.mplot3d import Axes3D 12 | import sys 13 | import scipy.stats 14 | # 15 | # 16 | opts = OptionParser() 17 | usage = "Render marker on tSNE or trajectory plot\nusage: %prog -s project --cfile TSNE_by_APEC.csv --type motif --name RELA" 18 | opts = OptionParser(usage=usage, version="%prog 1.0") 19 | opts.add_option("-s", help="The project folder.") 20 | opts.add_option("--cfile", help="TSNE_by_APEC.csv or monocle_reduced_dimension.csv file to use for rendering") 21 | opts.add_option("--type", default='motif', help="type of marker to plot, can be motif, gene, or accesson") 22 | opts.add_option("--name", help="name of marker to plot") 23 | opts.add_option("--angle", default='30,30', help='Angles to rotate the 3D trajectory, default=30,30') 24 | opts.add_option("--sharp", default='0', help='Cutoff range for deviation or expression, default=0, i.e. 
no sharpening') 25 | options, arguments = opts.parse_args() 26 | # 27 | # 28 | def draw_marker(options): 29 | if 'monocle_reduced' in options.cfile: 30 | tsne_df = pandas.read_csv(options.cfile, sep=',', index_col=0).T 31 | else: 32 | tsne_df = pandas.read_csv(options.cfile, sep='\t', index_col=0) 33 | tsne, cells = tsne_df.values, tsne_df.index.values 34 | if options.type=='motif': 35 | reads_df = pandas.read_csv(options.s+'/result/deviation_chromVAR.csv', sep=',', index_col=0, 36 | engine='c', na_filter=False, low_memory=False).T 37 | motifs = [] 38 | for tf in reads_df.columns.values: 39 | names = tf.split('-')[:-1] 40 | if options.name in names: motifs.append(tf) 41 | print(motifs) 42 | if len(motifs)==0: 43 | print("No corresponding marker!") 44 | sys.exit() 45 | else: 46 | reads = reads_df.ix[cells, motifs].values.sum(axis=1) 47 | elif options.type=='gene': 48 | reads_df = pandas.read_csv(options.s+'/matrix/gene_score.csv', sep=',', index_col=0, 49 | engine='c', na_filter=False, low_memory=False) 50 | gene = list(set([options.name]).intersection(set(reads_df.columns.values))) 51 | if len(gene)==0: 52 | print("No corresponding marker!") 53 | sys.exit() 54 | else: 55 | reads = reads_df.loc[cells, gene].values.sum(axis=1) 56 | elif options.type=='accesson': 57 | reads_df = pandas.read_csv(options.s+'/matrix/Accesson_reads.csv', sep=',', index_col=0, 58 | engine='c', na_filter=False, low_memory=False) 59 | normal = numpy.array([x/x.sum()*10000 for x in reads_df.values]) 60 | reads_df = pandas.DataFrame(normal, index=reads_df.index, columns=reads_df.columns) 61 | reads = reads_df.loc[cells, options.name].values 62 | order = numpy.argsort(reads) 63 | if str(options.sharp)!='0': 64 | up, down = int(options.sharp.split(',')[-1]), int(options.sharp.split(',')[0]) 65 | reads = numpy.clip(reads, down, up) 66 | # 67 | if 'monocle_reduced' in options.cfile: 68 | clist = ['blue', 'silver', 'red'] 69 | cmap = matplotlib.colors.LinearSegmentedColormap.from_list('mylist', clist, N=256) 70 | fig1 = plt.figure(1, figsize=(12,10)) 71 | ax = fig1.add_subplot(111, projection='3d') 72 | im = ax.scatter(tsne[order,0], tsne[order,1], tsne[order,2], cmap=cmap, c=reads[order], edgecolors='none', s=10) 73 | beta1, beta2 = int(options.angle.split(',')[0]), int(options.angle.split(',')[1]) 74 | ax.view_init(beta1, beta2) 75 | cbar = plt.colorbar(im, shrink=0.15, ticks=[reads.min(), reads.max()], aspect=8) 76 | cbar.ax.set_yticklabels([round(reads.min(),2), round(reads.max(),2)]) 77 | width, height, rad = tsne[:,0].max()-tsne[:,0].min(), tsne[:,1].max()-tsne[:,1].min(), tsne[:,2].max()-tsne[:,2].min() 78 | ax.set_xlim((tsne[:,0].min()-0.01*width, tsne[:,0].max()+0.01*width)) 79 | ax.set_ylim((tsne[:,1].min()-0.01*height, tsne[:,1].max()+0.01*height)) 80 | ax.set_zlim((tsne[:,2].min()-0.01*rad, tsne[:,2].max()+0.01*rad)) 81 | ax.set_zticks([]) 82 | else: 83 | fig1 = plt.figure(1, figsize=(6,5)) 84 | ax = fig1.add_subplot(111) 85 | im = ax.scatter(tsne[order,0], tsne[order,1], cmap='Spectral_r', c=reads[order], edgecolors='none', s=20) 86 | cbar = plt.colorbar(im, shrink=0.15, ticks=[reads.min(), reads.max()], aspect=8) 87 | cbar.ax.set_yticklabels([round(reads.min(),2), round(reads.max(),2)]) 88 | ax.grid(False) 89 | ax.set_xticks([]) 90 | ax.set_yticks([]) 91 | ax.set_title(options.name) 92 | fig1.savefig(options.s+'/figure/'+options.type+'_'+options.name+'_on_'+options.cfile.split('/')[-1].split('.')[0]+'.pdf', 93 | bbox_inches='tight') 94 | return 95 | # 96 | # 97 | draw_marker(options) 98 | # 99 | # 100 | # 101 
| # 102 | -------------------------------------------------------------------------------- /code_v1.0.6/generate_superEnhancer.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/python 2 | import warnings 3 | warnings.filterwarnings("ignore") 4 | # 5 | import numpy 6 | import pandas 7 | from optparse import OptionParser 8 | import matplotlib 9 | matplotlib.use('Agg') 10 | import matplotlib.pyplot as plt 11 | import sys 12 | import scipy.stats 13 | # 14 | # 15 | opts = OptionParser() 16 | usage = "Search super enhancers\nusage: %prog -s project" 17 | opts = OptionParser(usage=usage, version="%prog 1.0") 18 | opts.add_option("-s", help="The project folder.") 19 | options, arguments = opts.parse_args() 20 | # 21 | def search_supper(options): 22 | supper_range = 1e6 23 | peaks = numpy.array([x.split()[:3] for x in open(options.s+'/peak/top_peaks.bed').readlines()]) 24 | chroms = list(set(peaks[:, 0])) 25 | chroms.sort() 26 | accesson_df = pandas.DataFrame.from_csv(options.s+'/matrix/Accesson_peaks.csv', sep='\t') 27 | accessons = list(set(accesson_df['group'].values)) 28 | all_suppers, all_locate, all_base = [], [], [] 29 | for access in accessons: 30 | peaks_group = accesson_df.loc[accesson_df['group']==access].index.values 31 | peaks_index = [int(x[4:]) for x in peaks_group] 32 | peaks_info = peaks[peaks_index, :] 33 | peaks_dict = {x:[] for x in chroms} 34 | peaks_label = {x:[] for x in chroms} 35 | for ip,pp in enumerate(peaks_info): 36 | peaks_dict[pp[0]].append((int(pp[1])+int(pp[2]))/2) 37 | peaks_label[pp[0]].append(peaks_index[ip]) 38 | peaks_label = {x:numpy.array(peaks_label[x]) for x in peaks_label.keys()} 39 | suppers, locate, base = [], [], [] 40 | for chrom in peaks_dict.keys(): 41 | if len(peaks_dict[chrom])>=2: 42 | position = numpy.array(peaks_dict[chrom]) 43 | supper = [numpy.where(abs(position-x)<=supper_range)[0] for x in position if 44 | len(numpy.where(abs(position-x)<=supper_range)[0])>1] 45 | supper = ['-'.join(map(str, x)) for x in supper] 46 | supper = list(set(supper)) 47 | supper = [list(map(int, x.split('-'))) for x in supper] 48 | supper_peaks = numpy.array(['-'.join(map(str, peaks_label[chrom][x])) for x in supper]) 49 | if len(supper_peaks)>0: 50 | for ii,ss in enumerate(supper_peaks): 51 | peaks_in = map(int, ss.split('-')) 52 | start = peaks[peaks_in, 1:].astype(int).min() 53 | end = peaks[peaks_in, 1:].astype(int).max() 54 | delta = numpy.array([abs(peaks_in[i+1]-x) for i,x in enumerate(peaks_in[:-1])]) 55 | close = numpy.where(delta<=2)[0] 56 | percent = len(close)/float(len(delta)) 57 | if (len(delta)>=2) & (percent>0.5) : 58 | suppers.append(ss) 59 | locate.append(access) 60 | base.append(chrom+':'+str(start)+'-'+str(end)) 61 | all_suppers.extend(suppers) 62 | all_locate.extend(locate) 63 | all_base.extend(base) 64 | supper_df = pandas.DataFrame(numpy.array([all_locate, all_base]).T, index=all_suppers, columns=['peaks', 'position']) 65 | supper_df.to_csv(options.s+'/result/potential_super_enhancer.csv', sep='\t') 66 | return 67 | # 68 | # 69 | # 70 | # 71 | search_supper(options) 72 | # 73 | # 74 | # 75 | # 76 | -------------------------------------------------------------------------------- /code_v1.0.6/generate_umap.py: -------------------------------------------------------------------------------- 1 | import sys,getopt,numpy,pandas,umap,scipy.stats 2 | from sklearn.decomposition import PCA 3 | import matplotlib 4 | matplotlib.use('Agg') 5 | import matplotlib.pyplot as plt 6 | # 7 | # 8 | def 
get_parameters(argv): 9 | project, cellinfo, rand_stat, norm_method = '', '', 0, 'zscore' 10 | help_info = ['Plot UMAP figure \n', 11 | 'python generate_umap.py -s \n', 12 | '\t\t\t -c \n', 13 | '\t\t\t -r \n'] 15 | help_info = ''.join(help_info) 16 | try: 17 | opts, args = getopt.getopt(argv,"hs:c:r:n:") 18 | except getopt.GetoptError: 19 | print('Incorrect input parameters!') 20 | print(help_info) 21 | sys.exit(2) 22 | for opt,arg in opts: 23 | if opt=='-h': 24 | print(help_info) 25 | sys.exit() 26 | elif opt=='-s': 27 | project = arg 28 | elif opt=='-c': 29 | cellinfo = arg 30 | elif opt=='-r': 31 | rand_stat = int(arg) 32 | elif opt=='-n': 33 | norm_method = arg 34 | return project, cellinfo, rand_stat, norm_method 35 | # 36 | # 37 | def run_umap(project, cellinfo, rand_stat=0, norm_method='zscore'): 38 | mat_df = pandas.read_csv(project+'/matrix/Accesson_reads.csv', sep=',', index_col=0, 39 | engine='c', na_filter=False, low_memory=False) 40 | if norm_method=='zscore': 41 | matrix = scipy.stats.zscore(mat_df.values, axis=1) 42 | elif norm_method=='probability': 43 | matrix = numpy.array([x*10000.0/x.sum() for x in mat_df.values]) 44 | else: 45 | print('-n should be zscore or probability') 46 | sys.exit() 47 | umap_result = umap.UMAP(n_components=2, random_state=rand_stat).fit_transform(matrix) 48 | cellinfo_df = pandas.read_csv(cellinfo, sep='\t', index_col=0, engine='c', na_filter=False, low_memory=False) 49 | umap_df = pandas.DataFrame(umap_result, index=cellinfo_df.index, columns=['UMAP1', 'UMAP2']) 50 | umap_df.to_csv(project+'/result/UMAP_by_APEC.csv', sep='\t') 51 | # 52 | if 'notes' in cellinfo_df.columns.values: cellinfo_df['cluster'] = cellinfo_df['notes'] 53 | cTypes = list(set(cellinfo_df['cluster'].values)) 54 | cTypes.sort() 55 | cTypeIndex = [numpy.where(cellinfo_df['cluster'].values==x) for x in cTypes] 56 | colors = numpy.array(['pink', 'red', '#377eb8', 'green', 'skyblue', 'lightgreen', 'gold', 57 | '#ff7f00', '#000066', '#ff3399', '#a65628', '#984ea3', '#999999', 58 | '#e41a1c', '#dede00', 'b', 'g', 'c', 'm', 'y', 'k', 59 | '#ADFF2F', '#7CFC00', '#32CD32', '#90EE90', '#00FF7F', '#3CB371', 60 | '#008000', '#006400', '#9ACD32', '#6B8E23', '#556B2F', '#66CDAA', 61 | '#8FBC8F', '#008080', '#DEB887', '#BC8F8F', '#F4A460', '#B8860B', 62 | '#CD853F', '#D2691E', '#8B4513', '#A52A2A', '#778899', '#2F4F4F', 63 | '#FFA500', '#FF4500', '#DA70D6', '#FF00FF', '#BA55D3', '#9400D3', 64 | '#8B008B', '#9370DB', '#663399', '#4B0082']) 65 | fig2, axes = plt.subplots(1, figsize=(15,15)) 66 | for ict,ct in enumerate(cTypes): 67 | axes.scatter(umap_result[cTypeIndex[ict], 0], umap_result[cTypeIndex[ict], 1], c=colors[ict], label=ct, s=50) 68 | axes.legend(bbox_to_anchor=(1, 1), loc=2, borderaxespad=0.) 
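    # Cells are colored by the 'cluster' (or 'notes') column of the cell-info
    # file; the fixed palette above holds 55 entries, so datasets with more
    # clusters than colors would need the list extended.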
69 | fig2.savefig(project+'/figure/UMAP_by_APEC.pdf', bbox_inches='tight') 70 | return 71 | # 72 | # 73 | if __name__=='__main__': 74 | project, cellinfo, rand_stat, norm_method = get_parameters(sys.argv[1:]) 75 | run_umap(project, cellinfo, rand_stat, norm_method) 76 | # 77 | # 78 | # 79 | -------------------------------------------------------------------------------- /code_v1.0.6/prepare_countMatrix.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/python 2 | import warnings 3 | warnings.filterwarnings("ignore") 4 | import os,pandas 5 | from optparse import OptionParser 6 | import subroutines 7 | import scipy.io,scipy.sparse 8 | # 9 | opts = OptionParser() 10 | usage = "Align reads to build matrix\nusage: %prog -s project --fa chr.fa --bg bg --meme motif.meme --np 4" 11 | opts = OptionParser(usage=usage, version="%prog 1.0") 12 | opts.add_option("-s", help="The project folder.") 13 | opts.add_option("--fa", default='../reference/hg19_chr.fa', help="Genome fasta file, default=../reference/hg19_chr.fa") 14 | opts.add_option("--bg", default='../reference/tier1_markov1.norc.txt', 15 | help="Background file, default=../reference/tier1_markov1.norc.txt") 16 | opts.add_option("--pvalue", default=0.00005, help="P-value threshold for FIMO, default=0.00005") 17 | opts.add_option("--meme", default='../reference/JASPAR2018_CORE_vertebrates_redundant_pfms_meme.txt', 18 | help="Motif file, default=../reference/JASPAR2018_CORE_vertebrates_redundant_pfms_meme.txt") 19 | opts.add_option("--np", default=1, help="Number of CPU cores used for motif searching, default=1") 20 | options, arguments = opts.parse_args() 21 | # 22 | matrix_folder = options.s + '/matrix/' 23 | peaks_folder = options.s + '/peak/' 24 | work_folder = options.s + '/work/' 25 | motif_folder = matrix_folder + '/motif' 26 | peaks_file = peaks_folder + '/top_peaks.bed' 27 | cell_info = options.s + '/data/cell_info.csv' 28 | motifFasta = matrix_folder + '/motif.fasta' 29 | TFmatrix_file = matrix_folder + '/motif_TF.csv' 30 | bam_file = peaks_folder + "/mergeAll.bam" 31 | reads_matrix = matrix_folder + "/reads.mtx" 32 | # 33 | if not os.path.exists(matrix_folder): os.popen('mkdir ' + matrix_folder) 34 | if os.path.exists(motif_folder): os.popen('rm -rf ' + motif_folder) 35 | os.popen('mkdir ' + motif_folder) 36 | # 37 | #### run FIMO for motif-site searching 38 | os.popen('bedtools getfasta -fi ' + options.fa + ' -bed ' + peaks_file + ' -fo ' + motifFasta) 39 | subroutines.batch_fimo(options.bg, options.pvalue, options.meme, motifFasta, motif_folder, int(options.np)) 40 | # 41 | #### motif annotation 42 | subroutines.score_peaks(peaks_file, motif_folder, TFmatrix_file) 43 | # 44 | #### count reads for peaks 45 | cell_info_df = pandas.read_csv(cell_info, sep='\t', index_col=0) 46 | matrix = subroutines.counts_per_peak(bam_file, peaks_file, reads_matrix, cell_info_df) 47 | matrix = scipy.sparse.coo_matrix(matrix.T) 48 | scipy.io.mmwrite(reads_matrix, matrix) 49 | # 50 | # 51 | subroutines.QC_table(cell_info, work_folder, matrix_folder) 52 | # 53 | # 54 | -------------------------------------------------------------------------------- /code_v1.0.6/prepare_geneScore.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/python 2 | import warnings 3 | warnings.filterwarnings("ignore") 4 | import os,numpy,pandas,sys,scipy.io,scipy.sparse,time,numba 5 | from optparse import OptionParser 6 | # 7 | # 8 | opts = OptionParser() 9 | usage = "Evaluate gene 
score by TSS peaks\nusage: %prog -s project --gtf hg19.gtf --distal 20000" 10 | opts = OptionParser(usage=usage, version="%prog 1.0") 11 | opts.add_option("-s", help="The project folder.") 12 | opts.add_option("--gtf", default='../reference/hg19_RefSeq_genes.gtf', 13 | help="gtf file for genome, default=../reference/hg19_RefSeq_genes.gtf") 14 | opts.add_option("--distal", default=20000, 15 | help="distal region around TSS for peak searching, default=20000") 16 | options, arguments = opts.parse_args() 17 | # 18 | # 19 | def get_tss_region(options): 20 | mm10_df = pandas.read_csv(options.gtf, sep='\t', index_col=0) 21 | genes = list(set(mm10_df['name2'])) 22 | genes.sort() 23 | mm10_df.index = mm10_df['name'] 24 | names, tss = [], [] 25 | for symbol in genes: 26 | sub_df = mm10_df.loc[mm10_df['name2']==symbol] 27 | if len(sub_df.index.values)>=1: 28 | chrom = list(set(sub_df['chrom'].values)) 29 | strand = list(set(sub_df['strand'].values)) 30 | if len(chrom)==1: 31 | if strand[0]=='+': 32 | starts = list(set(map(str, sub_df['txStart'].values))) 33 | start = ','.join(starts) 34 | elif strand[0]=='-': 35 | starts = list(set(map(str, sub_df['txEnd'].values))) 36 | start = ','.join(starts) 37 | names.append(symbol) 38 | tss.append([chrom[0], start]) 39 | tss = numpy.array(tss) 40 | tss_df = pandas.DataFrame(tss, index=names, columns=['chrom', 'tss']) 41 | tss_df.to_csv(options.s+'/peak/genes_tss_region.csv', sep='\t') 42 | return 43 | # 44 | # 45 | def get_tss_peaks(options): 46 | peaks = [[x.split()[0], (int(x.split()[1])+int(x.split()[2]))/2] 47 | for x in open(options.s+'/peak/top_filtered_peaks.bed').readlines()] 48 | peaks_df = pandas.DataFrame(peaks, index=[str(x) for x in numpy.arange(0,len(peaks))], 49 | columns=['chrom', 'center']) 50 | tss_df = pandas.read_csv(options.s+'/peak/genes_tss_region.csv', sep='\t', index_col=0) 51 | for gene in tss_df.index.values: 52 | chrom, tsses = tss_df.ix[gene, 'chrom'], tss_df.ix[gene, 'tss'] 53 | tsses = map(int, tsses.split(',')) 54 | chr_peaks = peaks_df.loc[peaks_df['chrom']==chrom] 55 | proxim_peaks, distal_peaks = [], [] 56 | for tss in tsses: 57 | peaks1 = chr_peaks.loc[abs(chr_peaks['center']-tss)<=2000].index.values 58 | peaks2 = chr_peaks.loc[abs(chr_peaks['center']-tss)<=int(options.distal)].index.values 59 | proxim_peaks.extend(peaks1) 60 | distal_peaks.extend(peaks2) 61 | proxim_peaks = list(set(proxim_peaks)) 62 | distal_peaks = list(set(distal_peaks)-set(proxim_peaks)) 63 | if len(proxim_peaks)==0: proxim_peaks = ['NONE'] 64 | if len(distal_peaks)==0: distal_peaks = ['NONE'] 65 | proxim_peaks = ';'.join(proxim_peaks) 66 | tss_df.ix[gene, 'proximal'] = proxim_peaks 67 | distal_peaks = ';'.join(distal_peaks) 68 | tss_df.ix[gene, 'distal'] = distal_peaks 69 | tss_df.to_csv(options.s+'/peak/genes_tss_peaks.csv', sep='\t') 70 | return 71 | # 72 | # 73 | def get_score_from_peaks(options): 74 | tss_df = pandas.read_csv(options.s+'/peak/genes_tss_peaks.csv', sep='\t', index_col=0) 75 | reads = scipy.sparse.csr_matrix(scipy.io.mmread(options.s+'/matrix/filtered_reads.mtx')).T 76 | cells_df = pandas.read_csv(options.s+'/matrix/filtered_cells.csv', sep='\t', index_col=0) 77 | all_peaks = numpy.arange(0, reads.shape[1]) 78 | genes, score = [], [] 79 | for igene,gene in enumerate(tss_df.index.values): 80 | distal = tss_df.loc[gene, 'distal'].split(';') 81 | proximal = tss_df.loc[gene, 'proximal'].split(';') 82 | if distal==['NONE']: 83 | distal = [] 84 | else: 85 | distal = list(map(int, distal)) 86 | if proximal==['NONE']: 87 | proximal = [] 88 | 
else: 89 | proximal = list(map(int, proximal)) 90 | distal = list(set(distal).union(set(proximal))) 91 | distal = list(set(distal).intersection(set(all_peaks))) 92 | if len(distal)>0: 93 | signal = reads[:, distal].A.mean(axis=1) 94 | genes.append(gene) 95 | score.append(signal) 96 | score = numpy.array(score) 97 | score_df = pandas.DataFrame(score, index=genes, columns=cells_df.index) 98 | score_per_cell = score.sum(axis=0) 99 | R_wave = [numpy.log(x*10000.0/score_per_cell[i]+1) for i,x in enumerate(score.T)] 100 | R_wave = numpy.array(R_wave) 101 | normal_df = pandas.DataFrame(R_wave.T, index=genes, columns=cells_df.index) 102 | normal_df.to_csv(options.s+'/matrix/genes_scored_by_peaks.csv', sep=',') 103 | return 104 | # 105 | # 106 | get_tss_region(options) 107 | get_tss_peaks(options) 108 | get_score_from_peaks(options) 109 | # 110 | # 111 | -------------------------------------------------------------------------------- /code_v1.0.6/prepare_mapping.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/python 2 | import warnings 3 | warnings.filterwarnings("ignore") 4 | import os 5 | import sys 6 | from optparse import OptionParser 7 | import subroutines 8 | from multiprocessing import Pool 9 | # 10 | # 11 | opts = OptionParser() 12 | usage = "Map sequencing data\nusage: %prog -s project --index bowtie2-index --picard picard.jar --tss TSS.txt --np 4" 13 | opts = OptionParser(usage=usage, version="%prog 1.0") 14 | opts.add_option("-s", help="The project folder") 15 | opts.add_option("--index", default="../reference/hg19", 16 | help="The reference file path in bowtie2/indexes folder, default=../reference/hg19") 17 | opts.add_option("--picard", default="../reference/picard.jar", 18 | help="The picard.jar file path, default=../reference/picard.jar") 19 | opts.add_option("--tss", default="../reference/hg19_refseq_genes_TSS.txt", 20 | help="The TSS file path, can be downloaded from RefSeq, default=../reference/hg19_refseq_genes_TSS.txt") 21 | opts.add_option("--np", default=1, help="Number of CPUs used for mapping, default=1") 22 | options, arguments = opts.parse_args() 23 | # 24 | # 25 | # 26 | def mapping(par): 27 | work_dir, cell, options, input1, input2, chr_list = par[0], par[1], par[2], par[3], par[4], par[5] 28 | sam = work_dir + cell + '.sam' 29 | bam = work_dir + cell + '.bam' 30 | log = work_dir + cell + '.map.log' 31 | sorted_bam = work_dir + cell + '.sorted.bam' 32 | filtered_bam = work_dir + cell + '.filtered.bam' 33 | marked_bam = work_dir + cell + '.marked.bam' 34 | removed_duplicate = work_dir + cell + '.dups.log' 35 | quality_state = work_dir + cell + '.stats.log' 36 | hist_log = work_dir + cell + '.hist.log' 37 | hist_pdf = work_dir + cell + '.hist.pdf' 38 | refSeqTSS = work_dir + cell + '.RefSeqTSS' 39 | # 40 | # os.popen('bowtie2 -X2000 -p ' + options.np + ' --rg-id ' + cell + ' -x ' + options.index 41 | # + ' -1 ' + input1 + ' -2 ' + input2 + ' -S ' + sam + ' 2> ' + log) 42 | os.popen('bowtie2 -X2000 -p 1 --rg-id ' + cell + ' -x ' + options.index 43 | + ' -1 ' + input1 + ' -2 ' + input2 + ' -S ' + sam + ' 2> ' + log) 44 | os.popen('samtools view -bS ' + sam + ' -o ' + bam) 45 | os.popen('rm ' + sam) 46 | os.popen('java -XX:+UseSerialGC -Xmx1g -jar ' + options.picard + ' SortSam SO=coordinate VALIDATION_STRINGENCY=SILENT I=' 47 | + bam + ' O=' + sorted_bam) 48 | os.popen('samtools index ' + sorted_bam) 49 | os.popen('samtools view -b -q 30 ' + sorted_bam + ' -o ' + filtered_bam + ' ' + chr_list) 50 | os.popen('java 
-XX:+UseSerialGC -Xmx1g -jar ' + options.picard + ' MarkDuplicates INPUT=' + filtered_bam +' OUTPUT=' 51 | + marked_bam + ' METRICS_FILE=' + removed_duplicate 52 | + ' REMOVE_DUPLICATES=true VALIDATION_STRINGENCY=SILENT') 53 | os.popen('samtools index ' + marked_bam) 54 | os.popen('echo -e "Chromosome\tLength\tProperPairs\tBadPairs:Raw" >> ' + quality_state) 55 | os.popen('samtools idxstats ' + sorted_bam + ' >> ' + quality_state) 56 | os.popen('java -XX:+UseSerialGC -Xmx1g -jar ' + options.picard + ' CollectInsertSizeMetrics VALIDATION_STRINGENCY=SILENT I=' 57 | + marked_bam + ' O=' + hist_log + ' H=' + hist_pdf + ' W=1000') 58 | # subroutines.draw_TSS_insert(options.tss, marked_bam, refSeqTSS) 59 | return 60 | # 61 | # 62 | chr_list = 'chr1 chr2 chr3 chr4 chr5 chr6 chr7 chr8 chr9 chr10 chr11 chr12 chr13 chr14 chr15 chr16 chr17 chr18 chr19 chr20 chr21 chr22 chrX chrY' 63 | # 64 | fastq_list = [x for x in os.listdir(options.s+'/data') if x[-11:]=='.trim.fastq'] 65 | fastq_list.sort() 66 | cell_list = [x.split('_')[0] for x in fastq_list] 67 | cell_list = list(set(cell_list)) 68 | cell_list.sort() 69 | if not os.path.exists(options.s+'/work'): os.popen('mkdir ' + options.s + '/work') 70 | # 71 | parameters = [] 72 | for cell in cell_list: 73 | work_dir = options.s + '/work/' + cell + '/' 74 | if os.path.exists(work_dir): os.popen('rm -rf ' + work_dir) 75 | os.popen('mkdir ' + work_dir) 76 | input1, input2 = options.s+'/data/'+cell+'_1.trim.fastq', options.s+'/data/'+cell+'_2.trim.fastq' 77 | par = [work_dir, cell, options, input1, input2, chr_list] 78 | parameters.append(par) 79 | # mapping(par) 80 | # 81 | pool = Pool(int(options.np)) 82 | pool.map(mapping, parameters) 83 | pool.close() 84 | pool.join() 85 | # 86 | # 87 | for cell in cell_list: 88 | work_dir = options.s + '/work/' + cell + '/' 89 | marked_bam = work_dir + cell + '.marked.bam' 90 | refSeqTSS = work_dir + cell + '.RefSeqTSS' 91 | subroutines.draw_TSS_insert(options.tss, marked_bam, refSeqTSS) 92 | # 93 | # 94 | -------------------------------------------------------------------------------- /code_v1.0.6/prepare_peakCalling.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/python 2 | # 3 | import warnings 4 | warnings.filterwarnings("ignore") 5 | # 6 | import os 7 | import numpy 8 | import sys 9 | from optparse import OptionParser 10 | import subroutines 11 | # 12 | # 13 | opts = OptionParser() 14 | usage = "Call peaks\nusage: %prog -s project --blist blacklist.bed --fa genome_chr.fa --tss tssFile --logq 3" 15 | opts = OptionParser(usage=usage, version="%prog 1.0") 16 | opts.add_option("-s", help="The project folder.") 17 | opts.add_option("--picard", default="../reference/picard.jar", 18 | help="The picard.jar file path, default=../reference/picard.jar") 19 | opts.add_option("--blist", default='../reference/hg19_blacklist.JDB.bed', 20 | help="Blacklist.bed, default=../reference/hg19_blacklist.JDB.bed") 21 | opts.add_option("--fa", default='../reference/hg19_chr.fa', 22 | help="Genome_chr.fa, default=../reference/hg19_chr.fa") 23 | opts.add_option('--tss', default='../reference/hg19_refseq_genes_TSS.txt', 24 | help='TSS file, default=../reference/hg19_refseq_genes_TSS.txt') 25 | opts.add_option('--ref', default='hg19', help='Name of genome reference, default=hg19') 26 | opts.add_option('--logq', default='3', 27 | help='Threshold of -log(p-value) for top peaks, default=3.') 28 | options, arguments = opts.parse_args() 29 | # 30 | workspace_folder = options.s + '/work/' 31 | 
peak_folder = options.s + '/peak/' 32 | genome_fasta = options.fa 33 | tssFile = options.tss 34 | os.popen('mkdir ' + peak_folder) 35 | # 36 | # 37 | print('!!!!!! merge all marked bam files !!!!!!') 38 | bam_folder = [x for x in os.listdir(workspace_folder)] 39 | bam_folder.sort() 40 | print('cells number:', len(bam_folder)) 41 | marked_bam = [] 42 | #merged_raw = peak_folder + 'mergeAll.raw.bam' 43 | merged_bam = peak_folder + 'mergeAll.bam' 44 | for folder in bam_folder: 45 | path = workspace_folder + folder + '/' 46 | if len(folder.split('.'))<=1: 47 | marked_bam.extend([path + x for x in os.listdir(path) if x[-10:]=='marked.bam']) 48 | if len(marked_bam)<=1000: 49 | marked_bam = ' '.join(marked_bam) 50 | os.popen('samtools merge -f ' + merged_bam + ' ' + marked_bam) 51 | else: 52 | n_batch = len(marked_bam)//1000 + 1 53 | temps = [] 54 | for i_batch in range(0, n_batch): 55 | temp_bam = peak_folder+'temp_'+str(i_batch)+'.bam' 56 | temps.append(temp_bam) 57 | start, end = i_batch*1000, min((i_batch+1)*1000, len(marked_bam)) 58 | marked = ' '.join(marked_bam[start:end]) 59 | os.popen('samtools merge -f ' + temp_bam + ' ' + marked) 60 | os.popen('samtools index ' + temp_bam) 61 | all_temp = ' '.join(temps) 62 | os.popen('samtools merge -f ' + merged_bam + ' ' + all_temp) 63 | # 64 | os.popen('samtools index ' + merged_bam) 65 | print('!!!!!! merge done !!!!!!') 66 | # 67 | hist_log = peak_folder + 'mergeAll.hist.log' 68 | hist_pdf = peak_folder + 'mergeAll.hist.pdf' 69 | os.popen('java -XX:+UseSerialGC -Xmx1g -jar '+options.picard+' CollectInsertSizeMetrics VALIDATION_STRINGENCY=SILENT I=' 70 | + merged_bam + ' O=' + hist_log + ' H=' + hist_pdf + ' W=1000') 71 | # 72 | refSeqTSS = peak_folder + 'mergeAll.RefSeqTSS' 73 | subroutines.draw_TSS_insert(tssFile, merged_bam, refSeqTSS) 74 | # 75 | print('!!!!!! call peak by macs2 !!!!!!') 76 | peak_file = peak_folder + 'peaks' 77 | os.popen('macs2 callpeak --nomodel -t ' + merged_bam + ' -n ' 78 | + peak_file + ' --nolambda --keep-dup all --call-summits') 79 | print('!!!!!! call peak done !!!!!!') 80 | # 81 | summit = peak_folder + 'peaks_summits.bed' 82 | filtered_peak = peak_folder + 'filtered_peaks.bed' 83 | if options.blist: 84 | print('!!!!!! filter peaks !!!!!!') 85 | os.popen('bedtools intersect -v -a ' + summit + ' -b ' + options.blist 86 | + " | sort -k5 -nr > " + filtered_peak) 87 | print('!!!!!! filter peaks done !!!!!!') 88 | else: 89 | os.popen('sort -k5 -nr ' + summit + ' > ' + filtered_peak) 90 | 91 | print('!!!!!! get top N peaks by q-value !!!!!!') 92 | fold_rank = numpy.loadtxt(filtered_peak, 'str', delimiter='\t') 93 | fold_rank[:, 1] = numpy.array(map(int, fold_rank[:, 1])) - 249 # 250 94 | fold_rank[:, 2] = numpy.array(map(int, fold_rank[:, 2])) + 250 95 | toppeaks = peak_folder + 'temp01.bed' 96 | top_peaks = peak_folder + 'top_peaks.bed' 97 | with open(toppeaks, 'w') as output: 98 | for peak in fold_rank: 99 | if float(peak[-1])>=float(options.logq): 100 | # print >> output, peak[0]+'\t'+peak[1]+'\t'+peak[2] 101 | output.write(peak[0]+'\t'+peak[1]+'\t'+peak[2]+'\n') 102 | os.popen('bedtools sort -i ' + toppeaks + ' > ' + top_peaks) 103 | print('!!!!!! get top peaks done !!!!!!') 104 | # 105 | # 106 | print('!!!!!! 
get transposase bias by GC content !!!!!!') 107 | trans_bias = peak_folder + 'transposase_bias.bed' 108 | temp02_file = peak_folder + 'temp02.bed' 109 | temp03_file = peak_folder + 'temp03.bed' 110 | with open(top_peaks) as annotate_file, open(temp02_file, 'w') as temp02: 111 | for i, line in enumerate(annotate_file): 112 | words = line.split('\t') 113 | leave = words[0:3] 114 | # print >> temp02, '\t'.join(leave) 115 | temp02.write('\t'.join(leave)+'\n') 116 | # 117 | # use GC contents to estimate the transposase bias 118 | os.popen('bedtools nuc -fi ' + genome_fasta + ' -bed ' + temp02_file + ' > ' + temp03_file) 119 | with open(temp03_file) as temp03, open(trans_bias, 'w') as bias: 120 | for i, line in enumerate(temp03): 121 | if i>0: 122 | words = line.split('\t') 123 | leave = words[0:3] + [words[4]] 124 | # print >> bias, '\t'.join(leave) 125 | bias.write('\t'.join(leave)+'\n') 126 | print('!!!!!! get bias done !!!!!!') 127 | # 128 | # 129 | -------------------------------------------------------------------------------- /code_v1.0.6/prepare_premappedMatrix.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/python 2 | import warnings 3 | warnings.filterwarnings("ignore") 4 | import os 5 | import numpy 6 | import sys 7 | from optparse import OptionParser 8 | import subroutines 9 | # 10 | # 11 | opts = OptionParser() 12 | usage = "Organize premapped data\nusage: %prog -s project --ref ref --fa chr.fa --bg bg.txt --meme motifs.meme --np 4" 13 | opts = OptionParser(usage=usage, version="%prog 1.0.6") 14 | opts.add_option("-s", help="The project folder") 15 | opts.add_option("--motif", default='yes', help="=yes if users want to calculate motif bias, default=yes. " 16 | +"It's necessary for cell clustering by motifs (chromVAR) and searching for differential motifs.") 17 | #opts.add_option("--gene", default='no', help="=yes if users want to estimate gene score, default=no. " 18 | # +"It's necessary for searching for differential genes.") 19 | #opts.add_option("--ref", default='hg19', help="Genome reference, default=hg19") 20 | opts.add_option("--fa", default='../reference/hg19_chr.fa', help="Genome fasta file, default=../reference/hg19_chr.fa") 21 | opts.add_option("--bg", default='../reference/tier1_markov1.norc.txt', 22 | help="Background file, default=../reference/tier1_markov1.norc.txt") 23 | opts.add_option("--pvalue", default=0.00005, help="P-value threshold for FIMO, default=0.00005") 24 | opts.add_option("--meme", default='../reference/JASPAR2018_CORE_vertebrates_redundant_pfms_meme.txt', 25 | help="Motif file, default=../reference/JASPAR2018_CORE_vertebrates_redundant_pfms_meme.txt") 26 | #opts.add_option("--distal", default=20000, 27 | # help="distal region around TSS for peak searching, default=20000") 28 | opts.add_option("--np", default=1, help="Number of CPU cores to use, default=1") 29 | options, arguments = opts.parse_args() 30 | # 31 | # 32 | #os.popen('cp '+options.s+'/data/cell_info.csv '+options.s+'/matrix/filtered_cells.csv') 33 | #os.popen('cp '+options.s+'/peak/top_peaks.bed '+options.s+'/peak/top_filtered_peaks.bed') 34 | # 35 | if options.motif=='yes': 36 | top_peaks = options.s + '/peak/top_filtered_peaks.bed' 37 | print('!!!!!! 
get transposase bias by GC content !!!!!!') 38 | trans_bias = options.s + '/peak/transposase_bias_filtered.bed' 39 | temp02_file = options.s + '/peak/temp02.bed' 40 | temp03_file = options.s + '/peak/temp03.bed' 41 | with open(top_peaks) as annotate_file, open(temp02_file, 'w') as temp02: 42 | for i, line in enumerate(annotate_file): 43 | words = line.split('\t') 44 | leave = words[0:3] 45 | # print >> temp02, '\t'.join(leave) 46 | temp02.write('\t'.join(leave)+'\n') 47 | os.popen('bedtools nuc -fi ' + options.fa + ' -bed ' + temp02_file + ' > ' + temp03_file) 48 | with open(temp03_file) as temp03, open(trans_bias, 'w') as bias: 49 | for i, line in enumerate(temp03): 50 | if i>0: 51 | words = line.split('\t') 52 | leave = words[0:3] + [words[4]] 53 | # print >> bias, '\t'.join(leave) 54 | bias.write('\t'.join(leave)+'\n') 55 | print('!!!!!! get bias done !!!!!!') 56 | # 57 | motif_folder = options.s + '/matrix/motif' 58 | peaks_file = options.s + '/peak/top_filtered_peaks.bed' 59 | if os.path.exists(motif_folder): os.popen('rm -rf ' + motif_folder) 60 | os.popen('mkdir ' + motif_folder) 61 | # run FIMO for motif-site searching 62 | motifFasta = options.s + '/matrix/motif.fasta' 63 | os.popen('bedtools getfasta -fi ' + options.fa + ' -bed ' + peaks_file + ' -fo ' + motifFasta) 64 | subroutines.batch_fimo(options.bg, options.pvalue, options.meme, motifFasta, motif_folder, int(options.np)) 65 | # motif annotation 66 | TFmatrix_file = options.s + '/matrix/motif_filtered.csv' 67 | subroutines.score_peaks(peaks_file, motif_folder, TFmatrix_file) 68 | # 69 | # 70 | #if options.gene=='yes': 71 | # gtf = '../reference/hg19_RefSeq_genes.gtf' 72 | # if options.ref=='mm10': gtf = '../reference/mm10_RefSeq_genes.gtf' 73 | # os.popen('python prepare_geneScore.py -s '+options.s+' --gtf '+gtf+' --distal '+str(options.distal)) 74 | # 75 | # 76 | # 77 | -------------------------------------------------------------------------------- /code_v1.0.6/prepare_qualityControl.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/python 2 | import warnings 3 | warnings.filterwarnings("ignore") 4 | import os,numpy,scipy.sparse,scipy.io,pandas,sys 5 | import matplotlib 6 | matplotlib.use('Agg') 7 | import matplotlib.pyplot as plt 8 | from optparse import OptionParser 9 | # 10 | # 11 | opts = OptionParser() 12 | usage = "Build QC table\nusage: %prog -s project --pfrag 0.2 --lib 2000" 13 | opts = OptionParser(usage=usage, version="%prog 1.0.6") 14 | opts.add_option("-s", help="The project folder.") 15 | opts.add_option("--pfrag", default=0.2, help="Threshold for percentage of fragments in peaks, " 16 | +"default=0.2, decrease it for more filtered-samples, increase it for better quality") 17 | opts.add_option("--lib", default=2000, help="Threshold for fragment number, default=2000, " 18 | +"decrease it for more filtered-samples, increase it for better quality") 19 | options, arguments = opts.parse_args() 20 | # 21 | if not os.path.exists(options.s+'/figure'): os.popen('mkdir ' + options.s+'/figure') 22 | frag_thresh = float(options.pfrag) 23 | lib_size_thresh = int(options.lib) 24 | # 25 | reads = scipy.sparse.csr_matrix(scipy.io.mmread(options.s+'/matrix/reads.mtx')).T 26 | cell_info = pandas.read_csv(options.s+'/matrix/cell_info.merged.csv', sep='\t', index_col=0) 27 | libSize = cell_info['final_reads'].values 28 | readsInPeaks = reads.sum(axis=1).A[:,0].astype(float)/cell_info['final_reads'].values 29 | cell_quality = numpy.vstack((libSize, readsInPeaks)) 30 | 
cell_quality_df = pandas.DataFrame(cell_quality.T, index=cell_info.index.values, columns=['lib_size', 'frag_in_peak']) 31 | cell_quality_df.to_csv(options.s+'/matrix/cell_quality.csv', sep='\t') 32 | # 33 | plt.scatter(libSize+1, readsInPeaks, s=10) 34 | plt.vlines(lib_size_thresh, -0.05, 1.05, linestyles='dashed') 35 | plt.hlines(frag_thresh, 1e2, 1e7, linestyles='dashed') 36 | plt.xscale('log') 37 | plt.xlabel('final mapped reads') 38 | plt.ylabel('fraction of fragments in peaks') 39 | plt.xlim(1e2, 1e7) 40 | plt.ylim(-0.05, 1.05) 41 | plt.savefig(options.s + '/figure/cell_quality.pdf') 42 | # 43 | filtered_quality_df = cell_quality_df.loc[cell_quality_df['lib_size']>lib_size_thresh] 44 | filtered_quality_df = filtered_quality_df.loc[filtered_quality_df['frag_in_peak']>frag_thresh] 45 | filtered_cell_names = filtered_quality_df.index.values 46 | filtered_cell_index = [list(cell_quality_df.index.values).index(x) for x in filtered_cell_names] 47 | filtered_cells_df = cell_info.loc[filtered_cell_names, 'notes'] 48 | filtered_cells_df = pandas.DataFrame(filtered_cells_df.values, index=filtered_cell_names, columns=['notes']) 49 | filtered_cells_df.to_csv(options.s+'/matrix/filtered_cells.csv', sep='\t') 50 | # 51 | nonzero_per_peak = numpy.array([len(numpy.where(x.A>0)[0]) for x in reads.T]) 52 | filtered_peak_index = numpy.where(nonzero_per_peak>=3)[0] 53 | with open(options.s+'/peak/top_peaks.bed') as in_file, \ 54 | open(options.s+'/peak/top_filtered_peaks.bed', 'w') as out_file: 55 | for iline,line in enumerate(in_file): 56 | if iline in filtered_peak_index: 57 | # print >> out_file, line[:-1] 58 | out_file.write(line) 59 | # 60 | motif_df = pandas.read_csv(options.s+'/matrix/motif_TF.csv', sep=',', index_col=0) 61 | motif_df = motif_df.iloc[filtered_peak_index, :] 62 | bias = open(options.s+'/peak/transposase_bias.bed').readlines() 63 | with open(options.s+'/peak/transposase_bias_filtered.bed', 'w') as output: 64 | for ipeak in filtered_peak_index: 65 | # print >> output, bias[ipeak][:-1] 66 | output.write(bias[ipeak]) 67 | motif_df.to_csv(options.s+'/matrix/motif_filtered.csv', sep=',') 68 | # 69 | reads = reads[filtered_cell_index, :] 70 | reads = reads[:, filtered_peak_index] 71 | scipy.io.mmwrite(options.s+'/matrix/filtered_reads.mtx', scipy.sparse.coo_matrix(reads.T)) 72 | # 73 | print(reads.shape, motif_df.shape) 74 | # 75 | # 76 | -------------------------------------------------------------------------------- /code_v1.0.6/prepare_trimming.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/python 2 | import warnings 3 | warnings.filterwarnings("ignore") 4 | # 5 | import sys 6 | import os 7 | from optparse import OptionParser 8 | import Levenshtein 9 | from multiprocessing import Pool 10 | # 11 | # 12 | opts = OptionParser() 13 | usage = "Trim adapter\nusage: %prog -s project --np 4" 14 | opts = OptionParser(usage=usage, version="%prog 1.0") 15 | opts.add_option("-s", help="The project path, which should contain a data folder where the paired-end sequencing data are located. " 16 | +"If you want to use this code to build the cell_info.csv file, each fastq file should be named as:" 17 | +"type1-001_1.fastq, type1-001_2.fastq, type1-002_1.fastq, type1-002_2.fastq, ... ;" 18 | +"type2-001_1.fastq, type2-001_2.fastq, type2-002_1.fastq, type2-002_2.fastq, ... ; etc. 
" 19 | +"{type1, type2, ..., typeN} can be cell-types for your samples, such as {GM, K562, ...}, " 20 | +"or you can just use any name you want, but make sure there is no underline(_) or dashline(-) in typeX.") 21 | opts.add_option("--qlen", default=20, help="Query length for adatper trimming, default=20.") 22 | opts.add_option("--aseq", default="CTGTCTCTTATACACATCTGACGCTGCCGACGA", help="Adapter sequence, " 23 | +"default=CTGTCTCTTATACACATCTGACGCTGCCGACGA.") 24 | opts.add_option("--np", default=1, help="Number of CPUs used for trimming in parallel, default=1.") 25 | options, arguments = opts.parse_args() 26 | # 27 | # 28 | global query_length, adatper_seq 29 | query_length = options.qlen 30 | adapter_seq = options.aseq 31 | # 32 | # 33 | def mismatch_align(seq1, query_length, read2_rc): 34 | for s1 in range(len(seq1)-query_length+1, -1, -1): 35 | temp_read1 = seq1[s1:(s1+query_length)] 36 | editdist = Levenshtein.distance(temp_read1, read2_rc) 37 | if editdist<2: 38 | return s1 39 | return -1 40 | # 41 | # 42 | def rev_comp_dna(read2_rc): 43 | temp_read2 = '' 44 | for i in range(len(read2_rc)-1, -1, -1): 45 | if (read2_rc[i]=='A') | (read2_rc[i]=='a') : 46 | temp_read2 += 'T' 47 | elif (read2_rc[i]=='C') | (read2_rc[i]=='c') : 48 | temp_read2 += 'G' 49 | elif (read2_rc[i]=='G') | (read2_rc[i]=='g') : 50 | temp_read2 += 'C' 51 | elif (read2_rc[i]=='T') | (read2_rc[i]=='t') : 52 | temp_read2 += 'A' 53 | elif read2_rc[i]=='N': 54 | temp_read2 += 'N' 55 | else: 56 | return 'error' 57 | return temp_read2 58 | # 59 | # 60 | def trim_adapters(fastq): 61 | cutoff = 50 62 | fastq1, fastq2 = fastq + '_1.fastq', fastq + '_2.fastq' 63 | trimed1, trimed2 = fastq + '_1.trim.fastq', fastq + '_2.trim.fastq' 64 | with open(fastq1) as fa1, open(fastq2) as fa2, open(trimed1, 'w') as out1, open(trimed2, 'w') as out2 : 65 | nReads, mm0_num_read, mm1_num_read = 0, 0, 0 66 | while 1: 67 | seq_header1, seq_header2 = fa1.readline()[:-1], fa2.readline()[:-1] 68 | seq1, seq2 = fa1.readline()[:-1], fa2.readline()[:-1] 69 | qual_header1, qual_header2 = fa1.readline()[:-1], fa2.readline()[:-1] 70 | qual1, qual2 = fa1.readline()[:-1], fa2.readline()[:-1] 71 | nReads += 1 72 | if ((not seq_header1) | (not seq_header2) | (not seq1) | (not seq2) | 73 | (not qual_header1) | (not qual_header2) | (not qual1) | (not qual2)): break 74 | read2_rc = seq2[:query_length] 75 | read2_rc = rev_comp_dna(read2_rc) 76 | s1_pos = -1 77 | s1_pos_find = seq1.rfind(read2_rc) 78 | if s1_pos_find > 0 : 79 | s1_pos = s1_pos_find 80 | mm0_num_read += 1 81 | else: 82 | s1_pos = mismatch_align(seq1, query_length, read2_rc) 83 | if s1_pos>0: mm1_num_read += 1 84 | if s1_pos >= 0 : 85 | seq_len = s1_pos + query_length 86 | trim_seq1 = seq1[seq_len:] 87 | adapter_trim_seq = adapter_seq[:len(trim_seq1)] 88 | if adapter_trim_seq==trim_seq1: 89 | seq1 = seq1[:seq_len] 90 | seq2 = seq2[:seq_len] 91 | qual1 = qual1[:seq_len] 92 | qual2 = qual2[:seq_len] 93 | # print >> out1, seq_header1 94 | # print >> out1, seq1[:cutoff] 95 | # print >> out1, qual_header1 96 | # print >> out1, qual1[:cutoff] 97 | # print >> out2, seq_header2 98 | # print >> out2, seq2[:cutoff] 99 | # print >> out2, qual_header2 100 | # print >> out2, qual2[:cutoff] 101 | out1.write(seq_header1+'\n') 102 | out1.write(seq1[:cutoff]+'\n') 103 | out1.write(qual_header1+'\n') 104 | out1.write(qual1[:cutoff]+'\n') 105 | out2.write(seq_header2+'\n') 106 | out2.write(seq2[:cutoff]+'\n') 107 | out2.write(qual_header2+'\n') 108 | out2.write(qual2[:cutoff]+'\n') 109 | return nReads, mm0_num_read, 
mm1_num_read 110 | # 111 | # 112 | fastqs = [x.split('_')[0] for x in os.listdir(options.s+'/data/') if (x[-6:]=='.fastq')&(x[-11:]!='.trim.fastq')] 113 | fastqs = list(set(fastqs)) 114 | fastqs.sort() 115 | pathes = [options.s+'/data/'+x for x in fastqs] 116 | pool = Pool(int(options.np)) 117 | read_info = pool.map(trim_adapters, pathes) 118 | pool.close() 119 | pool.join() 120 | # 121 | with open(options.s+'/data/cell_info.csv', 'w') as output: 122 | # print >> output, 'name\tnotes' 123 | output.write('name\tnotes\n') 124 | for fastq in fastqs: 125 | # print >> output, fastq + '\t' + '-'.join(fastq.split('-')[:-1]) 126 | output.write(fastq + '\t' + '-'.join(fastq.split('-')[:-1]) + '\n') 127 | # 128 | # 129 | # 130 | # 131 | -------------------------------------------------------------------------------- /code_v1.1.0/APEC_prepare_steps.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/bash 2 | # 3 | #### input parameters defined by users ############################################# 4 | # 5 | ARGS=`getopt -o hr:s:g:n:l:p:f: -l help,raw:,project:,genome:,np:,logq:,pfrag:,frag: -- "$@"` 6 | if [ $? != 0 ] ; then echo "Terminating..." >&2 ; exit 1 ; fi 7 | eval set -- "$ARGS" 8 | while true ; do 9 | case "$1" in 10 | -h|--help) 11 | echo " 12 | bash APEC_prepare_steps.sh -r raw_data -s project -g genome_index -n nCPUs -l logq -p pfrag -f frag 13 | -r/--raw: The raw_data path, where the paired-end sequencing data are located. 14 | -s/--project: The project path. 15 | -g/--genome: hg19 or mm10. 16 | -n/--np: Number of CPU cores. 17 | -l/--logq: Threshold for the -log(Q-value) of peaks, used to filter peaks. 18 | -p/--pfrag: Threshold of the percentage of fragments in peaks, used to filter cells. 19 | -f/--frag: Threshold of the fragment number of each cell, used to filter cells." 
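# A typical invocation might look like this (hypothetical paths; the -l/-p/-f
# values mirror the defaults of the downstream python scripts):
#   bash APEC_prepare_steps.sh -r ./raw_data -s ./project01 -g hg19 -n 4 -l 3 -p 0.2 -f 2000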
20 | exit 1 ;; 21 | -r|--raw) raw_data="$2" ; shift 2;; 22 | -s|--project) project="$2" ; shift 2;; 23 | -g|--genome) genome="$2" ; shift 2;; 24 | -n|--np) np="$2" ; shift 2;; 25 | -l|--logq) logq="$2" ; shift 2;; 26 | -p|--pfrag) pfrag="$2" ; shift 2;; 27 | -f|--frag) frag="$2" ; shift 2;; 28 | --) shift; break ;; 29 | *) echo "unknown parameter: {$1}" ; exit 1 ;; 30 | esac 31 | done 32 | # 33 | picard=../reference/picard.jar 34 | ref=$genome 35 | fa="../reference/"$genome"_chr.fa" 36 | index="../reference/"$genome 37 | tss="../reference/"$genome"_refseq_genes_TSS.txt" 38 | if [[ $genome == "hg19" ]]; then 39 | blist=../reference/hg19_blacklist.JDB.bed 40 | elif [[ $genome == "mm10" ]]; then 41 | blist=../reference/mm10_blacklist.BIN.bed 42 | fi 43 | gtf="../reference/"$genome"_RefSeq_genes.gtf" 44 | # 45 | # 46 | # 47 | #### processes to prepare raw data ########### 48 | # 49 | python prepare_trimming.py -r $raw_data -s $project --np $np 50 | # 51 | python prepare_mapping.py -s $project --index $index --picard $picard --tss $tss --np $np 52 | # 53 | python prepare_peakCalling.py -s $project --blist $blist --fa $fa --tss $tss --ref $ref --logq $logq 54 | # 55 | python prepare_countMatrix.py -s $project --fa $fa --np $np 56 | # 57 | python prepare_qualityControl.py -s $project --pfrag $pfrag --lib $frag 58 | # 59 | # 60 | # 61 | # 62 | -------------------------------------------------------------------------------- /code_v1.1.0/generate_UCSCtrack.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/python 2 | import warnings 3 | warnings.filterwarnings("ignore") 4 | # 5 | import os,numpy,pandas,subprocess 6 | from optparse import OptionParser 7 | # 8 | # 9 | opts = OptionParser() 10 | usage = "Merge counts of cells to track files\nusage: %prog -s project --cfile cluster.csv --gsize chrom.sizes" 11 | opts = OptionParser(usage=usage, version="%prog 1.0") 12 | opts.add_option("-s", help="The project folder.") 13 | opts.add_option("--cfile", help="cluster.csv file, e.g. 
louvain_cluster_by_APEC.csv in folder") 14 | opts.add_option("--gsize", default='../reference/hg19.chrom.sizes', help="chrom.size files, default=../reference/hg19.chrom.sizes") 15 | options, arguments = opts.parse_args() 16 | # 17 | # 18 | def merge_bam(options): 19 | bam_folder = [x for x in os.listdir(options.s+'/work')] 20 | bam_folder.sort() 21 | cell_df = pandas.read_csv(options.cfile, sep='\t', index_col=0) 22 | if 'notes' in cell_df.columns.values: cell_df['cluster'] = cell_df['notes'] 23 | cell_types = cell_df['cluster'].values 24 | cell_types = list(set(cell_types)) 25 | for cell_type in cell_types: 26 | marked_bam, select = [], [] 27 | for folder in bam_folder: 28 | path = options.s + '/work/' + folder + '/' 29 | if folder in cell_df.index.values: 30 | if cell_df.ix[folder, 'cluster']==cell_type: 31 | select.append(folder) 32 | marked_bam.extend([path + x for x in os.listdir(path) if x[-10:]=='marked.bam']) 33 | marked_bam = ' '.join(marked_bam) 34 | merged_bam = options.s + '/result/track/' + str(cell_type) + '.bam' 35 | subprocess.check_call('samtools merge -f ' + merged_bam + ' ' + marked_bam, shell=True) 36 | subprocess.check_call('samtools index ' + merged_bam, shell=True) 37 | return 38 | # 39 | # 40 | def bam2bw(options): 41 | cell_df = pandas.read_csv(options.cfile, sep='\t', index_col=0) 42 | if 'notes' in cell_df.columns.values: cell_df['cluster'] = cell_df['notes'] 43 | cell_types = cell_df['cluster'].values 44 | cell_types = list(set(cell_types)) 45 | for cell_type in cell_types: 46 | name = options.s+'/result/track/'+str(cell_type) 47 | cells = cell_df.loc[cell_df['cluster']==cell_type] 48 | cells = cells.index.values 49 | subprocess.check_call('bedtools genomecov -bg -ibam '+name+'.bam -g '+options.gsize+' > '+name+'.bedgraph', shell=True) 50 | subprocess.check_call('bedtools sort -i '+name+'.bedgraph > '+name+'.sorted.bedgraph', shell=True) 51 | counts = numpy.array([int(x.split()[3]) for x in open(name+'.sorted.bedgraph').readlines()]) 52 | total = counts.sum() 53 | with open(name+'.sorted.bedgraph') as infile, open(name+'.norm.bedgraph', 'w') as outfile: 54 | for line in infile: 55 | words = line.split() 56 | words[3] = str(round(float(words[3]) * 100.0 / len(cells))) 57 | outfile.write('\t'.join(words)+'\n') 58 | subprocess.check_call('bedGraphToBigWig '+name+'.norm.bedgraph '+options.gsize+' '+name+'.bw', shell=True) 59 | return 60 | # 61 | # 62 | # 63 | if not os.path.exists(options.s+'/result'): subprocess.check_call('mkdir '+options.s+'/result', shell=True) 64 | if not os.path.exists(options.s+'/result/track'): subprocess.check_call('mkdir '+options.s+'/result/track', shell=True) 65 | merge_bam(options) 66 | bam2bw(options) 67 | # 68 | # 69 | # 70 | -------------------------------------------------------------------------------- /code_v1.1.0/prepare_countMatrix.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/python 2 | import warnings 3 | warnings.filterwarnings("ignore") 4 | import os,pandas,subprocess 5 | from optparse import OptionParser 6 | import subroutines 7 | import scipy.io,scipy.sparse 8 | # 9 | opts = OptionParser() 10 | usage = "Align reads to build matrix\nusage: %prog -s project --fa chr.fa --bg bg --meme motif.meme --np 4" 11 | opts = OptionParser(usage=usage, version="%prog 1.0") 12 | opts.add_option("-s", help="The project folder.") 13 | opts.add_option("--fa", default='../reference/hg19_chr.fa', help="Genome fasta file, default=../reference/hg19_chr.fa") 14 | opts.add_option("--bg", 
default='../reference/tier1_markov1.norc.txt', 15 | help="Background file, default=../reference/tier1_markov1.norc.txt") 16 | opts.add_option("--pvalue", default=0.00005, help="P-value threshold for FIMO, default=0.00005") 17 | opts.add_option("--meme", default='../reference/JASPAR2018_CORE_vertebrates_redundant_pfms_meme.txt', 18 | help="Motif file, default=../reference/JASPAR2018_CORE_vertebrates_redundant_pfms_meme.txt") 19 | opts.add_option("--np", default=1, help="Number of CPU cores used for motif searching, default=1") 20 | options, arguments = opts.parse_args() 21 | # 22 | matrix_folder = options.s + '/matrix/' 23 | peaks_folder = options.s + '/peak/' 24 | work_folder = options.s + '/work/' 25 | motif_folder = matrix_folder + '/motif' 26 | peaks_file = peaks_folder + '/top_peaks.bed' 27 | cell_info = options.s + '/data/cell_info.csv' 28 | motifFasta = matrix_folder + '/motif.fasta' 29 | TFmatrix_file = matrix_folder + '/motif_TF.csv' 30 | bam_file = peaks_folder + "/mergeAll.bam" 31 | reads_matrix = matrix_folder + "/reads.mtx" 32 | # 33 | if not os.path.exists(matrix_folder): subprocess.check_call('mkdir ' + matrix_folder, shell=True) 34 | # 35 | #### count reads for peaks 36 | cell_info_df = pandas.read_csv(cell_info, sep='\t', index_col=0) 37 | matrix = subroutines.counts_per_peak(bam_file, peaks_file, reads_matrix, cell_info_df) 38 | matrix = scipy.sparse.coo_matrix(matrix.T) 39 | scipy.io.mmwrite(reads_matrix, matrix) 40 | # 41 | # 42 | subroutines.QC_table(cell_info, work_folder, matrix_folder) 43 | # 44 | # 45 | -------------------------------------------------------------------------------- /code_v1.1.0/prepare_mapping.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/python 2 | import warnings 3 | warnings.filterwarnings("ignore") 4 | import os,sys,subprocess 5 | from optparse import OptionParser 6 | import subroutines 7 | from multiprocessing import Pool 8 | # 9 | # 10 | opts = OptionParser() 11 | usage = "Map sequencing data\nusage: %prog -s project --index bowtie2-index --picard picard.jar --tss TSS.txt --np 4" 12 | opts = OptionParser(usage=usage, version="%prog 1.0") 13 | opts.add_option("-s", help="The project folder") 14 | opts.add_option("--index", default="../reference/hg19", 15 | help="The reference file path in bowtie2/indexes folder, default=../reference/hg19") 16 | opts.add_option("--picard", default="../reference/picard.jar", 17 | help="The picard.jar file path, default=../reference/picard.jar") 18 | opts.add_option("--tss", default="../reference/hg19_refseq_genes_TSS.txt", 19 | help="The TSS file path, can be downloaded from RefSeq, default=../reference/hg19_refseq_genes_TSS.txt") 20 | opts.add_option("--np", default=1, help="Number of CPUs used for mapping, default=1") 21 | options, arguments = opts.parse_args() 22 | # 23 | # 24 | # 25 | def mapping(par): 26 | work_dir, cell, options, input1, input2, chr_list = par[0], par[1], par[2], par[3], par[4], par[5] 27 | sam = work_dir + cell + '.sam' 28 | bam = work_dir + cell + '.bam' 29 | log = work_dir + cell + '.map.log' 30 | sorted_bam = work_dir + cell + '.sorted.bam' 31 | filtered_bam = work_dir + cell + '.filtered.bam' 32 | marked_bam = work_dir + cell + '.marked.bam' 33 | removed_duplicate = work_dir + cell + '.dups.log' 34 | quality_state = work_dir + cell + '.stats.log' 35 | hist_log = work_dir + cell + '.hist.log' 36 | hist_pdf = work_dir + cell + '.hist.pdf' 37 | refSeqTSS = work_dir + cell + '.RefSeqTSS' 38 | # 39 | # subprocess.check_call('bowtie2 
-X2000 -p ' + options.np + ' --rg-id ' + cell + ' -x ' + options.index 40 | # + ' -1 ' + input1 + ' -2 ' + input2 + ' -S ' + sam + ' 2> ' + log, shell=True) 41 | subprocess.check_call('bowtie2 -X2000 -p 1 --rg-id ' + cell + ' -x ' + options.index 42 | + ' -1 ' + input1 + ' -2 ' + input2 + ' -S ' + sam + ' 2> ' + log, shell=True) 43 | subprocess.check_call('samtools view -bS ' + sam + ' -o ' + bam, shell=True) 44 | subprocess.check_call('rm ' + sam, shell=True) 45 | subprocess.check_call('java -XX:+UseSerialGC -Xmx1g -jar ' + options.picard + ' SortSam SO=coordinate VALIDATION_STRINGENCY=SILENT I=' 46 | + bam + ' O=' + sorted_bam, shell=True) 47 | subprocess.check_call('samtools index ' + sorted_bam, shell=True) 48 | subprocess.check_call('samtools view -b -q 30 ' + sorted_bam + ' -o ' + filtered_bam + ' ' + chr_list, shell=True) 49 | subprocess.check_call('java -XX:+UseSerialGC -Xmx1g -jar ' + options.picard + ' MarkDuplicates INPUT=' + filtered_bam +' OUTPUT=' 50 | + marked_bam + ' METRICS_FILE=' + removed_duplicate 51 | + ' REMOVE_DUPLICATES=true VALIDATION_STRINGENCY=SILENT', shell=True) 52 | subprocess.check_call('samtools index ' + marked_bam, shell=True) 53 | subprocess.check_call('echo -e "Chromosome\tLength\tProperPairs\tBadPairs:Raw" >> ' + quality_state, shell=True) 54 | subprocess.check_call('samtools idxstats ' + sorted_bam + ' >> ' + quality_state, shell=True) 55 | subprocess.check_call('java -XX:+UseSerialGC -Xmx1g -jar ' + options.picard + ' CollectInsertSizeMetrics VALIDATION_STRINGENCY=SILENT I=' 56 | + marked_bam + ' O=' + hist_log + ' H=' + hist_pdf + ' W=1000', shell=True) 57 | # subroutines.draw_TSS_insert(options.tss, marked_bam, refSeqTSS) 58 | return 59 | # 60 | # 61 | chr_list = 'chr1 chr2 chr3 chr4 chr5 chr6 chr7 chr8 chr9 chr10 chr11 chr12 chr13 chr14 chr15 chr16 chr17 chr18 chr19 chr20 chr21 chr22 chrX chrY' 62 | # 63 | fastq_list = [x for x in os.listdir(options.s+'/data') if x[-11:]=='.trim.fastq'] 64 | fastq_list.sort() 65 | cell_list = [x.split('_')[0] for x in fastq_list] 66 | cell_list = list(set(cell_list)) 67 | cell_list.sort() 68 | if not os.path.exists(options.s+'/work'): subprocess.check_call('mkdir ' + options.s + '/work', shell=True) 69 | # 70 | parameters = [] 71 | for cell in cell_list: 72 | work_dir = options.s + '/work/' + cell + '/' 73 | if os.path.exists(work_dir): subprocess.check_call('rm -rf ' + work_dir, shell=True) 74 | subprocess.check_call('mkdir ' + work_dir, shell=True) 75 | input1, input2 = options.s+'/data/'+cell+'_1.trim.fastq', options.s+'/data/'+cell+'_2.trim.fastq' 76 | par = [work_dir, cell, options, input1, input2, chr_list] 77 | parameters.append(par) 78 | #### mapping(par) 79 | # 80 | pool = Pool(int(options.np)) 81 | pool.map(mapping, parameters) 82 | pool.close() 83 | pool.join() 84 | # 85 | # 86 | for cell in cell_list: 87 | work_dir = options.s + '/work/' + cell + '/' 88 | marked_bam = work_dir + cell + '.marked.bam' 89 | refSeqTSS = work_dir + cell + '.RefSeqTSS' 90 | subroutines.draw_TSS_insert(options.tss, marked_bam, refSeqTSS) 91 | # 92 | # 93 | -------------------------------------------------------------------------------- /code_v1.1.0/prepare_peakCalling.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/python 2 | # 3 | import warnings 4 | warnings.filterwarnings("ignore") 5 | # 6 | import os,numpy,sys,subprocess 7 | from optparse import OptionParser 8 | import subroutines 9 | # 10 | # 11 | opts = OptionParser() 12 | usage = "Call peaks\nusage: %prog -s project 
--blist blacklist.bed --fa genome_chr.fa --tss tssFile --logq 3" 13 | opts = OptionParser(usage=usage, version="%prog 1.0") 14 | opts.add_option("-s", help="The project folder.") 15 | opts.add_option("--picard", default="../reference/picard.jar", 16 | help="The picard.jar file path, default=../reference/picard.jar") 17 | opts.add_option("--blist", default='../reference/hg19_blacklist.JDB.bed', 18 | help="Blacklist.bed, default=../reference/hg19_blacklist.JDB.bed") 19 | opts.add_option("--fa", default='../reference/hg19_chr.fa', 20 | help="Genome_chr.fa, default=../reference/hg19_chr.fa") 21 | opts.add_option('--tss', default='../reference/hg19_refseq_genes_TSS.txt', 22 | help='TSS file, default=../reference/hg19_refseq_genes_TSS.txt') 23 | opts.add_option('--ref', default='hg19', help='Name of genome reference, default=hg19') 24 | opts.add_option('--logq', default='3', 25 | help='Threshold of -log(p-value) for top peaks, default=3.') 26 | options, arguments = opts.parse_args() 27 | # 28 | workspace_folder = options.s + '/work/' 29 | peak_folder = options.s + '/peak/' 30 | genome_fasta = options.fa 31 | tssFile = options.tss 32 | if not os.path.exists(peak_folder): subprocess.check_call('mkdir ' + peak_folder, shell=True) 33 | # 34 | # 35 | print('!!!!!! merge all marked bam files !!!!!!') 36 | bam_folder = [x for x in os.listdir(workspace_folder)] 37 | bam_folder.sort() 38 | print('cells number:', len(bam_folder)) 39 | marked_bam = [] 40 | #merged_raw = peak_folder + 'mergeAll.raw.bam' 41 | merged_bam = peak_folder + 'mergeAll.bam' 42 | for folder in bam_folder: 43 | path = workspace_folder + folder + '/' 44 | if len(folder.split('.'))<=1: 45 | marked_bam.extend([path + x for x in os.listdir(path) if x[-10:]=='marked.bam']) 46 | if len(marked_bam)<=1000: 47 | marked_bam = ' '.join(marked_bam) 48 | subprocess.check_call('samtools merge -f ' + merged_bam + ' ' + marked_bam, shell=True) 49 | else: 50 | n_batch = len(marked_bam)//1000 + 1 51 | temps = [] 52 | for i_batch in range(0, n_batch): 53 | temp_bam = peak_folder+'temp_'+str(i_batch)+'.bam' 54 | temps.append(temp_bam) 55 | start, end = i_batch*1000, min((i_batch+1)*1000, len(marked_bam)) 56 | marked = ' '.join(marked_bam[start:end]) 57 | subprocess.check_call('samtools merge -f ' + temp_bam + ' ' + marked, shell=True) 58 | subprocess.check_call('samtools index ' + temp_bam, shell=True) 59 | all_temp = ' '.join(temps) 60 | subprocess.check_call('samtools merge -f ' + merged_bam + ' ' + all_temp, shell=True) 61 | # 62 | subprocess.check_call('samtools index ' + merged_bam, shell=True) 63 | print('!!!!!! merge done !!!!!!') 64 | # 65 | hist_log = peak_folder + 'mergeAll.hist.log' 66 | hist_pdf = peak_folder + 'mergeAll.hist.pdf' 67 | subprocess.check_call('java -XX:+UseSerialGC -Xmx1g -jar '+options.picard+' CollectInsertSizeMetrics VALIDATION_STRINGENCY=SILENT I=' 68 | + merged_bam + ' O=' + hist_log + ' H=' + hist_pdf + ' W=1000', shell=True) 69 | # 70 | refSeqTSS = peak_folder + 'mergeAll.RefSeqTSS' 71 | subroutines.draw_TSS_insert(tssFile, merged_bam, refSeqTSS) 72 | # 73 | print('!!!!!! call peak by macs2 !!!!!!') 74 | peak_file = peak_folder + 'peaks' 75 | subprocess.check_call('macs2 callpeak --nomodel -t ' + merged_bam + ' -n ' 76 | + peak_file + ' --nolambda --keep-dup all --call-summits', shell=True) 77 | print('!!!!!! call peak done !!!!!!') 78 | # 79 | summit = peak_folder + 'peaks_summits.bed' 80 | filtered_peak = peak_folder + 'filtered_peaks.bed' 81 | if options.blist: 82 | print('!!!!!! 
filter peaks !!!!!!') 83 | subprocess.check_call('bedtools intersect -v -a ' + summit + ' -b ' + options.blist 84 | + " | sort -k5 -nr > " + filtered_peak, shell=True) 85 | print('!!!!!! filter peaks done !!!!!!') 86 | else: 87 | subprocess.check_call('sort -k5 -nr ' + summit + ' > ' + filtered_peak, shell=True) 88 | 89 | print('!!!!!! get top peaks by q-value !!!!!!') 90 | fold_rank = numpy.loadtxt(filtered_peak, 'str', delimiter='\t') 91 | fold_rank[:, 1] = numpy.array(list(map(int, fold_rank[:, 1]))) - 249 # 250 92 | fold_rank[:, 2] = numpy.array(list(map(int, fold_rank[:, 2]))) + 250 93 | toppeaks = peak_folder + 'temp01.bed' 94 | top_peaks = peak_folder + 'top_peaks.bed' 95 | with open(toppeaks, 'w') as output: 96 | for peak in fold_rank: 97 | if float(peak[-1])>=float(options.logq): 98 | # print >> output, peak[0]+'\t'+peak[1]+'\t'+peak[2] 99 | output.write(peak[0]+'\t'+peak[1]+'\t'+peak[2]+'\n') 100 | subprocess.check_call('bedtools sort -i ' + toppeaks + ' > ' + top_peaks, shell=True) 101 | print('!!!!!! get top peaks done !!!!!!') 102 | # 103 | # 104 | -------------------------------------------------------------------------------- /code_v1.1.0/prepare_qualityControl.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/python 2 | import warnings 3 | warnings.filterwarnings("ignore") 4 | import os,numpy,scipy.sparse,scipy.io,pandas,sys,subprocess 5 | import matplotlib 6 | matplotlib.use('Agg') 7 | import matplotlib.pyplot as plt 8 | from optparse import OptionParser 9 | # 10 | # 11 | opts = OptionParser() 12 | usage = "Build QC table\nusage: %prog -s project --pfrag 0.2 --lib 2000" 13 | opts = OptionParser(usage=usage, version="%prog 1.0.6") 14 | opts.add_option("-s", help="The project folder.") 15 | opts.add_option("--pfrag", default=0.2, help="Threshold for percentage of fragments in peaks, " 16 | +"default=0.2, decrease it for more filtered-samples, increase it for better quality") 17 | opts.add_option("--lib", default=2000, help="Threshold for fragment number, default=2000, " 18 | +"decrease it for more filtered-samples, increase it for better quality") 19 | options, arguments = opts.parse_args() 20 | # 21 | if not os.path.exists(options.s+'/figure'): subprocess.check_call('mkdir ' + options.s+'/figure', shell=True) 22 | frag_thresh = float(options.pfrag) 23 | lib_size_thresh = int(options.lib) 24 | # 25 | reads = scipy.sparse.csr_matrix(scipy.io.mmread(options.s+'/matrix/reads.mtx')).T 26 | cell_info = pandas.read_csv(options.s+'/matrix/cell_info.merged.csv', sep='\t', index_col=0) 27 | libSize = cell_info['final_reads'].values 28 | readsInPeaks = reads.sum(axis=1).A[:,0].astype(float)/cell_info['final_reads'].values 29 | cell_quality = numpy.vstack((libSize, readsInPeaks)) 30 | cell_quality_df = pandas.DataFrame(cell_quality.T, index=cell_info.index.values, columns=['lib_size', 'frag_in_peak']) 31 | cell_quality_df.to_csv(options.s+'/matrix/cell_quality.csv', sep='\t') 32 | # 33 | plt.scatter(libSize+1, readsInPeaks, s=10) 34 | plt.vlines(lib_size_thresh, -0.05, 1.05, linestyles='dashed') 35 | plt.hlines(frag_thresh, 1e2, 1e7, linestyles='dashed') 36 | plt.xscale('log') 37 | plt.xlabel('final mapped reads') 38 | plt.ylabel('fragments in peaks(%)') 39 | plt.xlim(1e2, 1e7) 40 | plt.ylim(-0.05, 1.05) 41 | plt.savefig(options.s + '/figure/cell_quality.pdf') 42 | # 43 | filtered_quality_df = cell_quality_df.loc[cell_quality_df['lib_size']>lib_size_thresh] 44 | filtered_quality_df = 
filtered_quality_df.loc[filtered_quality_df['frag_in_peak']>frag_thresh] 45 | filtered_cell_names = filtered_quality_df.index.values 46 | filtered_cell_index = [list(cell_quality_df.index.values).index(x) for x in filtered_cell_names] 47 | filtered_cells_df = cell_info.loc[filtered_cell_names, 'notes'] 48 | filtered_cells_df = pandas.DataFrame(filtered_cells_df.values, index=filtered_cell_names, columns=['notes']) 49 | filtered_cells_df.to_csv(options.s+'/matrix/filtered_cells.csv', sep='\t') 50 | # 51 | nonzero_per_peak = numpy.array([len(numpy.where(x.A>0)[0]) for x in reads.T]) 52 | filtered_peak_index = numpy.where(nonzero_per_peak>=3)[0] 53 | with open(options.s+'/peak/top_peaks.bed') as in_file, \ 54 | open(options.s+'/peak/top_filtered_peaks.bed', 'w') as out_file: 55 | for iline,line in enumerate(in_file): 56 | if iline in filtered_peak_index: 57 | out_file.write(line) 58 | # 59 | reads = reads[filtered_cell_index, :] 60 | reads = reads[:, filtered_peak_index] 61 | scipy.io.mmwrite(options.s+'/matrix/filtered_reads.mtx', scipy.sparse.coo_matrix(reads.T)) 62 | # 63 | # 64 | -------------------------------------------------------------------------------- /code_v1.1.0/prepare_trimming.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/python 2 | import warnings 3 | warnings.filterwarnings("ignore") 4 | # 5 | import sys,os,subprocess 6 | from optparse import OptionParser 7 | import Levenshtein 8 | from multiprocessing import Pool 9 | # 10 | # 11 | opts = OptionParser() 12 | usage = "Trim adapter\nusage: %prog -r raw_data -s project --np 4" 13 | opts = OptionParser(usage=usage, version="%prog 1.0") 14 | opts.add_option("-r", help="The raw-data folder, where the paired-end sequencing data are located. " 15 | +"If you want to use this code to build the cell_info.csv file, each fastq file should be named as:" 16 | +"type1-001_1.fastq, type1-001_2.fastq, type1-002_1.fastq, type1-002_2.fastq, ... ;" 17 | +"type2-001_1.fastq, type2-001_2.fastq, type2-002_1.fastq, type2-002_2.fastq, ... ; etc. 
" 18 | +"{type1, type2, ..., typeN} can be cell-types for your samples, such as {GM, K562, ...}, " 19 | +"or you can just use any name you want, but make sure there is no underline(_) or dashline(-) in typeX.") 20 | opts.add_option("-s", help='The folder.') 21 | opts.add_option("--qlen", default=20, help="Query length for adatper trimming, default=20.") 22 | opts.add_option("--aseq", default="CTGTCTCTTATACACATCTGACGCTGCCGACGA", help="Adapter sequence, " 23 | +"default=CTGTCTCTTATACACATCTGACGCTGCCGACGA.") 24 | opts.add_option("--np", default=1, help="Number of CPUs used for trimming in parallel, default=1.") 25 | options, arguments = opts.parse_args() 26 | # 27 | # 28 | global query_length, adatper_seq 29 | query_length = options.qlen 30 | adapter_seq = options.aseq 31 | # 32 | # 33 | def mismatch_align(seq1, query_length, read2_rc): 34 | for s1 in range(len(seq1)-query_length+1, -1, -1): 35 | temp_read1 = seq1[s1:(s1+query_length)] 36 | editdist = Levenshtein.distance(temp_read1, read2_rc) 37 | if editdist<2: 38 | return s1 39 | return -1 40 | # 41 | # 42 | def rev_comp_dna(read2_rc): 43 | temp_read2 = '' 44 | for i in range(len(read2_rc)-1, -1, -1): 45 | if (read2_rc[i]=='A') | (read2_rc[i]=='a') : 46 | temp_read2 += 'T' 47 | elif (read2_rc[i]=='C') | (read2_rc[i]=='c') : 48 | temp_read2 += 'G' 49 | elif (read2_rc[i]=='G') | (read2_rc[i]=='g') : 50 | temp_read2 += 'C' 51 | elif (read2_rc[i]=='T') | (read2_rc[i]=='t') : 52 | temp_read2 += 'A' 53 | elif read2_rc[i]=='N': 54 | temp_read2 += 'N' 55 | else: 56 | return 'error' 57 | return temp_read2 58 | # 59 | # 60 | def trim_adapters(fastq): 61 | cutoff = 50 62 | fastq1, fastq2 = fastq[0] + '_1.fastq', fastq[0] + '_2.fastq' 63 | trimed1, trimed2 = fastq[1] + '_1.trim.fastq', fastq[1] + '_2.trim.fastq' 64 | with open(fastq1) as fa1, open(fastq2) as fa2, open(trimed1, 'w') as out1, open(trimed2, 'w') as out2 : 65 | nReads, mm0_num_read, mm1_num_read = 0, 0, 0 66 | while 1: 67 | seq_header1, seq_header2 = fa1.readline()[:-1], fa2.readline()[:-1] 68 | seq1, seq2 = fa1.readline()[:-1], fa2.readline()[:-1] 69 | qual_header1, qual_header2 = fa1.readline()[:-1], fa2.readline()[:-1] 70 | qual1, qual2 = fa1.readline()[:-1], fa2.readline()[:-1] 71 | nReads += 1 72 | if ((not seq_header1) | (not seq_header2) | (not seq1) | (not seq2) | 73 | (not qual_header1) | (not qual_header2) | (not qual1) | (not qual2)): break 74 | read2_rc = seq2[:query_length] 75 | read2_rc = rev_comp_dna(read2_rc) 76 | s1_pos = -1 77 | s1_pos_find = seq1.rfind(read2_rc) 78 | if s1_pos_find > 0 : 79 | s1_pos = s1_pos_find 80 | mm0_num_read += 1 81 | else: 82 | s1_pos = mismatch_align(seq1, query_length, read2_rc) 83 | if s1_pos>0: mm1_num_read += 1 84 | if s1_pos >= 0 : 85 | seq_len = s1_pos + query_length 86 | trim_seq1 = seq1[seq_len:] 87 | adapter_trim_seq = adapter_seq[:len(trim_seq1)] 88 | if adapter_trim_seq==trim_seq1: 89 | seq1 = seq1[:seq_len] 90 | seq2 = seq2[:seq_len] 91 | qual1 = qual1[:seq_len] 92 | qual2 = qual2[:seq_len] 93 | out1.write(seq_header1+'\n') 94 | out1.write(seq1[:cutoff]+'\n') 95 | out1.write(qual_header1+'\n') 96 | out1.write(qual1[:cutoff]+'\n') 97 | out2.write(seq_header2+'\n') 98 | out2.write(seq2[:cutoff]+'\n') 99 | out2.write(qual_header2+'\n') 100 | out2.write(qual2[:cutoff]+'\n') 101 | return nReads, mm0_num_read, mm1_num_read 102 | # 103 | if not os.path.exists(options.s+'/data'): subprocess.check_call('mkdir '+options.s+'/data', shell=True) 104 | # 105 | fastqs = [x.split('_')[0] for x in os.listdir(options.r) if 
(x[-6:]=='.fastq')&(x[-11:]!='.trim.fastq')] 106 | fastqs = list(set(fastqs)) 107 | fastqs.sort() 108 | pathes = [[options.r+'/'+x, options.s+'/data/'+x] for x in fastqs] 109 | pool = Pool(int(options.np)) 110 | read_info = pool.map(trim_adapters, pathes) 111 | pool.close() 112 | pool.join() 113 | # 114 | with open(options.s+'/data/cell_info.csv', 'w') as output: 115 | # print >> output, 'name\tnotes' 116 | output.write('name\tnotes\n') 117 | for fastq in fastqs: 118 | # print >> output, fastq + '\t' + '-'.join(fastq.split('-')[:-1]) 119 | output.write(fastq + '\t' + '-'.join(fastq.split('-')[:-1]) + '\n') 120 | # 121 | # 122 | # 123 | # 124 | -------------------------------------------------------------------------------- /code_v1.1.0/subroutines.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/QuKunLab/APEC/e15d6c42e05b2deb9c6b89f678905c6ef9878e2b/code_v1.1.0/subroutines.pyc -------------------------------------------------------------------------------- /code_v1.2/APEC_prepare_steps.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/bash 2 | # 3 | #### input parameters defined by users ############################################# 4 | # 5 | ARGS=`getopt -o hr:s:g:n:l:p:f: -l help,raw:,project:,genome:,np:,logq:,pfrag:,frag: -- "$@"` 6 | if [ $? != 0 ] ; then echo "Terminating..." >&2 ; exit 1 ; fi 7 | eval set -- "$ARGS" 8 | while true ; do 9 | case "$1" in 10 | -h|--help) 11 | echo " 12 | bash APEC_prepare_steps.sh -r raw_data -s project -g genome_index -n nCPUs -l logq -p pfrag -f frag 13 | -r/--raw: The raw_data path, where the paired-end sequencing data are located. 14 | -s/--project: The project path. 15 | -g/--genome: hg19 or mm10. 16 | -n/--np: Number of CPU cores. 17 | -l/--logq: Threshold for the -log(Q-value) of peaks, used to filter peaks. 18 | -p/--pfrag: Threshold of the percentage of fragments in peaks, used to filter cells. 19 | -f/--frag: Threshold of the fragment number of each cell, used to filter cells." 
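# Example call (hypothetical paths; mm10 is the other supported genome besides hg19):
#   bash APEC_prepare_steps.sh -r ./raw_data -s ./project02 -g mm10 -n 4 -l 3 -p 0.2 -f 2000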
20 | exit 1 ;; 21 | -r|--raw) raw_data="$2" ; shift 2;; 22 | -s|--project) project="$2" ; shift 2;; 23 | -g|--genome) genome="$2" ; shift 2;; 24 | -n|--np) np="$2" ; shift 2;; 25 | -l|--logq) logq="$2" ; shift 2;; 26 | -p|--pfrag) pfrag="$2" ; shift 2;; 27 | -f|--frag) frag="$2" ; shift 2;; 28 | --) shift; break ;; 29 | *) echo "unknown parameter: {$1}" ; exit 1 ;; 30 | esac 31 | done 32 | # 33 | picard=../reference/picard.jar 34 | ref=$genome 35 | fa="../reference/"$genome"_chr.fa" 36 | index="../reference/"$genome 37 | tss="../reference/"$genome"_refseq_genes_TSS.txt" 38 | if [[ $genome == "hg19" ]]; then 39 | blist=../reference/hg19_blacklist.JDB.bed 40 | elif [[ $genome == "mm10" ]]; then 41 | blist=../reference/mm10_blacklist.BIN.bed 42 | fi 43 | gtf="../reference/"$genome"_RefSeq_genes.gtf" 44 | # 45 | # 46 | # 47 | #### processes to prepare raw data ########### 48 | # 49 | python prepare_trimming.py -r $raw_data -s $project --np $np 50 | # 51 | python prepare_mapping.py -s $project --index $index --picard $picard --tss $tss --np $np 52 | # 53 | python prepare_peakCalling.py -s $project --blist $blist --fa $fa --tss $tss --ref $ref --logq $logq 54 | # 55 | python prepare_countMatrix.py -s $project --fa $fa --np $np 56 | # 57 | python prepare_qualityControl.py -s $project --pfrag $pfrag --lib $frag 58 | # 59 | # 60 | # 61 | # 62 | -------------------------------------------------------------------------------- /code_v1.2/generate_UCSCtrack.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/python 2 | import warnings 3 | warnings.filterwarnings("ignore") 4 | # 5 | import os,numpy,pandas,subprocess 6 | from optparse import OptionParser 7 | # 8 | # 9 | opts = OptionParser() 10 | usage = "Merge counts of cells to track files\nusage: %prog -s project --cfile cluster.csv --gsize chrom.sizes" 11 | opts = OptionParser(usage=usage, version="%prog 1.0") 12 | opts.add_option("-s", help="The project folder.") 13 | opts.add_option("--cfile", help="cluster.csv file, e.g. 
louvain_cluster_by_APEC.csv in the result folder")
 14 | opts.add_option("--gsize", default='../reference/hg19.chrom.sizes', help="chrom.size files, default=../reference/hg19.chrom.sizes")
 15 | options, arguments = opts.parse_args()
 16 | #
 17 | #
 18 | def merge_bam(options):
 19 |     bam_folder = [x for x in os.listdir(options.s+'/work')]
 20 |     bam_folder.sort()
 21 |     cell_df = pandas.read_csv(options.cfile, sep='\t', index_col=0)
 22 |     if 'notes' in cell_df.columns.values: cell_df['cluster'] = cell_df['notes']
 23 |     cell_types = cell_df['cluster'].values
 24 |     cell_types = list(set(cell_types))
 25 |     for cell_type in cell_types:
 26 |         marked_bam, select = [], []
 27 |         for folder in bam_folder:
 28 |             path = options.s + '/work/' + folder + '/'
 29 |             if folder in cell_df.index.values:
 30 |                 if cell_df.loc[folder, 'cluster']==cell_type:
 31 |                     select.append(folder)
 32 |                     marked_bam.extend([path + x for x in os.listdir(path) if x[-10:]=='marked.bam'])
 33 |         marked_bam = ' '.join(marked_bam)
 34 |         merged_bam = options.s + '/result/track/' + str(cell_type) + '.bam'
 35 |         subprocess.check_call('samtools merge -f ' + merged_bam + ' ' + marked_bam, shell=True)
 36 |         subprocess.check_call('samtools index ' + merged_bam, shell=True)
 37 |     return
 38 | #
 39 | #
 40 | def bam2bw(options):
 41 |     cell_df = pandas.read_csv(options.cfile, sep='\t', index_col=0)
 42 |     if 'notes' in cell_df.columns.values: cell_df['cluster'] = cell_df['notes']
 43 |     cell_types = cell_df['cluster'].values
 44 |     cell_types = list(set(cell_types))
 45 |     for cell_type in cell_types:
 46 |         name = options.s+'/result/track/'+str(cell_type)
 47 |         cells = cell_df.loc[cell_df['cluster']==cell_type]
 48 |         cells = cells.index.values
 49 |         subprocess.check_call('bedtools genomecov -bg -ibam '+name+'.bam -g '+options.gsize+' > '+name+'.bedgraph', shell=True)
 50 |         subprocess.check_call('bedtools sort -i '+name+'.bedgraph > '+name+'.sorted.bedgraph', shell=True)
 51 |         with open(name+'.sorted.bedgraph') as infile, open(name+'.norm.bedgraph', 'w') as outfile:
 52 |             for line in infile:
 53 |                 words = line.split()
 54 |                 words[3] = str(round(float(words[3]) * 100.0 / len(cells)))    # scale the signal by the number of cells in the cluster
 55 |                 outfile.write('\t'.join(words)+'\n')
 56 |         subprocess.check_call('bedGraphToBigWig '+name+'.norm.bedgraph '+options.gsize+' '+name+'.bw', shell=True)
 57 |     return
 58 | #
 59 | #
 60 | #
 61 | if not os.path.exists(options.s+'/result'): subprocess.check_call('mkdir '+options.s+'/result', shell=True)
 62 | if not os.path.exists(options.s+'/result/track'): subprocess.check_call('mkdir '+options.s+'/result/track', shell=True)
 63 | merge_bam(options)
 64 | bam2bw(options)
 65 | #
 66 | #
 67 | #
 68 |
-------------------------------------------------------------------------------- /code_v1.2/prepare_countMatrix.py: --------------------------------------------------------------------------------
 1 | #!/usr/bin/python
 2 | import warnings
 3 | warnings.filterwarnings("ignore")
 4 | import os,pandas,subprocess
 5 | from optparse import OptionParser
 6 | import subroutines
 7 | import scipy.io,scipy.sparse
 8 | #
 9 | opts = OptionParser()
 10 | usage = "Count reads in peaks to build the matrix\nusage: %prog -s project --fa chr.fa --bg bg --meme motif.meme --np 4"
 11 | opts = OptionParser(usage=usage, version="%prog 1.0")
 12 | opts.add_option("-s", help="The project folder.")
 13 | opts.add_option("--fa", default='../reference/hg19_chr.fa', help="Genome fasta file, default=../reference/hg19_chr.fa")
 14 | opts.add_option("--bg",
default='../reference/tier1_markov1.norc.txt', 15 | help="Background file, default=../reference/tier1_markov1.norc.txt") 16 | opts.add_option("--pvalue", default=0.00005, help="P-value threshold for FIMO, default=0.00005") 17 | opts.add_option("--meme", default='../reference/JASPAR2018_CORE_vertebrates_redundant_pfms_meme.txt', 18 | help="Motif file, default=../reference/JASPAR2018_CORE_vertebrates_redundant_pfms_meme.txt") 19 | opts.add_option("--np", default=1, help="Number of CPU cores used for motif searching, default=1") 20 | options, arguments = opts.parse_args() 21 | # 22 | matrix_folder = options.s + '/matrix/' 23 | peaks_folder = options.s + '/peak/' 24 | work_folder = options.s + '/work/' 25 | motif_folder = matrix_folder + '/motif' 26 | peaks_file = peaks_folder + '/top_peaks.bed' 27 | cell_info = options.s + '/data/cell_info.csv' 28 | motifFasta = matrix_folder + '/motif.fasta' 29 | TFmatrix_file = matrix_folder + '/motif_TF.csv' 30 | bam_file = peaks_folder + "/mergeAll.bam" 31 | reads_matrix = matrix_folder + "/reads.mtx" 32 | # 33 | if not os.path.exists(matrix_folder): subprocess.check_call('mkdir ' + matrix_folder, shell=True) 34 | # 35 | #### count reads for peaks 36 | cell_info_df = pandas.read_csv(cell_info, sep='\t', index_col=0) 37 | matrix = subroutines.counts_per_peak(bam_file, peaks_file, reads_matrix, cell_info_df) 38 | matrix = scipy.sparse.coo_matrix(matrix.T) 39 | scipy.io.mmwrite(reads_matrix, matrix) 40 | # 41 | # 42 | subroutines.QC_table(cell_info, work_folder, matrix_folder) 43 | # 44 | # 45 | -------------------------------------------------------------------------------- /code_v1.2/prepare_mapping.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/python 2 | import warnings 3 | warnings.filterwarnings("ignore") 4 | import os,sys,subprocess 5 | from optparse import OptionParser 6 | import subroutines 7 | from multiprocessing import Pool 8 | # 9 | # 10 | opts = OptionParser() 11 | usage = "Map sequencing data\nusage: %prog -s project --index bowtie2-index --picard picard.jar --tss TSS.txt --np 4" 12 | opts = OptionParser(usage=usage, version="%prog 1.0") 13 | opts.add_option("-s", help="The project folder") 14 | opts.add_option("--index", default="../reference/hg19", 15 | help="The reference file path in bowtie2/indexes folder, default=../reference/hg19") 16 | opts.add_option("--picard", default="../reference/picard.jar", 17 | help="The picard.jar file path, default=../reference/picard.jar") 18 | opts.add_option("--tss", default="../reference/hg19_refseq_genes_TSS.txt", 19 | help="The TSS file path, can be downloaded from RefSeq, default=../reference/hg19_refseq_genes_TSS.txt") 20 | opts.add_option("--np", default=1, help="Number of CPUs used for mapping, default=1") 21 | options, arguments = opts.parse_args() 22 | # 23 | # 24 | # 25 | def mapping(par): 26 | work_dir, cell, options, input1, input2, chr_list = par[0], par[1], par[2], par[3], par[4], par[5] 27 | sam = work_dir + cell + '.sam' 28 | bam = work_dir + cell + '.bam' 29 | log = work_dir + cell + '.map.log' 30 | sorted_bam = work_dir + cell + '.sorted.bam' 31 | filtered_bam = work_dir + cell + '.filtered.bam' 32 | marked_bam = work_dir + cell + '.marked.bam' 33 | removed_duplicate = work_dir + cell + '.dups.log' 34 | quality_state = work_dir + cell + '.stats.log' 35 | hist_log = work_dir + cell + '.hist.log' 36 | hist_pdf = work_dir + cell + '.hist.pdf' 37 | refSeqTSS = work_dir + cell + '.RefSeqTSS' 38 | # 39 | # subprocess.check_call('bowtie2 
-X2000 -p ' + options.np + ' --rg-id ' + cell + ' -x ' + options.index 40 | # + ' -1 ' + input1 + ' -2 ' + input2 + ' -S ' + sam + ' 2> ' + log, shell=True) 41 | subprocess.check_call('bowtie2 -X2000 -p 1 --rg-id ' + cell + ' -x ' + options.index 42 | + ' -1 ' + input1 + ' -2 ' + input2 + ' -S ' + sam + ' 2> ' + log, shell=True) 43 | subprocess.check_call('samtools view -bS ' + sam + ' -o ' + bam, shell=True) 44 | subprocess.check_call('rm ' + sam, shell=True) 45 | subprocess.check_call('java -XX:+UseSerialGC -Xmx1g -jar ' + options.picard + ' SortSam SO=coordinate VALIDATION_STRINGENCY=SILENT I=' 46 | + bam + ' O=' + sorted_bam, shell=True) 47 | subprocess.check_call('samtools index ' + sorted_bam, shell=True) 48 | subprocess.check_call('samtools view -b -q 30 ' + sorted_bam + ' -o ' + filtered_bam + ' ' + chr_list, shell=True) 49 | subprocess.check_call('java -XX:+UseSerialGC -Xmx1g -jar ' + options.picard + ' MarkDuplicates INPUT=' + filtered_bam +' OUTPUT=' 50 | + marked_bam + ' METRICS_FILE=' + removed_duplicate 51 | + ' REMOVE_DUPLICATES=true VALIDATION_STRINGENCY=SILENT', shell=True) 52 | subprocess.check_call('samtools index ' + marked_bam, shell=True) 53 | subprocess.check_call('echo -e "Chromosome\tLength\tProperPairs\tBadPairs:Raw" >> ' + quality_state, shell=True) 54 | subprocess.check_call('samtools idxstats ' + sorted_bam + ' >> ' + quality_state, shell=True) 55 | subprocess.check_call('java -XX:+UseSerialGC -Xmx1g -jar ' + options.picard + ' CollectInsertSizeMetrics VALIDATION_STRINGENCY=SILENT I=' 56 | + marked_bam + ' O=' + hist_log + ' H=' + hist_pdf + ' W=1000', shell=True) 57 | # subroutines.draw_TSS_insert(options.tss, marked_bam, refSeqTSS) 58 | return 59 | # 60 | # 61 | chr_list = 'chr1 chr2 chr3 chr4 chr5 chr6 chr7 chr8 chr9 chr10 chr11 chr12 chr13 chr14 chr15 chr16 chr17 chr18 chr19 chr20 chr21 chr22 chrX chrY' 62 | # 63 | fastq_list = [x for x in os.listdir(options.s+'/data') if x[-11:]=='.trim.fastq'] 64 | fastq_list.sort() 65 | cell_list = [x.split('_')[0] for x in fastq_list] 66 | cell_list = list(set(cell_list)) 67 | cell_list.sort() 68 | if not os.path.exists(options.s+'/work'): subprocess.check_call('mkdir ' + options.s + '/work', shell=True) 69 | # 70 | parameters = [] 71 | for cell in cell_list: 72 | work_dir = options.s + '/work/' + cell + '/' 73 | if os.path.exists(work_dir): subprocess.check_call('rm -rf ' + work_dir, shell=True) 74 | subprocess.check_call('mkdir ' + work_dir, shell=True) 75 | input1, input2 = options.s+'/data/'+cell+'_1.trim.fastq', options.s+'/data/'+cell+'_2.trim.fastq' 76 | par = [work_dir, cell, options, input1, input2, chr_list] 77 | parameters.append(par) 78 | #### mapping(par) 79 | # 80 | pool = Pool(int(options.np)) 81 | pool.map(mapping, parameters) 82 | pool.close() 83 | pool.join() 84 | # 85 | # 86 | for cell in cell_list: 87 | work_dir = options.s + '/work/' + cell + '/' 88 | marked_bam = work_dir + cell + '.marked.bam' 89 | refSeqTSS = work_dir + cell + '.RefSeqTSS' 90 | subroutines.draw_TSS_insert(options.tss, marked_bam, refSeqTSS) 91 | # 92 | # 93 | -------------------------------------------------------------------------------- /code_v1.2/prepare_peakCalling.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/python 2 | # 3 | import warnings 4 | warnings.filterwarnings("ignore") 5 | # 6 | import os,numpy,sys,subprocess 7 | from optparse import OptionParser 8 | import subroutines 9 | # 10 | # 11 | opts = OptionParser() 12 | usage = "Call peaks\nusage: %prog -s project 
--blist blacklist.bed --fa genome_chr.fa --tss tssFile --logq 3" 13 | opts = OptionParser(usage=usage, version="%prog 1.0") 14 | opts.add_option("-s", help="The project folder.") 15 | opts.add_option("--picard", default="../reference/picard.jar", 16 | help="The picard.jar file path, default=../reference/picard.jar") 17 | opts.add_option("--blist", default='../reference/hg19_blacklist.JDB.bed', 18 | help="Blacklist.bed, default=../reference/hg19_blacklist.JDB.bed") 19 | opts.add_option("--fa", default='../reference/hg19_chr.fa', 20 | help="Genome_chr.fa, default=../reference/hg19_chr.fa") 21 | opts.add_option('--tss', default='../reference/hg19_refseq_genes_TSS.txt', 22 | help='TSS file, default=../reference/hg19_refseq_genes_TSS.txt') 23 | opts.add_option('--ref', default='hg19', help='Name of genome reference, default=hg19') 24 | opts.add_option('--logq', default='3', 25 | help='Threshold of -log(p-value) for top peaks, default=3.') 26 | options, arguments = opts.parse_args() 27 | # 28 | workspace_folder = options.s + '/work/' 29 | peak_folder = options.s + '/peak/' 30 | genome_fasta = options.fa 31 | tssFile = options.tss 32 | if not os.path.exists(peak_folder): subprocess.check_call('mkdir ' + peak_folder, shell=True) 33 | # 34 | # 35 | print('!!!!!! merge all marked bam files !!!!!!') 36 | bam_folder = [x for x in os.listdir(workspace_folder)] 37 | bam_folder.sort() 38 | print('cells number:', len(bam_folder)) 39 | marked_bam = [] 40 | #merged_raw = peak_folder + 'mergeAll.raw.bam' 41 | merged_bam = peak_folder + 'mergeAll.bam' 42 | for folder in bam_folder: 43 | path = workspace_folder + folder + '/' 44 | if len(folder.split('.'))<=1: 45 | marked_bam.extend([path + x for x in os.listdir(path) if x[-10:]=='marked.bam']) 46 | if len(marked_bam)<=1000: 47 | marked_bam = ' '.join(marked_bam) 48 | subprocess.check_call('samtools merge -f ' + merged_bam + ' ' + marked_bam, shell=True) 49 | else: 50 | n_batch = len(marked_bam)//1000 + 1 51 | temps = [] 52 | for i_batch in range(0, n_batch): 53 | temp_bam = peak_folder+'temp_'+str(i_batch)+'.bam' 54 | temps.append(temp_bam) 55 | start, end = i_batch*1000, min((i_batch+1)*1000, len(marked_bam)) 56 | marked = ' '.join(marked_bam[start:end]) 57 | subprocess.check_call('samtools merge -f ' + temp_bam + ' ' + marked, shell=True) 58 | subprocess.check_call('samtools index ' + temp_bam, shell=True) 59 | all_temp = ' '.join(temps) 60 | subprocess.check_call('samtools merge -f ' + merged_bam + ' ' + all_temp, shell=True) 61 | # 62 | subprocess.check_call('samtools index ' + merged_bam, shell=True) 63 | print('!!!!!! merge done !!!!!!') 64 | # 65 | hist_log = peak_folder + 'mergeAll.hist.log' 66 | hist_pdf = peak_folder + 'mergeAll.hist.pdf' 67 | subprocess.check_call('java -XX:+UseSerialGC -Xmx1g -jar '+options.picard+' CollectInsertSizeMetrics VALIDATION_STRINGENCY=SILENT I=' 68 | + merged_bam + ' O=' + hist_log + ' H=' + hist_pdf + ' W=1000', shell=True) 69 | # 70 | refSeqTSS = peak_folder + 'mergeAll.RefSeqTSS' 71 | subroutines.draw_TSS_insert(tssFile, merged_bam, refSeqTSS) 72 | # 73 | print('!!!!!! call peak by macs2 !!!!!!') 74 | peak_file = peak_folder + 'peaks' 75 | subprocess.check_call('macs2 callpeak --nomodel -t ' + merged_bam + ' -n ' 76 | + peak_file + ' --nolambda --keep-dup all --call-summits', shell=True) 77 | print('!!!!!! call peak done !!!!!!') 78 | # 79 | summit = peak_folder + 'peaks_summits.bed' 80 | filtered_peak = peak_folder + 'filtered_peaks.bed' 81 | if options.blist: 82 | print('!!!!!! 
filter peaks !!!!!!') 83 | subprocess.check_call('bedtools intersect -v -a ' + summit + ' -b ' + options.blist 84 | + " | sort -k5 -nr > " + filtered_peak, shell=True) 85 | print('!!!!!! filter peaks done !!!!!!') 86 | else: 87 | subprocess.check_call('sort -k5 -nr ' + summit + ' > ' + filtered_peak, shell=True) 88 | 89 | print('!!!!!! get top peaks by q-value !!!!!!') 90 | fold_rank = numpy.loadtxt(filtered_peak, 'str', delimiter='\t') 91 | fold_rank[:, 1] = numpy.array(list(map(int, fold_rank[:, 1]))) - 249 # 250 92 | fold_rank[:, 2] = numpy.array(list(map(int, fold_rank[:, 2]))) + 250 93 | toppeaks = peak_folder + 'temp01.bed' 94 | top_peaks = peak_folder + 'top_peaks.bed' 95 | with open(toppeaks, 'w') as output: 96 | for peak in fold_rank: 97 | if float(peak[-1])>=float(options.logq): 98 | # print >> output, peak[0]+'\t'+peak[1]+'\t'+peak[2] 99 | output.write(peak[0]+'\t'+peak[1]+'\t'+peak[2]+'\n') 100 | subprocess.check_call('bedtools sort -i ' + toppeaks + ' > ' + top_peaks, shell=True) 101 | print('!!!!!! get top peaks done !!!!!!') 102 | # 103 | # 104 | -------------------------------------------------------------------------------- /code_v1.2/prepare_qualityControl.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/python 2 | import warnings 3 | warnings.filterwarnings("ignore") 4 | import os,numpy,scipy.sparse,scipy.io,pandas,sys,subprocess 5 | import matplotlib 6 | matplotlib.use('Agg') 7 | import matplotlib.pyplot as plt 8 | from optparse import OptionParser 9 | # 10 | # 11 | opts = OptionParser() 12 | usage = "Build QC table\nusage: %prog -s project --pfrag 0.2 --lib 2000" 13 | opts = OptionParser(usage=usage, version="%prog 1.0.6") 14 | opts.add_option("-s", help="The project folder.") 15 | opts.add_option("--pfrag", default=0.2, help="Threshold for percentage of fragments in peaks, " 16 | +"default=0.2, decrease it for more filtered-samples, increase it for better quality") 17 | opts.add_option("--lib", default=2000, help="Threshold for fragment number, default=2000, " 18 | +"decrease it for more filtered-samples, increase it for better quality") 19 | options, arguments = opts.parse_args() 20 | # 21 | if not os.path.exists(options.s+'/figure'): subprocess.check_call('mkdir ' + options.s+'/figure', shell=True) 22 | frag_thresh = float(options.pfrag) 23 | lib_size_thresh = int(options.lib) 24 | # 25 | reads = scipy.sparse.csr_matrix(scipy.io.mmread(options.s+'/matrix/reads.mtx')).T 26 | cell_info = pandas.read_csv(options.s+'/matrix/cell_info.merged.csv', sep='\t', index_col=0) 27 | libSize = cell_info['final_reads'].values 28 | readsInPeaks = reads.sum(axis=1).A[:,0].astype(float)/cell_info['final_reads'].values 29 | cell_quality = numpy.vstack((libSize, readsInPeaks)) 30 | cell_quality_df = pandas.DataFrame(cell_quality.T, index=cell_info.index.values, columns=['lib_size', 'frag_in_peak']) 31 | cell_quality_df.to_csv(options.s+'/matrix/cell_quality.csv', sep='\t') 32 | # 33 | plt.scatter(libSize+1, readsInPeaks, s=10) 34 | plt.vlines(lib_size_thresh, -0.05, 1.05, linestyles='dashed') 35 | plt.hlines(frag_thresh, 1e2, 1e7, linestyles='dashed') 36 | plt.xscale('log') 37 | plt.xlabel('final mapped reads') 38 | plt.ylabel('fragments in peaks(%)') 39 | plt.xlim(1e2, 1e7) 40 | plt.ylim(-0.05, 1.05) 41 | plt.savefig(options.s + '/figure/cell_quality.pdf') 42 | # 43 | filtered_quality_df = cell_quality_df.loc[cell_quality_df['lib_size']>lib_size_thresh] 44 | filtered_quality_df = 
filtered_quality_df.loc[filtered_quality_df['frag_in_peak']>frag_thresh]
 45 | filtered_cell_names = filtered_quality_df.index.values
 46 | filtered_cell_index = [list(cell_quality_df.index.values).index(x) for x in filtered_cell_names]
 47 | filtered_cells_df = cell_info.loc[filtered_cell_names, 'notes']
 48 | filtered_cells_df = pandas.DataFrame(filtered_cells_df.values, index=filtered_cell_names, columns=['notes'])
 49 | filtered_cells_df.to_csv(options.s+'/matrix/filtered_cells.csv', sep='\t')
 50 | #
 51 | nonzero_per_peak = numpy.array([len(numpy.where(x.A>0)[0]) for x in reads.T])
 52 | filtered_peak_index = numpy.where(nonzero_per_peak>=3)[0]
 53 | with open(options.s+'/peak/top_peaks.bed') as in_file, \
 54 |      open(options.s+'/peak/top_filtered_peaks.bed', 'w') as out_file:
 55 |     for iline,line in enumerate(in_file):
 56 |         if iline in filtered_peak_index:
 57 |             out_file.write(line)
 58 | #
 59 | reads = reads[filtered_cell_index, :]
 60 | reads = reads[:, filtered_peak_index]
 61 | scipy.io.mmwrite(options.s+'/matrix/filtered_reads.mtx', scipy.sparse.coo_matrix(reads.T))
 62 | #
 63 | #
 64 |
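#
# To inspect the QC table written above before tuning --pfrag and --lib, a
# minimal sketch (run separately; replace 'project' with your project folder):
#
#     import pandas
#     qc = pandas.read_csv('project/matrix/cell_quality.csv', sep='\t', index_col=0)
#     print(qc.describe())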
" 18 | +"{type1, type2, ..., typeN} can be cell-types for your samples, such as {GM, K562, ...}, " 19 | +"or you can just use any name you want, but make sure there is no underline(_) or dashline(-) in typeX.") 20 | opts.add_option("-s", help='The folder.') 21 | opts.add_option("--qlen", default=20, help="Query length for adatper trimming, default=20.") 22 | opts.add_option("--aseq", default="CTGTCTCTTATACACATCTGACGCTGCCGACGA", help="Adapter sequence, " 23 | +"default=CTGTCTCTTATACACATCTGACGCTGCCGACGA.") 24 | opts.add_option("--np", default=1, help="Number of CPUs used for trimming in parallel, default=1.") 25 | options, arguments = opts.parse_args() 26 | # 27 | # 28 | global query_length, adatper_seq 29 | query_length = options.qlen 30 | adapter_seq = options.aseq 31 | # 32 | # 33 | def mismatch_align(seq1, query_length, read2_rc): 34 | for s1 in range(len(seq1)-query_length+1, -1, -1): 35 | temp_read1 = seq1[s1:(s1+query_length)] 36 | editdist = Levenshtein.distance(temp_read1, read2_rc) 37 | if editdist<2: 38 | return s1 39 | return -1 40 | # 41 | # 42 | def rev_comp_dna(read2_rc): 43 | temp_read2 = '' 44 | for i in range(len(read2_rc)-1, -1, -1): 45 | if (read2_rc[i]=='A') | (read2_rc[i]=='a') : 46 | temp_read2 += 'T' 47 | elif (read2_rc[i]=='C') | (read2_rc[i]=='c') : 48 | temp_read2 += 'G' 49 | elif (read2_rc[i]=='G') | (read2_rc[i]=='g') : 50 | temp_read2 += 'C' 51 | elif (read2_rc[i]=='T') | (read2_rc[i]=='t') : 52 | temp_read2 += 'A' 53 | elif read2_rc[i]=='N': 54 | temp_read2 += 'N' 55 | else: 56 | return 'error' 57 | return temp_read2 58 | # 59 | # 60 | def trim_adapters(fastq): 61 | cutoff = 50 62 | fastq1, fastq2 = fastq[0] + '_1.fastq', fastq[0] + '_2.fastq' 63 | trimed1, trimed2 = fastq[1] + '_1.trim.fastq', fastq[1] + '_2.trim.fastq' 64 | with open(fastq1) as fa1, open(fastq2) as fa2, open(trimed1, 'w') as out1, open(trimed2, 'w') as out2 : 65 | nReads, mm0_num_read, mm1_num_read = 0, 0, 0 66 | while 1: 67 | seq_header1, seq_header2 = fa1.readline()[:-1], fa2.readline()[:-1] 68 | seq1, seq2 = fa1.readline()[:-1], fa2.readline()[:-1] 69 | qual_header1, qual_header2 = fa1.readline()[:-1], fa2.readline()[:-1] 70 | qual1, qual2 = fa1.readline()[:-1], fa2.readline()[:-1] 71 | nReads += 1 72 | if ((not seq_header1) | (not seq_header2) | (not seq1) | (not seq2) | 73 | (not qual_header1) | (not qual_header2) | (not qual1) | (not qual2)): break 74 | read2_rc = seq2[:query_length] 75 | read2_rc = rev_comp_dna(read2_rc) 76 | s1_pos = -1 77 | s1_pos_find = seq1.rfind(read2_rc) 78 | if s1_pos_find > 0 : 79 | s1_pos = s1_pos_find 80 | mm0_num_read += 1 81 | else: 82 | s1_pos = mismatch_align(seq1, query_length, read2_rc) 83 | if s1_pos>0: mm1_num_read += 1 84 | if s1_pos >= 0 : 85 | seq_len = s1_pos + query_length 86 | trim_seq1 = seq1[seq_len:] 87 | adapter_trim_seq = adapter_seq[:len(trim_seq1)] 88 | if adapter_trim_seq==trim_seq1: 89 | seq1 = seq1[:seq_len] 90 | seq2 = seq2[:seq_len] 91 | qual1 = qual1[:seq_len] 92 | qual2 = qual2[:seq_len] 93 | out1.write(seq_header1+'\n') 94 | out1.write(seq1[:cutoff]+'\n') 95 | out1.write(qual_header1+'\n') 96 | out1.write(qual1[:cutoff]+'\n') 97 | out2.write(seq_header2+'\n') 98 | out2.write(seq2[:cutoff]+'\n') 99 | out2.write(qual_header2+'\n') 100 | out2.write(qual2[:cutoff]+'\n') 101 | return nReads, mm0_num_read, mm1_num_read 102 | # 103 | if not os.path.exists(options.s+'/data'): subprocess.check_call('mkdir '+options.s+'/data', shell=True) 104 | # 105 | fastqs = [x.split('_')[0] for x in os.listdir(options.r) if 
-------------------------------------------------------------------------------- /code_v1.2/subroutines.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/QuKunLab/APEC/e15d6c42e05b2deb9c6b89f678905c6ef9878e2b/code_v1.2/subroutines.pyc -------------------------------------------------------------------------------- /examples/.gitattributes: --------------------------------------------------------------------------------
 1 | project01/matrix/filtered_reads.mtx filter=lfs diff=lfs merge=lfs -text
 2 |
-------------------------------------------------------------------------------- /examples/README.md: --------------------------------------------------------------------------------
 1 |
 2 | #### If users want to entirely reproduce the figures in the APEC paper (https://www.biorxiv.org/content/10.1101/646331v4), please create a new Python 3.6.8 environment in anaconda/miniconda and install APEC in that environment. APEC will install specific versions of the following packages:
 3 |
 4 | numpy==1.16.2
 5 | scipy==1.0.0
 6 | pandas==0.24.2
 7 | matplotlib==3.0.3
 8 | seaborn==0.9.0
 9 | numba==0.43.1
 10 | networkx==2.2
 11 | python-louvain==0.11
 12 | scikit-learn==0.20.0
 13 | MulticoreTsne==0.1
 14 | umap-learn==0.3.8
 15 | rpy2==2.8.5
 16 |
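A minimal environment setup might look like this (a sketch only — it assumes APEC is installed with pip, and the environment name is arbitrary; if you install from source instead, replace the last command accordingly):

    conda create -n apec_env python=3.6.8
    conda activate apec_env
    pip install APEC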
17 | After downloading project01.tar.gz, project02.tar.gz, and project03.tar.gz, please use the following commands to unpack them:
 18 |
 19 | tar -zvxf project01.tar.gz
 20 | tar -zvxf project02.tar.gz
 21 | tar -zvxf project03.tar.gz
 22 |
 23 | ## (1) project01
 24 |
 25 | Project01 contains single-cell samples from the hematopoietic stem cell differentiation project (including HSC, MPP, CMP, GMP, MEP, LMPP, CLP, pDC, and UNK cells), from "Buenrostro, J.D. et al. Integrated Single-Cell Analysis Maps the Continuous Regulatory Landscape of Human Hematopoietic Differentiation. Cell 173, 1535-1548 e1516 (2018)". Users can run **script_for_project01.py** to obtain Fig. 1b, 1d, 3c, and 3d of the APEC paper, as follows:
 26 |
 27 | python script_for_project01.py -p $project01 -r $reference
 28 |
 29 | where $project01 is the **project01** folder, and $reference is the **reference** folder. The clustering step will take ~10 minutes, and the entire script will take ~20 minutes. The output figures will be placed in the $project01/figure/ folder, including:
 30 |
 31 | output figure|figure index in APEC paper
 32 | -|-
 33 | TSNE_by_APEC_with_notes_label.pdf|Fig. 1b
 34 | motif_XXX_on_tsne_by_APEC.pdf|Fig. 1d
 35 | pseudotime_trajectory_with_notes_label.pdf|Fig. 3c
 36 | motif_XXX_on_trajectory_by_APEC.pdf|Fig. 3d
 37 |
 38 | **NOTE**: Monocle is very sensitive to the input matrix. For example, with different versions of scipy (such as 1.0.0 and 1.2.1), the PCA matrices generated by sklearn differ slightly (by < 1.0e-7), and even this tiny difference can produce completely different trajectory shapes in Monocle. Different versions of many other related Python and R libraries/packages will likewise change the resulting trajectories.
 39 |
 40 |
 41 | ## (2) project02
 42 |
 43 | Project02 contains single-cell samples from the forebrain of adult mice, from "Preissl, S. et al. Single-nucleus analysis of accessible chromatin in developing mouse forebrain reveals cell-type-specific transcriptional regulation. Nat Neurosci 21, 432-439 (2018)". Users can run **script_for_project02.py** to obtain Fig. 2a, 2b, and 2e of the APEC paper, as follows:
 44 |
 45 | python script_for_project02.py -p $project02 -r $reference
 46 |
 47 | where $project02 is the **project02** folder, and $reference is the **reference** folder. The clustering step will take ~90 minutes, and the entire script will take ~2 hours. The output figures will be placed in the $project02/figure/ folder, including:
 48 |
 49 | output figure|figure index in APEC paper
 50 | -|-
 51 | TSNE_by_APEC_with_cluster_label.pdf|Fig. 2a
 52 | cell_cell_correlation_by_APEC_with_cluster_label.png|Fig. 2b
 53 | motif_XXX_on_tsne_by_APEC.pdf|Fig. 2e
 54 |
 55 | In addition, if users want to re-plot Fig. 2c, 2d, 2f, and 2g of the APEC paper, they can run **script_python_for_Figure_2c_2d.py** and **script_python_for_Figure_2f_2g.py** **after** running **script_for_project02.py**, as follows:
 56 |
 57 | python script_python_for_Figure_2c_2d.py -p $project02
 58 | python script_python_for_Figure_2f_2g.py -p $project02
 59 |
 60 | The output figures will be placed in the same folder as the two scripts, including:
 61 |
 62 | output figure|figure index in APEC paper
 63 | -|-
 64 | cluster_vs_gene.png|Fig. 2c
 65 | cluster_pearson_corr.png|Fig. 2d
 66 | clusters_vs_Excl_types.png|Fig. 2f
 67 | clusters_vs_Int_types.png|Fig. 2g
 68 |
 69 |
 70 | ## (3) project03
 71 |
 72 | Project03 contains single-cell samples of the leukemic stem and blast cells of two different patients (SU070 and SU353), from "Schep, A.N., Wu, B., Buenrostro, J.D. & Greenleaf, W.J. chromVAR: inferring transcription-factor-associated accessibility from single-cell epigenomic data. Nat Methods 14, 975-978 (2017)". Users can run **script_for_project03.py** to obtain Supplementary Fig. 2b of the APEC paper, as follows:
 73 |
 74 | python script_for_project03.py -p $project03
 75 |
 76 | where $project03 is the **project03** folder. The entire script will take ~5 minutes. The output figure will be placed in the $project03/figure/ folder, which is:
 77 |
 78 | output figure|figure index in APEC paper
 79 | -|-
 80 | TSNE_by_APEC_with_notes_label.pdf|Supplementary Fig. 2b
81 |
 82 | For a noisy fragment count matrix, please consider using the following call for cell clustering:
 83 |
 84 | clustering.cluster_byAccesson($project03, norm='filter')
 85 |
-------------------------------------------------------------------------------- /examples/project01.tar.gz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/QuKunLab/APEC/e15d6c42e05b2deb9c6b89f678905c6ef9878e2b/examples/project01.tar.gz -------------------------------------------------------------------------------- /examples/project02.tar.gz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/QuKunLab/APEC/e15d6c42e05b2deb9c6b89f678905c6ef9878e2b/examples/project02.tar.gz -------------------------------------------------------------------------------- /examples/project03.tar.gz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/QuKunLab/APEC/e15d6c42e05b2deb9c6b89f678905c6ef9878e2b/examples/project03.tar.gz -------------------------------------------------------------------------------- /examples/script_for_project01.py: --------------------------------------------------------------------------------
 1 | from APEC import clustering, plot, generate
 2 | from optparse import OptionParser
 3 | opts = OptionParser()
 4 | usage = "Clustering by accesson\nusage: %prog -p project01 -r reference"
 5 | opts = OptionParser(usage=usage, version="%prog 1.0")
 6 | opts.add_option("-p", help="The project01 folder.")
 7 | opts.add_option("-r", help="The reference folder")
 8 | options, arguments = opts.parse_args()
 9 | #
 10 | #
 11 | #### Cluster cells, plot the tSNE map, and calculate ARI; takes ~10 minutes.
 12 | #### The output file project01/figure/TSNE_by_APEC_with_notes_label.pdf is Fig. 1b of the APEC paper.
 13 | #
 14 | clustering.build_accesson(options.p, ngroup=600)
 15 | clustering.cluster_byAccesson(options.p, norm='probability')
 16 | plot.plot_tsne(options.p)
 17 | clustering.cluster_comparison(options.p+'/matrix/filtered_cells.csv',
 18 |                               options.p+'/result/cluster_by_APEC.csv',
 19 |                               exclude='UNK')
 20 | #
 21 | #
 22 | #### Calculate motif enrichment and plot motifs on the tSNE map; takes <1 hour on an 8-core CPU.
 23 | #### The output files project01/figure/motif_XXX_on_tsne_by_APEC.pdf are Fig. 1d of the APEC paper.
 24 | #
 25 | # generate.motif_matrix(options.p, genome_fa=options.r+'/hg19_chr.fa',
 26 | #                       background=options.r+'/tier1_markov1.norc.txt',
 27 | #                       meme=options.r+'/JASPAR2018_CORE_vertebrates_redundant_pfms_meme.txt',
 28 | #                       np=8)
 29 | # clustering.cluster_byMotif(options.p, np=8)
 30 | #
 31 | plot.plot_feature(options.p, space='tsne', feature='motif', name='Erg', clip=[0,7])
 32 | plot.plot_feature(options.p, space='tsne', feature='motif', name='CTCF', clip=[0,5])
 33 | plot.plot_feature(options.p, space='tsne', feature='motif', name='GATA1', clip=[0,25])
 34 | #
 35 | #
 36 | #### Construct the trajectory and plot motifs on it; takes ~1 minute.
 37 | #### The output file project01/figure/pseudotime_trajectory_with_notes_label.pdf is Fig. 3c of the APEC paper.
 38 | #### The output files project01/figure/motif_XXX_on_trajectory_by_APEC.pdf are Fig. 3d of the APEC paper.
 39 | #
 40 | generate.monocle_trajectory(options.p, around=[6, 25])
 41 | plot.plot_trajectory(options.p, angles=[60,90])
 42 | #
 43 | plot.plot_feature(options.p, space='trajectory', feature='motif', name='Hoxa9', angles=[60,90], clip=[-6,9])
 44 | plot.plot_feature(options.p, space='trajectory', feature='motif', name='GATA1', angles=[60,90], clip=[-10,10])
 45 | plot.plot_feature(options.p, space='trajectory', feature='motif', name='CEBPB', angles=[60,90], clip=[-10,10])
 46 | plot.plot_feature(options.p, space='trajectory', feature='motif', name='TCF4', angles=[60,90], clip=[-5,10])
 47 | #
 48 |
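#
# Note: as examples/README.md explains, the monocle trajectory shape is very
# sensitive to the exact versions of scipy and other dependencies, so a
# different environment may give an equivalent but differently shaped Fig. 3c.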
-------------------------------------------------------------------------------- /examples/script_for_project02.py: --------------------------------------------------------------------------------
 1 | from APEC import clustering, plot, generate
 2 | from optparse import OptionParser
 3 | opts = OptionParser()
 4 | usage = "Clustering by accesson\nusage: %prog -p project02 -r reference"
 5 | opts = OptionParser(usage=usage, version="%prog 1.0")
 6 | opts.add_option("-p", help="The project02 folder.")
 7 | opts.add_option("-r", help="The reference folder")
 8 | options, arguments = opts.parse_args()
 9 | #
 10 | #
 11 | #### Cluster cells, plot the tSNE map and the cell-cell correlation; takes ~90 minutes.
 12 | #### The output file project02/figure/TSNE_by_APEC_with_cluster_label.pdf is Fig. 2a of the APEC paper.
 13 | #### The output file project02/figure/cell_cell_correlation_by_APEC_with_cluster_label.png is Fig. 2b of the APEC paper.
 14 |
 15 | clustering.build_accesson(options.p, ngroup=600)
 16 | clustering.cluster_byAccesson(options.p, norm='probability')
 17 | plot.plot_tsne(options.p, rs=3)
 18 | plot.correlation(options.p, cell_label='cluster', clip=[0,1])
 19 | #
 20 | #
 21 | #### Generate gene scores from ATAC signals; takes ~30 minutes.
 22 | #
 23 | generate.gene_score(options.p, genome_gtf=options.r+'/mm10_RefSeq_genes.gtf')
 24 | #
 25 | #
 26 | #### Generate differential accessons between sub-clusters of excitatory neurons and inhibitory neurons;
 27 | #### takes ~1 minute.
 28 | #
 29 | log2Fold = 0.848    # ~log2(1.8); the 0.678 used for target '2' below is ~log2(1.6)
 30 | generate.get_nearby_genes(options.p)
 31 | generate.differential_feature(options.p, feature='accesson', target='0', vs='1,10,11,12', log2_fold=log2Fold)
 32 | generate.differential_feature(options.p, feature='accesson', target='1', vs='0,10,11,12', log2_fold=log2Fold)
 33 | generate.differential_feature(options.p, feature='accesson', target='10', vs='0,1,11,12', log2_fold=log2Fold)
 34 | generate.differential_feature(options.p, feature='accesson', target='11', vs='0,1,10,12', log2_fold=log2Fold)
 35 | generate.differential_feature(options.p, feature='accesson', target='12', vs='0,1,10,11', log2_fold=log2Fold)
 36 | generate.differential_feature(options.p, feature='accesson', target='2', vs='3,5,6,8', log2_fold=0.678)
 37 | generate.differential_feature(options.p, feature='accesson', target='3', vs='2,5,6,8', log2_fold=log2Fold)
 38 | generate.differential_feature(options.p, feature='accesson', target='5', vs='2,3,6,8', log2_fold=log2Fold)
 39 | generate.differential_feature(options.p, feature='accesson', target='6', vs='2,3,5,8', log2_fold=log2Fold)
 40 | generate.differential_feature(options.p, feature='accesson', target='8', vs='2,3,5,6', log2_fold=log2Fold)
 41 | #
 42 | #
 43 | #### Calculate motif enrichment; takes ~4 hours on an 8-core CPU.
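#
# Note: the plot_feature(..., feature='motif') calls further below assume the
# motif deviation matrix already exists, i.e. that the two commented commands
# after this note have been run once for this project.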
44 | #
 45 | # generate.motif_matrix(options.p, genome_fa=options.r+'/mm10_chr.fa',
 46 | #                       background=options.r+'/tier1_markov1.norc.txt',
 47 | #                       meme=options.r+'/JASPAR2018_CORE_vertebrates_redundant_pfms_meme.txt',
 48 | #                       np=8)
 49 | # clustering.cluster_byMotif(options.p, np=8)
 50 | #
 51 | #
 52 | #### Plot motifs on the tSNE map; takes ~1 minute.
 53 | #### The output files project02/figure/motif_XXX_on_tsne_by_APEC.pdf are Fig. 2e of the APEC paper.
 54 | #
 55 | plot.plot_feature(options.p, space='tsne', feature='motif', name='NEUROD1', clip=[-5,10])
 56 | plot.plot_feature(options.p, space='tsne', feature='motif', name='OLIG2', clip=[-5,8])
 57 | plot.plot_feature(options.p, space='tsne', feature='motif', name='MEF2C', clip=[-5,8])
 58 | plot.plot_feature(options.p, space='tsne', feature='motif', name='MEIS2', clip=[-4,6])
 59 | plot.plot_feature(options.p, space='tsne', feature='motif', name='Dlx2', clip=[-5,11])
 60 | plot.plot_feature(options.p, space='tsne', feature='motif', name='NOTO', clip=[-6,10])
 61 | plot.plot_feature(options.p, space='tsne', feature='motif', name='Sox2', clip=[-22,27])
 62 | plot.plot_feature(options.p, space='tsne', feature='motif', name='ETS1', clip=[-4,8])
 63 | #
 64 | #
 65 |
-------------------------------------------------------------------------------- /examples/script_for_project03.py: --------------------------------------------------------------------------------
 1 | from APEC import clustering, plot, generate
 2 | from optparse import OptionParser
 3 | opts = OptionParser()
 4 | usage = "Clustering by accesson\nusage: %prog -p project03"
 5 | opts = OptionParser(usage=usage, version="%prog 1.0")
 6 | opts.add_option("-p", help="The project03 folder.")
 7 | options, arguments = opts.parse_args()
 8 | #
 9 | #
 10 | #### Cluster cells, plot the tSNE map, and calculate ARI; takes ~5 minutes.
 11 | #### The output file project03/figure/TSNE_by_APEC_with_notes_label.pdf is Supplementary Fig. 2b of the APEC paper.
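# Note: for a noisy fragment count matrix, examples/README.md suggests
# clustering.cluster_byAccesson(options.p, norm='filter') as an alternative
# to the 'zscore' normalization used below.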
12 | # 13 | clustering.build_accesson(options.p, ngroup=700) 14 | clustering.cluster_byAccesson(options.p, norm='zscore') 15 | plot.plot_tsne(options.p, wt=1) 16 | clustering.cluster_comparison(options.p+'/matrix/filtered_cells.csv', 17 | options.p+'/result/cluster_by_APEC.csv') 18 | # 19 | -------------------------------------------------------------------------------- /examples/script_python_for_Figure_2c_2d.py: -------------------------------------------------------------------------------- 1 | import os,sys,numpy 2 | import pandas 3 | import matplotlib 4 | matplotlib.use('Agg') 5 | import matplotlib.pyplot as plt 6 | import seaborn 7 | import scipy.stats 8 | from optparse import OptionParser 9 | opts = OptionParser() 10 | usage = "Clustering by accesson\nusage: %prog -p project02" 11 | opts = OptionParser(usage=usage, version="%prog 1.0") 12 | opts.add_option("-p", help="The project02 folder.") 13 | options, arguments = opts.parse_args() 14 | # 15 | # 16 | def get_matrix(markers, gene_csv, cluster_csv): 17 | gene_df = pandas.read_csv(gene_csv, sep=',', index_col=0, 18 | engine='c', na_filter=False, low_memory=False).T 19 | cluster_df = pandas.read_csv(cluster_csv, sep='\t', index_col=0) 20 | clusters = list(set(cluster_df['cluster'].values)) 21 | matrix, reads = [], [] 22 | for cluster in clusters: 23 | cells = cluster_df.loc[cluster_df['cluster']==cluster].index.values 24 | expr = gene_df.loc[markers, cells].values.mean(axis=1) 25 | matrix.append(expr) 26 | reads.append(gene_df[cells].values.mean(axis=1)) 27 | matrix = numpy.array(matrix) 28 | matrix_df = pandas.DataFrame(matrix, index=['c_'+str(x) for x in clusters], columns=markers) 29 | matrix_df.T.to_csv('cluster_vs_genes.csv', sep=',') 30 | return 31 | # 32 | # 33 | def plot_heatmap(matrix_csv, cluster_order, gene_order, out_fig): 34 | matrix_df = pandas.read_csv(matrix_csv, sep=',', index_col=0) 35 | zscore = scipy.stats.zscore(matrix_df.values, axis=1) 36 | zscore = scipy.stats.zscore(zscore, axis=0) 37 | z_df = pandas.DataFrame(zscore, index=matrix_df.index, columns=matrix_df.columns) 38 | xx = [0, 520, 1040, 1260, 1380] 39 | yy = [0, 520, 1040, 1460, 1880] 40 | xclust = [500, 500, 200, 100, 100] 41 | ygene = [500, 500, 400, 400, 300] 42 | fig0 = plt.figure(figsize=(10,17)) 43 | x_clust = 0 44 | for ic,cluster in enumerate(cluster_order): 45 | y_gene = 0 46 | for ig,gene in enumerate(gene_order): 47 | ax = plt.subplot2grid((2200, 1500), 48 | (yy[ig], xx[ic]), rowspan=ygene[ig], colspan=xclust[ic]) 49 | y_gene += len(gene) 50 | im = ax.imshow(z_df.loc[gene,cluster], cmap='bwr', aspect='auto', 51 | interpolation='none', vmax=2.0, vmin=-2) 52 | if y_gene==len(z_df.index.values): 53 | ax.set_xticks(numpy.arange(0, len(cluster))) 54 | ax.set_xticklabels(cluster, fontsize=15) 55 | else: 56 | ax.set_xticklabels([]) 57 | if ic==0: 58 | ax.set_yticks(numpy.arange(0, len(gene))) 59 | ax.set_yticklabels(gene, fontsize=15) 60 | else: 61 | ax.set_yticklabels([]) 62 | x_clust += len(cluster) 63 | plt.savefig(out_fig, bbox_inches='tight') 64 | plt.close() 65 | return 66 | # 67 | # 68 | def pearson(csv, clusts, outfig): 69 | matrix_df = pandas.read_csv(csv, sep=',', index_col=0) 70 | matrix_df = matrix_df[clusts] 71 | seaborn.set_context('poster') 72 | corr = matrix_df.corr() 73 | seaborn.clustermap(corr, method='ward', cmap='YlOrRd') 74 | plt.savefig(outfig, bbox_inches='tight') 75 | plt.close() 76 | return 77 | # 78 | # 79 | #### Please run script_for_project02.py first !!!! 
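#
# Overview of the steps below: get_matrix() averages the per-cell gene scores
# within each cluster into cluster_vs_genes.csv; plot_heatmap() z-scores that
# table (per gene across clusters, then per cluster) and draws Fig. 2c;
# pearson() plots the cluster-cluster correlation of Fig. 2d.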
80 | #
 81 | #
 82 | gene_score_file = options.p+'/matrix/genes_scored_by_TSS_peaks.csv'
 83 | #
 84 | if not os.path.exists(gene_score_file):
 85 |     print('Error !!!!')
 86 |     print('Please run script_for_project02.py first !!!!')
 87 |     sys.exit()
 88 | #
 89 | #
 90 | #### These are marker genes reported by Preissl et al.
 91 | markers = ['Neurod1', 'Neurod2', 'Neurod6', 'Tbr1', 'Slc17a7',
 92 |            'Gad1', 'Gad2', 'Slc32a1', 'Dlx1', 'Dlx5',
 93 |            'Bcan', 'Aldh1l1', 'Slc1a2', 'Slc1a3',
 94 |            'Mobp', 'Mag', 'Plp1', 'Mog',
 95 |            'C1qb', 'Ctss', 'Spi1']
 96 | #
 97 | #
 98 | get_matrix(markers, gene_score_file, options.p+'/result/cluster_by_APEC.csv')
 99 | #
 100 | c_order = [['c_6', 'c_5', 'c_2', 'c_3', 'c_8'],
 101 |            ['c_0', 'c_10', 'c_1', 'c_12', 'c_11'],
 102 |            ['c_4', 'c_13'], ['c_9'], ['c_7']]
 103 | g_order = [['Neurod1', 'Neurod2', 'Neurod6', 'Tbr1', 'Slc17a7'],
 104 |            ['Gad1', 'Gad2', 'Slc32a1', 'Dlx1', 'Dlx5'],
 105 |            ['Bcan', 'Aldh1l1', 'Slc1a2', 'Slc1a3'],
 106 |            ['Mobp', 'Mag', 'Plp1', 'Mog'],
 107 |            ['C1qb', 'Ctss', 'Spi1']]
 108 | plot_heatmap('cluster_vs_genes.csv', c_order, g_order, 'cluster_vs_gene.png')
 109 | #
 110 | clusts = ['c_'+str(x) for x in range(0,14)]
 111 | pearson('cluster_vs_genes.csv', clusts, 'cluster_pearson_corr.png')
 112 | #
 113 | print('output figures:')
 114 | print('cluster_vs_gene.png')
 115 | print('cluster_pearson_corr.png')
 116 | #
 117 |
-------------------------------------------------------------------------------- /images/TSNE_by_APEC_with_cluster_label.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/QuKunLab/APEC/e15d6c42e05b2deb9c6b89f678905c6ef9878e2b/images/TSNE_by_APEC_with_cluster_label.jpg -------------------------------------------------------------------------------- /images/TSNE_by_APEC_with_notes_label.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/QuKunLab/APEC/e15d6c42e05b2deb9c6b89f678905c6ef9878e2b/images/TSNE_by_APEC_with_notes_label.jpg -------------------------------------------------------------------------------- /images/motif_GATA1_on_trajectory_by_APEC.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/QuKunLab/APEC/e15d6c42e05b2deb9c6b89f678905c6ef9878e2b/images/motif_GATA1_on_trajectory_by_APEC.jpg -------------------------------------------------------------------------------- /images/pseudotime_trajectory_with_notes_label.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/QuKunLab/APEC/e15d6c42e05b2deb9c6b89f678905c6ef9878e2b/images/pseudotime_trajectory_with_notes_label.jpg -------------------------------------------------------------------------------- /images/workflow.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/QuKunLab/APEC/e15d6c42e05b2deb9c6b89f678905c6ef9878e2b/images/workflow.jpg -------------------------------------------------------------------------------- /reference/README.md: --------------------------------------------------------------------------------
 1 |
 2 | Please visit the following website to download all necessary files:
 3 |
 4 | http://galaxy.ustc.edu.cn:30803/APEC/
 5 |
--------------------------------------------------------------------------------
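Individual files can then be fetched directly into the **reference** folder, e.g. (the exact file names on the server are an assumption here; they follow the default paths used by the scripts in this repository):

    wget http://galaxy.ustc.edu.cn:30803/APEC/hg19_chr.fa
    wget http://galaxy.ustc.edu.cn:30803/APEC/picard.jar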