├── db.sqlite3 ├── upload └── data ├── vrpg ├── __init__.py ├── models.py ├── admin.py ├── tests.py ├── apps.py └── views.py ├── module ├── __init__.py ├── gz.h ├── makefile ├── gz.cpp ├── ghAnno.cpp ├── gfa2v.h ├── nodeSeq.cpp ├── vgraph.h ├── gaf2rbed.cpp ├── anno.cpp ├── refbed.cpp └── refgene.cpp ├── vrpg_project ├── __init__.py ├── asgi.py ├── wsgi.py ├── urls.py └── settings.py ├── static └── images │ ├── vrpg.png │ ├── window2.png │ ├── zoomIn2.png │ ├── zoomOut2.png │ ├── arrowLeft3.png │ ├── arrowRight3.png │ ├── ref_collapse3.png │ └── question.svg ├── test ├── mc.genome.sh ├── mc.gff.sh ├── pggb.gff.sh ├── prepare.gff.sh ├── pggb.genome.sh ├── prepare.asm.sh ├── download.testData.sh ├── test.track.bed └── TEST_README.md ├── manage.py ├── templates └── vrpg │ └── local.head ├── create.local.py ├── LICENSE ├── host.jslib.local.sh ├── README.md └── script └── vrpg_preprocess.py /db.sqlite3: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /upload/data: -------------------------------------------------------------------------------- 1 | 2 | -------------------------------------------------------------------------------- /vrpg/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /module/__init__.py: -------------------------------------------------------------------------------- 1 | 2 | -------------------------------------------------------------------------------- /vrpg_project/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /vrpg/models.py: -------------------------------------------------------------------------------- 1 | from django.db import models 2 | 3 | # Create your models here. 4 | -------------------------------------------------------------------------------- /vrpg/admin.py: -------------------------------------------------------------------------------- 1 | from django.contrib import admin 2 | 3 | # Register your models here. 4 | -------------------------------------------------------------------------------- /vrpg/tests.py: -------------------------------------------------------------------------------- 1 | from django.test import TestCase 2 | 3 | # Create your tests here. 4 | -------------------------------------------------------------------------------- /static/images/vrpg.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/codeatcg/VRPG/HEAD/static/images/vrpg.png -------------------------------------------------------------------------------- /static/images/window2.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/codeatcg/VRPG/HEAD/static/images/window2.png -------------------------------------------------------------------------------- /static/images/zoomIn2.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/codeatcg/VRPG/HEAD/static/images/zoomIn2.png -------------------------------------------------------------------------------- /static/images/zoomOut2.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/codeatcg/VRPG/HEAD/static/images/zoomOut2.png -------------------------------------------------------------------------------- /static/images/arrowLeft3.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/codeatcg/VRPG/HEAD/static/images/arrowLeft3.png -------------------------------------------------------------------------------- /static/images/arrowRight3.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/codeatcg/VRPG/HEAD/static/images/arrowRight3.png -------------------------------------------------------------------------------- /static/images/ref_collapse3.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/codeatcg/VRPG/HEAD/static/images/ref_collapse3.png -------------------------------------------------------------------------------- /vrpg/apps.py: -------------------------------------------------------------------------------- 1 | from django.apps import AppConfig 2 | 3 | 4 | class VrpgConfig(AppConfig): 5 | default_auto_field = 'django.db.models.BigAutoField' 6 | name = 'vrpg' 7 | -------------------------------------------------------------------------------- /test/mc.genome.sh: -------------------------------------------------------------------------------- 1 | dataDir=`pwd` 2 | echo "SGDref $dataDir/GCF_000146045.2_R64_genomic.fna.gz" >> mc.genome.txt 3 | echo "DBVPG6765 $dataDir/DBVPG6765.genome.fa.gz" >> mc.genome.txt 4 | echo "Y12 $dataDir/Y12.genome.fa.gz" >> mc.genome.txt 5 | echo "SK1 $dataDir/SK1.genome.fa.gz" >> mc.genome.txt 6 | echo "DBVPG6044 $dataDir/DBVPG6044.genome.fa.gz" >> mc.genome.txt 7 | -------------------------------------------------------------------------------- /test/mc.gff.sh: -------------------------------------------------------------------------------- 1 | dataDir=`pwd` 2 | echo "SGDref#0 $dataDir/GCF_000146045.2_R64_genomic.gff.gz NA" >> mc.gff.txt 3 | echo "DBVPG6765#0 $dataDir/DBVPG6765.all_feature.gff.gz NA" >> mc.gff.txt 4 | echo "Y12#0 $dataDir/Y12.all_feature.gff.gz NA" >> mc.gff.txt 5 | echo "SK1#0 $dataDir/SK1.all_feature.gff.gz NA" >> mc.gff.txt 6 | echo "DBVPG6044#0 $dataDir/DBVPG6044.all_feature.gff.gz NA" >> mc.gff.txt 7 | 8 | -------------------------------------------------------------------------------- /test/pggb.gff.sh: -------------------------------------------------------------------------------- 1 | dataDir=`pwd` 2 | echo "SGDref#1 $dataDir/GCF_000146045.2_R64_genomic.gff.gz NA" >> pggb.gff.txt 3 | echo "DBVPG6765#1 $dataDir/DBVPG6765.all_feature.gff.gz NA" >> pggb.gff.txt 4 | echo "Y12#1 $dataDir/Y12.all_feature.gff.gz NA" >> pggb.gff.txt 5 | echo "SK1#1 $dataDir/SK1.all_feature.gff.gz NA" >> pggb.gff.txt 6 | echo "DBVPG6044#1 $dataDir/DBVPG6044.all_feature.gff.gz NA" >> pggb.gff.txt 7 | 8 | -------------------------------------------------------------------------------- /test/prepare.gff.sh: -------------------------------------------------------------------------------- 1 | dataDir=`pwd` 2 | echo "SGDref#HP0 $dataDir/GCF_000146045.2_R64_genomic.gff.gz NA" >> build.gff.txt 3 | echo "DBVPG6765#HP0 $dataDir/DBVPG6765.all_feature.gff.gz NA" >> build.gff.txt 4 | echo "Y12#HP0 $dataDir/Y12.all_feature.gff.gz NA" >> build.gff.txt 5 | echo "SK1#HP0 $dataDir/SK1.all_feature.gff.gz NA" >> build.gff.txt 6 | echo "DBVPG6044#HP0 $dataDir/DBVPG6044.all_feature.gff.gz NA" >> build.gff.txt 7 | 8 | -------------------------------------------------------------------------------- /test/pggb.genome.sh: -------------------------------------------------------------------------------- 1 | mkdir pggb_genome 2 | zcat GCF_000146045.2_R64_genomic.fna.gz > pggb_genome/SGDref.fa 3 | fastix -p "SGDref#1#" pggb_genome/SGDref.fa >> pggb_genome/all.fastix.fa 4 | for i in DBVPG6765 Y12 SK1 DBVPG6044 5 | do 6 | zcat $i.genome.fa.gz > pggb_genome/$i.genome.fa 7 | fastix -p "${i}#1#" pggb_genome/$i.genome.fa >> pggb_genome/all.fastix.fa 8 | done 9 | 10 | samtools faidx pggb_genome/all.fastix.fa 11 | 12 | -------------------------------------------------------------------------------- /test/prepare.asm.sh: -------------------------------------------------------------------------------- 1 | # primary reference -- GCF_000146045.2_R64_genomic.fna.gz 2 | dataDir=`pwd` 3 | echo "SGDref#HP0 $dataDir/GCF_000146045.2_R64_genomic.fna.gz" >> build.asm.txt 4 | echo "DBVPG6765#HP0 $dataDir/DBVPG6765.genome.fa.gz" >> build.asm.txt 5 | echo "Y12#HP0 $dataDir/Y12.genome.fa.gz" >> build.asm.txt 6 | echo "SK1#HP0 $dataDir/SK1.genome.fa.gz" >> build.asm.txt 7 | echo "DBVPG6044#HP0 $dataDir/DBVPG6044.genome.fa.gz" >> build.asm.txt 8 | -------------------------------------------------------------------------------- /vrpg_project/asgi.py: -------------------------------------------------------------------------------- 1 | """ 2 | ASGI config for vrpg_project project. 3 | 4 | It exposes the ASGI callable as a module-level variable named ``application``. 5 | 6 | For more information on this file, see 7 | https://docs.djangoproject.com/en/3.2/howto/deployment/asgi/ 8 | """ 9 | 10 | import os 11 | 12 | from django.core.asgi import get_asgi_application 13 | 14 | os.environ.setdefault('DJANGO_SETTINGS_MODULE', 'vrpg_project.settings') 15 | 16 | application = get_asgi_application() 17 | -------------------------------------------------------------------------------- /vrpg_project/wsgi.py: -------------------------------------------------------------------------------- 1 | """ 2 | WSGI config for vrpg_project project. 3 | 4 | It exposes the WSGI callable as a module-level variable named ``application``. 5 | 6 | For more information on this file, see 7 | https://docs.djangoproject.com/en/3.2/howto/deployment/wsgi/ 8 | """ 9 | 10 | import os 11 | 12 | from django.core.wsgi import get_wsgi_application 13 | 14 | os.environ.setdefault('DJANGO_SETTINGS_MODULE', 'vrpg_project.settings') 15 | 16 | application = get_wsgi_application() 17 | -------------------------------------------------------------------------------- /module/gz.h: -------------------------------------------------------------------------------- 1 | 2 | 3 | #include 4 | #include 5 | #include 6 | #include 7 | #include 8 | 9 | class iogzbuf : public std::streambuf{ 10 | gzFile iogz; 11 | static const int imem = 512; 12 | char buff[imem]; 13 | public: 14 | iogzbuf(){ 15 | setg(buff+4,buff+4,buff+4); 16 | } 17 | iogzbuf* open(const char *inFile,const char *mode); 18 | void close(); 19 | 20 | virtual int_type underflow(); 21 | }; 22 | 23 | class igzstream : public std::istream{ 24 | iogzbuf buff; 25 | public: 26 | igzstream(const char *inFile); 27 | igzstream* open(const char *inFile); 28 | void close(); 29 | }; 30 | 31 | 32 | 33 | 34 | 35 | -------------------------------------------------------------------------------- /manage.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | """Django's command-line utility for administrative tasks.""" 3 | import os 4 | import sys 5 | 6 | 7 | def main(): 8 | """Run administrative tasks.""" 9 | os.environ.setdefault('DJANGO_SETTINGS_MODULE', 'vrpg_project.settings') 10 | try: 11 | from django.core.management import execute_from_command_line 12 | except ImportError as exc: 13 | raise ImportError( 14 | "Couldn't import Django. Are you sure it's installed and " 15 | "available on your PYTHONPATH environment variable? Did you " 16 | "forget to activate a virtual environment?" 17 | ) from exc 18 | execute_from_command_line(sys.argv) 19 | 20 | 21 | if __name__ == '__main__': 22 | main() 23 | -------------------------------------------------------------------------------- /templates/vrpg/local.head: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | 10 | 11 | 12 | 13 | -------------------------------------------------------------------------------- /create.local.py: -------------------------------------------------------------------------------- 1 | 2 | #!/usr/bin/env python 3 | import re 4 | 5 | headFile = "templates/vrpg/local.head" 6 | indexFile = "templates/vrpg/index.html" 7 | localFile = "templates/vrpg/local.index.html" 8 | 9 | headStr = "" 10 | pat1 = re.compile('\s*') 11 | pat2 = re.compile('\s*<\/head>') 12 | with open(headFile) as fh: 13 | headStr = fh.read() 14 | 15 | with open(indexFile) as dxh,open(localFile,'w') as wfh: 16 | flag1 = False 17 | flag2 = False 18 | for line in dxh: 19 | if not flag1: 20 | wfh.write(line) 21 | if pat1.search(line): 22 | wfh.write(headStr) 23 | flag1 = True 24 | else: 25 | if flag2: 26 | wfh.write(line) 27 | else: 28 | if pat2.search(line): 29 | wfh.write(line) 30 | flag2 = True 31 | 32 | 33 | -------------------------------------------------------------------------------- /module/makefile: -------------------------------------------------------------------------------- 1 | 2 | CXX = g++ 3 | CPPFLAG = -D PYMODULE 4 | PYSO = minipg$(shell python3-config --extension-suffix) 5 | INCLUDE = $(shell python3 -m pybind11 --includes) 6 | CXXFLAG = -O3 -Wall -std=c++11 -pthread 7 | LIBS = -lz 8 | 9 | OBJ1 = gfa2view.o minipg.o gz.o 10 | 11 | OBJ2 = anno.o refgene.o refbed.o ghAnno.o gz.o 12 | OBJ3 = gaf2rbed.o 13 | OBJ4 = nodeSeq.o gz.o 14 | 15 | all : gfa2view GraphAnno gaf2rbed nodeSeq $(PYSO) 16 | 17 | gfa2view : $(OBJ1) 18 | $(CXX) $(CXXFLAG) $(OBJ1) -o $@ $(LIBS) 19 | 20 | GraphAnno : $(OBJ2) 21 | $(CXX) $(CXXFLAG) $(OBJ2) -o $@ $(LIBS) 22 | 23 | gaf2rbed : $(OBJ3) 24 | $(CXX) $(CXXFLAG) $(OBJ3) -o $@ 25 | 26 | nodeSeq : $(OBJ4) 27 | $(CXX) $(CXXFLAG) $(OBJ4) -o $@ $(LIBS) 28 | 29 | %.o : %.cpp 30 | $(CXX) $(CXXFLAG) -c $< -o $@ $(LIBS) 31 | 32 | $(PYSO) : minipg.cpp 33 | $(CXX) $(CPPFLAG) $(CXXFLAG) -shared -fPIC $(INCLUDE) minipg.cpp -o $(PYSO) 34 | 35 | .PHONY : clean 36 | 37 | clean: 38 | rm *.o 39 | 40 | -------------------------------------------------------------------------------- /test/download.testData.sh: -------------------------------------------------------------------------------- 1 | wget -c https://ftp.ncbi.nlm.nih.gov/genomes/all/GCF/000/146/045/GCF_000146045.2_R64/GCF_000146045.2_R64_genomic.fna.gz 2 | wget -c https://ftp.ncbi.nlm.nih.gov/genomes/all/GCF/000/146/045/GCF_000146045.2_R64/GCF_000146045.2_R64_genomic.gff.gz 3 | 4 | wget -c http://yjx1217.github.io/Yeast_PacBio_2016/data/Nuclear_Genome/SK1.genome.fa.gz 5 | wget -c http://yjx1217.github.io/Yeast_PacBio_2016/data/Nuclear_GFF/SK1.all_feature.gff.gz 6 | 7 | wget -c http://yjx1217.github.io/Yeast_PacBio_2016/data/Nuclear_Genome/Y12.genome.fa.gz 8 | wget -c http://yjx1217.github.io/Yeast_PacBio_2016/data/Nuclear_GFF/Y12.all_feature.gff.gz 9 | 10 | wget -c http://yjx1217.github.io/Yeast_PacBio_2016/data/Nuclear_Genome/DBVPG6765.genome.fa.gz 11 | wget -c http://yjx1217.github.io/Yeast_PacBio_2016/data/Nuclear_GFF/DBVPG6765.all_feature.gff.gz 12 | 13 | wget -c http://yjx1217.github.io/Yeast_PacBio_2016/data/Nuclear_Genome/DBVPG6044.genome.fa.gz 14 | wget -c http://yjx1217.github.io/Yeast_PacBio_2016/data/Nuclear_GFF/DBVPG6044.all_feature.gff.gz 15 | 16 | -------------------------------------------------------------------------------- /vrpg_project/urls.py: -------------------------------------------------------------------------------- 1 | """vrpg_project URL Configuration 2 | 3 | The `urlpatterns` list routes URLs to views. For more information please see: 4 | https://docs.djangoproject.com/en/3.2/topics/http/urls/ 5 | Examples: 6 | Function views 7 | 1. Add an import: from my_app import views 8 | 2. Add a URL to urlpatterns: path('', views.home, name='home') 9 | Class-based views 10 | 1. Add an import: from other_app.views import Home 11 | 2. Add a URL to urlpatterns: path('', Home.as_view(), name='home') 12 | Including another URLconf 13 | 1. Import the include() function: from django.urls import include, path 14 | 2. Add a URL to urlpatterns: path('blog/', include('blog.urls')) 15 | """ 16 | from django.contrib import admin 17 | from django.urls import path 18 | from vrpg import views 19 | 20 | urlpatterns = [ 21 | path('admin/', admin.site.urls), 22 | path('app/vrpg/',views.index), 23 | path('app/vrpg/init/',views.initGraph), 24 | path('app/vrpg/move/',views.showGraph), 25 | path('app/vrpg/seq2map/',views.seqQuery), 26 | path('app/vrpg/nodeInfo/',views.searchNode), 27 | ] 28 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2023 Zepu Miao 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | -------------------------------------------------------------------------------- /module/gz.cpp: -------------------------------------------------------------------------------- 1 | 2 | #include "gz.h" 3 | 4 | using namespace std; 5 | 6 | iogzbuf* iogzbuf::open(const char *inFile,const char *mode){ 7 | iogz = gzopen(inFile,mode); 8 | if(iogz == NULL){ 9 | return (iogzbuf *)0; 10 | }else{ 11 | return this; 12 | } 13 | } 14 | 15 | void iogzbuf::close(){ 16 | gzclose(iogz); 17 | } 18 | 19 | streambuf::int_type iogzbuf::underflow(){ 20 | if(gptr() < egptr()){ 21 | return traits_type::to_int_type(*gptr()); 22 | } 23 | // 24 | int num = gptr() - eback(); 25 | if(num > 4){ 26 | num = 4; 27 | } 28 | memmove(buff+(4-num),gptr()-num,num); 29 | int rNum = gzread(iogz,buff+4,imem-4); 30 | if(rNum <= 0){ 31 | return EOF; 32 | } 33 | setg(buff+(4-num),buff+4,buff+4+rNum); 34 | return traits_type::to_int_type(*gptr()); 35 | } 36 | 37 | igzstream::igzstream(const char *inFile):istream(0){ 38 | open(inFile); 39 | } 40 | 41 | igzstream* igzstream::open(const char *inFile){ 42 | if(buff.open(inFile,"rb")){ 43 | rdbuf(&buff); 44 | return this; 45 | }else{ 46 | return (igzstream *)0; 47 | } 48 | } 49 | 50 | void igzstream::close(){ 51 | buff.close(); 52 | } 53 | 54 | 55 | 56 | -------------------------------------------------------------------------------- /module/ghAnno.cpp: -------------------------------------------------------------------------------- 1 | 2 | #include 3 | #include 4 | int addRef_main(int argc,char **argv); 5 | int addBed_main(int argc,char **argv); 6 | int ndg_main(int argc,char **argv); 7 | 8 | void usage(){ 9 | std::cout<<"Usage: GraphAnno command options"< 3 | #include "vgraph.h" 4 | #include "gz.h" 5 | 6 | using namespace std; 7 | 8 | typedef struct{ 9 | int neoID; 10 | char ori; 11 | } Jnode; 12 | 13 | inline int parseAlign(std::string &align){ 14 | if(align == "*"){ 15 | return 0; 16 | } 17 | 18 | int value = 0; 19 | int tvalue = 0; 20 | for(char x : align){ 21 | if(x >= '0' && x <= '9'){ 22 | tvalue = tvalue * 10 + (x - '0'); 23 | }else{ 24 | if(x == 'M' || x == 'I' || x == 'S' || x == 'X' || x == '='){ 25 | value += tvalue; 26 | } 27 | tvalue = 0; 28 | } 29 | } 30 | return value; 31 | } 32 | 33 | inline void markSign(char mark,bool flip1,bool flip2,char &sign1,char &sign2){ 34 | switch(mark){ 35 | case '2': 36 | sign1 = '+'; 37 | sign2 = '+'; 38 | break; 39 | case '3': 40 | sign1 = '+'; 41 | sign2 = '-'; 42 | break; 43 | case '4': 44 | sign1 = '-'; 45 | sign2 = '+'; 46 | break; 47 | case '5': 48 | sign1 = '-'; 49 | sign2 = '-'; 50 | } 51 | if(flip1){ 52 | if(sign1 == '+'){ 53 | sign1 = '-'; 54 | }else{ 55 | sign1 = '+'; 56 | } 57 | } 58 | if(flip2){ 59 | if(sign2 == '+'){ 60 | sign2 = '-'; 61 | }else{ 62 | sign2 = '+'; 63 | } 64 | } 65 | 66 | } 67 | 68 | int readGFA(std::string &tmpFolder,igzstream &in,std::string &refStr,std::string &sep,std::unordered_map &mNodeLen,std::map &edgeMap,std::map &jumpMap,std::ofstream &afh,std::ofstream &acfh); 69 | 70 | int psRchrWalk(std::string &refPath,std::string &fullName,int refStart,std::unordered_map &mNodeLen,int &neoID,std::unordered_set &refNodeSet, 71 | std::unordered_set &flipSet,std::unordered_map &rCovMap,std::ofstream &nfh,std::ofstream &efh,std::ofstream &pfh,ofstream &ddfh); 72 | 73 | void psNWalk(std::string &nrPath,std::string &asmb,int walkStart,std::unordered_set &flipSet,std::unordered_map &tmap,std::unordered_map &mNodeLen,std::unordered_set &noutNode,std::ofstream &pfh,std::ofstream &nfh); 74 | 75 | // 76 | 77 | int psRchrPath(std::string &refPath,std::string &fullName,std::unordered_map &mNodeLen,std::map &edgeMap,std::map &jumpMap,int &neoID,std::unordered_set &refNodeSet, 78 | std::unordered_set &flipSet,std::set &rEset,std::set &rJset,std::map &jNeoMap,std::unordered_map &rCovMap,std::ofstream &nfh,std::ofstream &efh,std::ofstream &pfh,ofstream &ddfh 79 | ); 80 | 81 | void psNPath(std::string &nrPath,std::string &asmb,std::unordered_set &flipSet,std::map &jNeoMap,std::unordered_map &tmap,std::unordered_map &mNodeLen,std::unordered_set &noutSet,std::ofstream &pfh,std::ofstream &nfh); 82 | 83 | void psAllPath(bool ncalCov,std::string &tmpFolder,std::string &asmFile,std::string &sepStr,int &neoID,std::unordered_map &mNodeLen,std::map &edgeMap,std::map &jumpMap, 84 | std::ofstream &nfh,std::ofstream &efh,std::ofstream &covfh,std::ofstream &xcovfh,std::ofstream &cfh,std::string &pathDir,std::string &flipFile 85 | ); 86 | 87 | void gfa2view(bool ncalCov,char *rfChrFile,char *gfaFile,char *refName,char *sep,int range,int ex,bool index,int nocross,int nthread,int storeDep,char *outDir); 88 | void g2v_usage(); 89 | 90 | 91 | 92 | 93 | 94 | 95 | 96 | 97 | -------------------------------------------------------------------------------- /static/images/question.svg: -------------------------------------------------------------------------------- 1 | 2 | 17 | 19 | 42 | 44 | 45 | 47 | image/svg+xml 48 | 50 | 51 | 52 | 53 | 54 | 59 | 61 | 65 | 69 | 74 | 78 | 79 | 80 | 81 | 82 | -------------------------------------------------------------------------------- /vrpg_project/settings.py: -------------------------------------------------------------------------------- 1 | """ 2 | Django settings for vrpg_project project. 3 | 4 | Generated by 'django-admin startproject' using Django 3.2.4. 5 | 6 | For more information on this file, see 7 | https://docs.djangoproject.com/en/3.2/topics/settings/ 8 | 9 | For the full list of settings and their values, see 10 | https://docs.djangoproject.com/en/3.2/ref/settings/ 11 | """ 12 | 13 | from pathlib import Path 14 | import os 15 | 16 | # Build paths inside the project like this: BASE_DIR / 'subdir'. 17 | BASE_DIR = Path(__file__).resolve().parent.parent 18 | TEMPLATE_DIR = os.path.join(BASE_DIR,'templates') 19 | STATIC_DIR = os.path.join(BASE_DIR,'static') 20 | STATICFILES_DIRS = [STATIC_DIR,] 21 | 22 | 23 | # Quick-start development settings - unsuitable for production 24 | # See https://docs.djangoproject.com/en/3.2/howto/deployment/checklist/ 25 | 26 | # SECURITY WARNING: keep the secret key used in production secret! 27 | SECRET_KEY = 'django-insecure-)%gec_t4p0&t$6+66s6l=k@79%*q4f5hucf3%biu^q22h0vzro' 28 | 29 | # SECURITY WARNING: don't run with debug turned on in production! 30 | DEBUG = True 31 | 32 | ALLOWED_HOSTS = ['*'] 33 | 34 | 35 | # Application definition 36 | 37 | INSTALLED_APPS = [ 38 | 'django.contrib.admin', 39 | 'django.contrib.auth', 40 | 'django.contrib.contenttypes', 41 | 'django.contrib.sessions', 42 | 'django.contrib.messages', 43 | 'django.contrib.staticfiles', 44 | 'vrpg', 45 | ] 46 | 47 | MIDDLEWARE = [ 48 | 'django.middleware.security.SecurityMiddleware', 49 | 'django.contrib.sessions.middleware.SessionMiddleware', 50 | 'django.middleware.common.CommonMiddleware', 51 | 'django.middleware.csrf.CsrfViewMiddleware', 52 | 'django.contrib.auth.middleware.AuthenticationMiddleware', 53 | 'django.contrib.messages.middleware.MessageMiddleware', 54 | 'django.middleware.clickjacking.XFrameOptionsMiddleware', 55 | ] 56 | 57 | ROOT_URLCONF = 'vrpg_project.urls' 58 | 59 | TEMPLATES = [ 60 | { 61 | 'BACKEND': 'django.template.backends.django.DjangoTemplates', 62 | 'DIRS': [TEMPLATE_DIR,], 63 | 'APP_DIRS': True, 64 | 'OPTIONS': { 65 | 'context_processors': [ 66 | 'django.template.context_processors.debug', 67 | 'django.template.context_processors.request', 68 | 'django.contrib.auth.context_processors.auth', 69 | 'django.contrib.messages.context_processors.messages', 70 | ], 71 | }, 72 | }, 73 | ] 74 | 75 | WSGI_APPLICATION = 'vrpg_project.wsgi.application' 76 | 77 | 78 | # Database 79 | # https://docs.djangoproject.com/en/3.2/ref/settings/#databases 80 | 81 | DATABASES = { 82 | 'default': { 83 | 'ENGINE': 'django.db.backends.sqlite3', 84 | 'NAME': BASE_DIR / 'db.sqlite3', 85 | } 86 | } 87 | 88 | 89 | # Password validation 90 | # https://docs.djangoproject.com/en/3.2/ref/settings/#auth-password-validators 91 | 92 | AUTH_PASSWORD_VALIDATORS = [ 93 | { 94 | 'NAME': 'django.contrib.auth.password_validation.UserAttributeSimilarityValidator', 95 | }, 96 | { 97 | 'NAME': 'django.contrib.auth.password_validation.MinimumLengthValidator', 98 | }, 99 | { 100 | 'NAME': 'django.contrib.auth.password_validation.CommonPasswordValidator', 101 | }, 102 | { 103 | 'NAME': 'django.contrib.auth.password_validation.NumericPasswordValidator', 104 | }, 105 | ] 106 | 107 | 108 | # Internationalization 109 | # https://docs.djangoproject.com/en/3.2/topics/i18n/ 110 | 111 | LANGUAGE_CODE = 'en-us' 112 | 113 | TIME_ZONE = 'UTC' 114 | 115 | USE_I18N = True 116 | 117 | USE_L10N = True 118 | 119 | USE_TZ = True 120 | 121 | 122 | # Static files (CSS, JavaScript, Images) 123 | # https://docs.djangoproject.com/en/3.2/howto/static-files/ 124 | 125 | STATIC_URL = '/static/' 126 | 127 | # Default primary key field type 128 | # https://docs.djangoproject.com/en/3.2/ref/settings/#default-auto-field 129 | 130 | DEFAULT_AUTO_FIELD = 'django.db.models.BigAutoField' 131 | -------------------------------------------------------------------------------- /module/nodeSeq.cpp: -------------------------------------------------------------------------------- 1 | 2 | #include 3 | #include 4 | //#include 5 | #include 6 | #include 7 | #include 8 | #include 9 | #include 10 | #include 11 | #include "gz.h" 12 | 13 | using namespace std; 14 | 15 | typedef struct{ 16 | long long offset; 17 | int len; 18 | } SeqDx; 19 | 20 | string revComSeq(string &seq,map &trans){ 21 | string rvstr = ""; 22 | int n = seq.size() - 1; 23 | for(int pos = n; pos >= 0; pos--){ 24 | if(trans.find(seq[pos]) == trans.end()){ 25 | cerr<<"Warning: invalid character in the sequence. "< &trans){ 35 | trans.emplace('A','T'); 36 | trans.emplace('T','A'); 37 | trans.emplace('C','G'); 38 | trans.emplace('G','C'); 39 | trans.emplace('N','N'); 40 | trans.emplace('a','t'); 41 | trans.emplace('t','a'); 42 | trans.emplace('c','g'); 43 | trans.emplace('g','c'); 44 | trans.emplace('n','n'); 45 | } 46 | 47 | void node2seq(char *gfaFile,string &upDir){ 48 | unordered_set flipSet; 49 | int intSize = sizeof(int); 50 | string flipFile = upDir + "/flip.bw"; 51 | ifstream in(flipFile.c_str()); 52 | if(in){ 53 | int num,node; 54 | in.read((char *)&num,intSize); 55 | for(int i = 0; i < num; ++i){ 56 | in.read((char *)&node,intSize); 57 | flipSet.insert(node); 58 | } 59 | in.close(); 60 | } 61 | igzstream gfh(gfaFile); 62 | if(! gfh){ 63 | cerr<<"Error: file open failed. "< trans; 93 | makeTrans(trans); 94 | while(getline(gfh,line)){ 95 | if(line[0] == 'S'){ 96 | strStream << line; 97 | strStream >> tag >> snode >> seq; 98 | strStream.clear(); 99 | strStream.str(""); 100 | // 101 | int tnode; 102 | if(snode[0] == 's'){ 103 | string msNode = snode.substr(1); 104 | ofh << ">" << msNode << endl; 105 | tnode = atoi(msNode.c_str()); 106 | tgLen = snode.size() - 1; 107 | }else{ 108 | ofh << ">" << snode << endl; 109 | tnode = atoi(snode.c_str()); 110 | tgLen = snode.size(); 111 | } 112 | if(flipSet.empty()){ 113 | ofh << seq <> tag >> sceNode >> nwNode; 139 | strStream.clear(); 140 | strStream.str(""); 141 | if(tag == "S"){ 142 | sdx[nwNode-1] = sdx[sceNode - 1]; 143 | }else{ 144 | SeqDx tx = {0,sceNode}; 145 | sdx[nwNode-1] = tx; 146 | } 147 | 148 | } 149 | 150 | adfh.close(); 151 | } 152 | // 153 | ofstream xfh(dxFile.c_str()); 154 | if(! xfh){ 155 | cerr<<"Error: file open failed. "< input GFA or rGFA file."< 'upload' directory which including files generated by 'gfa2view' or 'vrpg_preprocess.py'."<> mapping/gaf.list 76 | echo "Y12#HP0 mapping/Y12.unstable.gaf NA" >> mapping/gaf.list 77 | echo "SK1#HP0 mapping/SK1.unstable.gaf NA" >> mapping/gaf.list 78 | echo "DBVPG6044#HP0 mapping/DBVPG6044.unstable.gaf NA" >> mapping/gaf.list 79 | ``` 80 | 81 | 3. Build index files for VRPG 82 | 83 | ``` 84 | python ../script/vrpg_preprocess.py --gafList mapping/gaf.list --rGFA results/upload/input.ref.gfa --outDir rGFA_upload --thread 5 --index --xDep 100 85 | ``` 86 | 87 | 4. Extract and index node sequences 88 | ``` 89 | ../module/nodeSeq --gfaFile results/upload/input.ref.gfa --upDir rGFA_upload 90 | ``` 91 | 92 | 5. Add the gene annotation track for the primary linear reference genome 93 | ``` 94 | ../module/GraphAnno addRef --inGFF GCF_000146045.2_R64_genomic.gff.gz --upDir rGFA_upload 95 | ``` 96 | 97 | 6. Overlay gene annotation for all nodes 98 | 99 | ``` 100 | # If build.gff.txt has not been created, type 101 | sh prepare.gff.sh 102 | 103 | ../module/GraphAnno nodeGene --gffList build.gff.txt --upDir rGFA_upload 104 | ``` 105 | 106 | 7. Add additional annotation tracks from BED files 107 | 108 | ``` 109 | ../module/GraphAnno addBed --inBed test.track.bed --upDir rGFA_upload 110 | ``` 111 | 112 | 8. Move the prepared data to VRPG's upload directory for rendering 113 | 114 | ``` 115 | # If a default graph has been determined (see step 7 when only having assemblies but not a pre-existing pangenome graph), this graph will be the additional graph to display. 116 | mv rGFA_upload/upload ../upload/rGFA_graph 117 | ``` 118 | 119 | 9. Start the Django development server 120 | 121 | ``` 122 | python3 manage.py runserver 123 | ``` 124 | 125 | 10. Access the visualized pangenome graph in VRPG 126 | 127 | Visit the following address in the web browser: 128 | http://127.0.0.1:8000/app/vrpg/ 129 | 130 | 131 | 132 | # For GFA-formatted pangenome graphs 133 | ## For GFA-formatted graphs generated by Minigraph-Cactus 134 | 1. Construct a pangenome graph with Minitraph-Cactus 135 | 136 | **Note**: The parameters and options used below are for this testing example only, which may not work best for all cases. Please refer to [this tutorial](https://github.com/ComparativeGenomicsToolkit/cactus/blob/master/doc/pangenome.md) for a more detailed instruction on building pangenome graphs with Minigraph-Cacuts. 137 | 138 | ``` 139 | # Assume that Minitraph-Cactus has been installed 140 | sh mc.genome.sh 141 | cactus-pangenome ./js mc.genome.txt --outDir mc --outName mc --reference SGDref --gfa 142 | 143 | ``` 144 | 145 | 2. Format conversion and index building 146 | 147 | ``` 148 | ../module/gfa2view --GFA mc/mc.gfa.gz --ref SGDref#0 --outDir mc_upload --index --range 2000 --thread 5 --xDep 100 149 | 150 | ``` 151 | 152 | 3. Extract node sequences and add annotation tracks 153 | 154 | ``` 155 | sh mc.gff.sh 156 | 157 | ../module/nodeSeq --gfaFile mc/mc.gfa.gz --upDir mc_upload/upload 158 | ../module/GraphAnno addRef --inGFF GCF_000146045.2_R64_genomic.gff.gz --upDir mc_upload/upload 159 | ../module/GraphAnno nodeGene --gffList mc.gff.txt --upDir mc_upload/upload 160 | ../module/GraphAnno addBed --inBed test.track.bed --upDir mc_upload/upload 161 | 162 | ``` 163 | 164 | 4. Move the prepared data to the upload directory of VRPG for rendering 165 | 166 | ``` 167 | mv mc_upload/upload ../upload/mc_graph 168 | ``` 169 | 170 | 5. Start the Django development server 171 | 172 | ``` 173 | python3 manage.py runserver 174 | ``` 175 | 176 | 6. Access the visualized pangenome graph in VRPG 177 | 178 | Visit the following address in the web browser: 179 | http://127.0.0.1:8000/app/vrpg/ 180 | 181 | 182 | 183 | ## For GFA-formatted graphs generated by PGGB 184 | 185 | 1. Construct a pangenome graph with PGGB 186 | 187 | **Note**: The parameters and options used below are for this testing example only, which may not work best for all cases. Please refer to [this tutorial](https://github.com/pangenome/pggb) for a more detailed instruction on building pangenome graphs with Minigraph-Cacuts. 188 | 189 | ``` 190 | # Assume that fastix and pggb have been installed 191 | sh pggb.genome.sh 192 | pggb -i pggb_genome/all.fastix.fa -t 10 -p 95 -n 5 -k 23 -o pggb 193 | 194 | ``` 195 | 2. Format conversion and index building 196 | 197 | ``` 198 | # Please replace 'all.fastix.fa.*.final.gfa' with the actual file name 199 | ../module/gfa2view --GFA pggb/all.fastix.fa.*.final.gfa --ref SGDref#1 --outDir pggb_upload --index --range 2000 --thread 5 --xDep 100 200 | 201 | ``` 202 | 203 | 3. Extract node sequences and add annotation tracks 204 | 205 | ``` 206 | sh pggb.gff.sh 207 | 208 | # Please replace 'all.fastix.fa.*.final.gfa' with the actual file name 209 | ../module/nodeSeq --gfaFile pggb/all.fastix.fa.*.final.gfa --upDir pggb_upload/upload 210 | ../module/GraphAnno addRef --inGFF GCF_000146045.2_R64_genomic.gff.gz --upDir pggb_upload/upload 211 | ../module/GraphAnno nodeGene --gffList pggb.gff.txt --upDir pggb_upload/upload 212 | ../module/GraphAnno addBed --inBed test.track.bed --upDir pggb_upload/upload 213 | ``` 214 | 215 | 4. Move the prepared data to the upload directory of VRPG for rendering 216 | 217 | ``` 218 | mv pggb_upload/upload ../upload/pggb_graph 219 | ``` 220 | 221 | 222 | 5. Start the Django development server 223 | 224 | ``` 225 | python3 manage.py runserver 226 | ``` 227 | 228 | 6. Access the visualized pangenome graph in VRPG 229 | 230 | Visit the following address in the web browser: 231 | http://127.0.0.1:8000/app/vrpg/ 232 | 233 | 234 | 235 | 236 | 237 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # VRPG 2 | An interactive visualization and interpretation framework of reference-projected pangenome graphs 3 | 4 | | | | 5 | | --- | --- | 6 | | | | 7 | 8 | 9 | # Description 10 | 11 | VRPG is a web-based interactive Visualization and interpretation framework for linear-Reference-projected Pangenome Graphs. VRPG provides efficient and intuitive supports for exploring and annotating pangenome graphs along a linear-genome-based coordinate system (e.g., that of a primary linear reference genome). Moreover, VRPG offers many unique features such as in-graph path highlighting for graph-constituent input assemblies, copy number characterization for graph-embedding nodes, graph-based mapping for query sequences, all of which are highly valuable for researchers working with pangenome graphs. Additionally, VRPG enables side-by-side visualization between the graph-based pangenome representation and the conventional primary-linear-reference-genome-based feature annotations, therefore seamlessly bridging the graph and linear genomic contexts. 12 | 13 | Pangenome graphs encoded in both rGFA and GFAv1 formats are supported. 14 | 15 | To further demonstrate its functionality and scalability, we provide a demonstration website for VRPG at https://evomicslab.org/app/vrpg/ to showcase its capacity and functionality with the cutting-edge yeast and human reference pangenome graphs derived from hundreds of high-quality genome assemblies. 16 | 17 | For more details regarding VRPG's functionalities and usage, please check out the documentation at https://evomicslab.org/app/vrpg/manual/. 18 | 19 | 20 | # Citation 21 | 22 | Zepu Miao, Jia-Xing Yue*. (2025) Interactive visualization and interpretation of pangenome graphs by linear-reference-based coordinate projection and annotation integration. Genome Research, 35(2):296-310. (doi: 10.1101/gr.279461.124; demonstration available at https://www.evomicslab.org/app/vrpg/; software available at https://github.com/codeatcg/VRPG) [[LINK](https://genome.cshlp.org/content/35/2/296.full)] 23 | 24 | # Installation 25 | Dependencies: 26 | `git`, `wget`, `bash`, `gcc/g++ >= 4.9`, `make`, `python3 (>=3.6)`, `pip`, `zlib`, `zlib-devel` 27 | 28 | ``` 29 | 30 | # To install a historical version of VRPG, please access https://github.com/codeatcg/VRPG/releases and download the source code. 31 | # To install the latest version of VRPG, please type 32 | pip install Django==3.2.4 pybind11 33 | git clone https://github.com/codeatcg/VRPG --recursive 34 | cd VRPG/module 35 | make 36 | 37 | # By default, the javascript packages that VRPG depends on are loaded from CDN. Users can optionally host these packages locally by typing: 38 | python create.local.py 39 | sh host.jslib.local.sh local 40 | # In case that users want to switch back to loading packages from CDN, type: 41 | sh host.jslib.local.sh cdn 42 | 43 | # To install the dependence (minigraph) for sequence-to-graph mapping, type: 44 | cd VRPG 45 | mkdir bin 46 | cd bin 47 | wget -c https://github.com/lh3/minigraph/releases/download/v0.20/minigraph-0.20_x64-linux.tar.bz2 48 | tar -jxf minigraph-0.20_x64-linux.tar.bz2 49 | mv minigraph-0.20_x64-linux/minigraph ./ 50 | 51 | ``` 52 | 53 | # Input data preparation 54 | 55 | The naming scheme of assembly should follow PanSN prefix naming specification. Briefly, the assembly's name consists of sample name, delimiter, and haplotype name, e.g., sampleA#0. The haplotype name can be numeric (e.g. '0', '1', or '2') or characters (e.g., 'collapsed' or 'phased') or both (e.g., 'h1' or 'h2'). 56 | 57 | When indexing the pangenome graphs, users can set the hard limit for maximal search depth via the option ‘--xDep’ (for VRPG versions >=0.1.3). The default value for this option is 100. Setting this value too small may cause incomplete graph traversing for large bubbles. 58 | 59 | ## For rGFA-formatted pangenome graphs 60 | 61 | ### When already having the rGFA-formatted graph file 62 | 63 | Just type the following command to feed the graph file to VRPG: 64 | 65 | ``` 66 | Python ./script/vrpg_preprocess.py --rGFA all.gfa --outDir out_folder --index --xDep 100 67 | ``` 68 | 69 | That said, if users want to use VRPG's assembly-to-path highlighting function, additional assembly-to-graph mapping files (in the GAF format) are needed, which can be generated by minigraph using the command '-cxasm --vc' (see [TEST_README.md](https://github.com/codeatcg/VRPG/blob/main/test/TEST_README.md) for more details). Once this is done, prepare a tab-delimited gaf_file.list file formatted as follows: 70 | 71 | ``` 72 | sample1#H1 sample1.H1.gaf 73 | sample2#0 sample2.0.gaf 74 | sample3#1 sample3.1.gaf 75 | sample3#2 sample3.2.gaf 76 | ``` 77 | 78 | Then run the following command to prepare all the input information for VRPG. 79 | 80 | ``` 81 | Python ./script/vrpg_preprocess.py --rGFA all.gfa --gafList gaf_file.list --outDir out_folder --index --xDep 100 82 | ``` 83 | 84 | 85 | ### When not having the rGFA-formatted graph file 86 | 87 | First prepare a tab-delimited asm_file.list file formatted as follows: 88 | 89 | ``` 90 | sample1#H1 sample1.H1.fa 91 | sample2#0 sample2.0.fa 92 | sample3#1 sample3.1.fa 93 | sample3#2 sample3.2.fa 94 | ``` 95 | **Note**: The assembly defined on the first line in this file will be used as the primary linear reference genome for rGFA graph construction. 96 | 97 | Then run the following command to create the rGFA-formatted pangenome graph and to generate all needed files for VRPG. 98 | 99 | ``` 100 | Python ./script/vrpg_preprocess.py –-minigraph '/software/minigraph' --asmList asm_file.list –-outDir out_folder --index --xDep 100 101 | ``` 102 | 103 | **Note**: Here '/software/minigraph' represents the absolute path of the minigraph executable file. 104 | 105 | ## For GFAv1-formatted pangenome graphs 106 | 107 | For graphs encoded in GFAv1 format, VRPG requires its node (i.e., segment) name to be numeric. Graphs generated by Minigraph-Cactus and PGGB both satisfy this requirement. When dealing with a pangenome graph file of which the node/segment names are not numeric, users need to modify the graph first. Also notice that all path names defined in the graph should follow PanSN prefix naming specification. This requirement can be met by using proper assembly names before constructing the graph. Once these checks are all passed, run the following command to prepare the graph for VRPG: 108 | 109 | 110 | ``` 111 | ./module/gfa2view --GFA in.gfa --ref refName --outDir output_dir --index --xDep 100 --range 2000 --thread 10 112 | ``` 113 | 114 | **Note**: The gfa2view step can be further divided into two steps when needed. This can be useful for testing different indexing parameters, but note that the index files generated in the new run will automatically overwrite the old ones. To run the gfa2view step in two steps, type: 115 | ``` 116 | # Step 1: format conversion and assembly-to-graph mapping depth calculation. This step cannot be parallelized. 117 | 118 | ./module/gfa2view --GFA in.gfa --ref refName --outDir output_dir 119 | 120 | # Step 2: graph indexing. This step can be parallelized. 121 | 122 | ./module/gfa2view --outDir output_dir --index --xDep 100 --range 2000 --thread 10 123 | 124 | ``` 125 | 126 | **Note**: For now, the memory consumption of 'gfa2view' is proportional to number of threads. A trade-off between speed and and memory consumption needs to be considered. 127 | 128 | **Note**: For a primary linear reference genome with many small contigs (that will not be used in graph visualization), it is recommended to specify the '--refChr' option for 'gfa2view' to let VRPG only consider major chromosomes/contigs for graph indexing. This will help to substantially reduce the graph indexing time. For this option, a text file containing the names of the specified chromosomes/contigs (one line per chromosome/contig name) is needed. 129 | 130 | 131 | ## Annotation tracks 132 | 133 | A unique feature of VRPG lies in its native support for a side-by-side visualization between the pangenome graph and multiple annotation tracks based on the same primary linear reference coordinate system. For now, VRPG accepts annotation tracks defined in GFF3 and BED formats. 134 | 135 | For the gene annotation track defined in GFF3 format, the maximal number of tiled layers (i.e., overlapped genes) to display is 255 and the maximal number of RNA isoforms per gene to display is 20 by default, which can be further adjusted using the '--rnaMax' option with the 'GraphAnno' command in VRPG. 136 | 137 | For other annotation tracks defined in the BED format, different tracks are specified via the track line notation defined in the [BED](https://asia.ensembl.org/info/website/upload/bed.html) file. The maximal number of tiled layers to display is 50 by default, which can be further adjusted using the ‘--layer’ option with the 'GraphAnno' command in VRPG. When the 5th column (i.e., score) of the BED file is defined, VRPG will assume this annotation track is score-based and render all records on the same layer. Otherwise, VRPG will try to separate overlapping records from the same track into different layers. 138 | 139 | 140 | ``` 141 | # To add the gene annotation track based on the GFF3 file (based on the primary linear reference) 142 | # run 'GraphAnno addRef --help' for more help information 143 | ./module/GraphAnno addRef --inGFF gffFile --chrTrans chrTransFile --upDir upload 144 | 145 | # To add other annotation tracks from BED files 146 | ./module/GraphAnno addBed --inBed track.bed --upDir upload 147 | 148 | # To add annotation for all nodes (reference and non-reference). 149 | # run 'GraphAnno nodeGene --help' for more help information 150 | ./module/GraphAnno nodeGene --gffList gffListFile --upDir upload 151 | 152 | ``` 153 | 154 | ## Node sequence extraction 155 | Optionally, VRPG can be configured to report the exact genomic sequence for each node. 156 | ``` 157 | # To extract the node sequences and build their index files for VRPG 158 | ./module/nodeSeq --gfaFile graph.gfa --upDir upload 159 | 160 | ``` 161 | 162 | 163 | # Deployment 164 | ## For local server or personal computer running a Linux-based operating system (e.g., CentOS, Ubuntu) 165 | 166 | 1. Moving all files generated in directory 'upload' during data preparation to the empty directory named 'upload'. If users want to visualize more than one pangenome graphs, please see [TEST_README.md](https://github.com/codeatcg/VRPG/blob/main/test/TEST_README.md) for more details. 167 | 168 | 2. Starting the Django development server 169 | 170 | ``` 171 | python3 manage.py runserver 172 | 173 | If all is well you will see the output: 174 | Django version 3.2.4, using settings 'primers_project.settings' 175 | Starting development server at http://127.0.0.1:8000/ 176 | ``` 177 | 178 | 3. Open http://127.0.0.1:8000/app/vrpg/ in your web browser to access VRPG's web portal. 179 | 180 | **Note**: For large pangenome graphs, it is recommended to use a computing server for the data preparation of VRPG and then transfer the preprocessed data to the 'upload' directory of the local server or personal computer. 181 | 182 | ## For a remote server running a Linux-based operating system (e.g., CentOS, Ubuntu) 183 | 184 | 1. Logging in to the remote server and start the Django development server by running 185 | 186 | ``` 187 | python3 manage.py runserver 0.0.0.0:8000 188 | ``` 189 | 2. On a local computer, using http:\:8000/app/vrpg/ to use VRPG. 190 | 191 | **Note**: The port 8000 can be any port number that has not occupied by any other process and is allowed by the firewall for outside visiting. If the user is familiar with nginx or apache, VRPG can be further configured using any of them. 192 | 193 | 194 | # Run VRPG with the testing example 195 | Enter the directory [test](https://github.com/codeatcg/VRPG/tree/main/test) and follow the instructions from [TEST_README.md](https://github.com/codeatcg/VRPG/blob/main/test/TEST_README.md) 196 | 197 | 198 | # Additional Tips 199 | 1. For graphs generated by PGGB, the duplicate sequence in the primary linear reference assembly might have been collapsed. VRPG will re-insert shadow nodes and edges into the graph (by adding new L and S lines) to restore the linearity during graph indexing (See the associated manuscript for more technical details). When primary-linear-reference path highlighting is enabled, these inserted shadow nodes can be recognized by the fact that no highlighted path goes through them. Also, the node ID of these shadow nodes should be much larger compared with the real primary-linear-reference nodes aside to them. 200 | 201 | 202 | 2. For now, VRPG does not automatically clean the directories created by running 'Sequence-to-graph mapping' jobs (i.e., '/upload/\*/mapping/task_*') . To save the disk space of the hosting server, we suggest users (or the system manager) to clean these files periodically. 203 | 204 | 205 | 206 | 207 | 208 | 209 | 210 | 211 | 212 | -------------------------------------------------------------------------------- /module/vgraph.h: -------------------------------------------------------------------------------- 1 | 2 | #include 3 | #include 4 | #include 5 | #include 6 | #include 7 | #include 8 | #include 9 | #include 10 | 11 | #define FIELDSIZE 64 12 | 13 | typedef struct{ 14 | char seqid[FIELDSIZE]; 15 | int start; 16 | int end; 17 | char geneID[FIELDSIZE]; 18 | char geneName[FIELDSIZE]; 19 | char strand; 20 | } AnnoLine; 21 | 22 | typedef struct{ 23 | long long offset; 24 | int num; 25 | } AnnoDx; 26 | 27 | typedef struct{ 28 | int node1; 29 | int node2; 30 | int reStart1; 31 | int reStart2; 32 | char name[FIELDSIZE]; 33 | uint8_t layer; 34 | char strand; 35 | char type; 36 | uint8_t num; 37 | } GeneNode; 38 | 39 | typedef struct{ 40 | int node1; 41 | int node2; 42 | int reStart1; 43 | int reStart2; 44 | char name[FIELDSIZE]; 45 | uint16_t layer; 46 | char strand; 47 | float score; 48 | } BedNode; 49 | 50 | typedef struct{ 51 | float start; 52 | float end; 53 | int layer; 54 | char strand; 55 | char margin; 56 | } FigGene; 57 | 58 | typedef struct{ 59 | float start; 60 | float end; 61 | float score; 62 | //int layer; 63 | char strand; 64 | std::string name; 65 | } FigBed; 66 | 67 | typedef struct{ 68 | float start; 69 | float end; 70 | int layer; 71 | char strand; 72 | uint8_t num; 73 | } FigEle; 74 | 75 | typedef struct{ 76 | long long offset; 77 | int len; 78 | } SeqDx; 79 | 80 | inline void asmSplit(const std::string &r_chr, const std::string &sep,std::string &tName,std::string &t_hap,std::string &tchr){ 81 | auto t_pos = r_chr.find(sep); 82 | if(t_pos != std::string::npos){ 83 | tName = r_chr.substr(0,t_pos); 84 | auto h_pos = t_pos + sep.length(); 85 | t_pos = r_chr.find(sep,t_pos+1); 86 | if(t_pos != std::string::npos){ 87 | t_hap = r_chr.substr(h_pos,t_pos-h_pos); 88 | tchr = r_chr.substr(t_pos + sep.length()); 89 | }else{ 90 | t_hap = "0"; 91 | tchr = r_chr.substr(h_pos); 92 | } 93 | }else{ 94 | tName = "REF"; 95 | t_hap = "HAP"; 96 | tchr = r_chr; 97 | } 98 | } 99 | 100 | inline char getMark(char ori_1,char ori_2, char sign='+'){ 101 | char mark = '0'; 102 | if(ori_1 == sign){ 103 | if(ori_2 == sign){ 104 | mark = '2'; 105 | }else{ 106 | mark = '3'; 107 | } 108 | }else{ 109 | if(ori_2 == sign){ 110 | mark = '4'; 111 | }else{ 112 | mark = '5'; 113 | } 114 | } 115 | return mark; 116 | } 117 | 118 | typedef int NodeType; 119 | 120 | inline char revMark(char mark){ 121 | char revVal = '0'; 122 | if(mark == '2'){ 123 | revVal = '5'; 124 | }else if(mark == '5'){ 125 | revVal = '2'; 126 | }else{ 127 | revVal = mark; 128 | } 129 | return revVal; 130 | } 131 | 132 | struct LEdge{ 133 | NodeType node1; 134 | NodeType node2; 135 | char mark; 136 | 137 | bool operator < (const LEdge& tedge)const{ 138 | if(node1 < tedge.node1){ 139 | return true; 140 | } 141 | if(node1 > tedge.node1){ 142 | return false; 143 | } 144 | 145 | if(node2 < tedge.node2){ 146 | return true; 147 | } 148 | if(node2 > tedge.node2){ 149 | return false; 150 | } 151 | 152 | if(mark < tedge.mark){ 153 | return true; 154 | } 155 | if(mark > tedge.mark){ 156 | return false; 157 | } 158 | return false; 159 | } 160 | 161 | }; 162 | 163 | typedef struct LEdge NEdge; 164 | 165 | typedef struct{ 166 | NodeType node1; 167 | NodeType node2; 168 | char mark; 169 | } CEdge; 170 | 171 | typedef struct{ 172 | NodeType node; 173 | char mark; 174 | } ENode; 175 | 176 | typedef struct{ 177 | int s_nid; 178 | int e_nid; 179 | int gNum; 180 | } Nid; 181 | 182 | typedef std::map Ndic; 183 | 184 | typedef struct{ 185 | int rByte; 186 | int ranNum; 187 | } ChrRange; 188 | 189 | typedef struct{ 190 | int ranStart; 191 | int ranEnd; 192 | long long offByte; 193 | int ranNum; 194 | 195 | } OneRange; 196 | 197 | typedef struct{ 198 | int len; 199 | int asmb; 200 | } LenAsm; 201 | 202 | struct INode{ 203 | int node; 204 | int start; 205 | int pend; 206 | int achr; 207 | 208 | bool operator < (const INode &tnode)const{ 209 | return node < tnode.node; 210 | } 211 | }; 212 | typedef struct INode SANode; 213 | 214 | typedef struct{ 215 | int start; 216 | int pend; 217 | int achr; 218 | } ANode; 219 | 220 | typedef struct{ 221 | int node; 222 | int start; 223 | int pend; 224 | } RNode; 225 | 226 | typedef struct{ 227 | char diff; 228 | char ori; 229 | } LagNode; 230 | 231 | struct PaRa{ 232 | int frag; 233 | int pathNum; 234 | int firNode; 235 | char firOri; 236 | std::list lag; 237 | std::list posList; 238 | std::list cigarList; 239 | bool operator < (const PaRa &pr)const{ 240 | return frag < pr.frag; 241 | } 242 | }; 243 | 244 | typedef PaRa PathRang; 245 | 246 | typedef struct { 247 | long long rOffsize; 248 | int rCount; 249 | } EdRang; 250 | 251 | typedef struct{ 252 | int start; 253 | int pend; 254 | } RanPos; 255 | 256 | typedef struct{ 257 | int pnum; 258 | int start; 259 | int end; 260 | int loc; 261 | } PathPos; 262 | 263 | bool numSort(PathPos &a, PathPos &b); 264 | 265 | std::string getSep(std::string &sepFile); 266 | 267 | class GraphRange{ 268 | 269 | public: 270 | GraphRange(std::string &t_upDir,int index); 271 | std::vector draw_node; 272 | std::vector draw_pos; 273 | std::vector dnode_len; 274 | std::vector > draw_edge; 275 | std::vector genome; 276 | std::vector nnames; 277 | std::vector hnGroup; 278 | std::vector hLinks; 279 | std::vector hDir; 280 | std::vector hEdgeAsm; 281 | 282 | //annotation gene 283 | std::vector ndGenePos; 284 | std::vector geneVec; 285 | std::vector layerVec; 286 | std::vector strandVec; 287 | std::vector mgFlagVec; 288 | 289 | //annotation exon 290 | std::vector ndExonPos; 291 | std::vector rnaVec; 292 | std::vector eLayerVec; 293 | std::vector eStrandVec; 294 | std::vector eFlagVec; 295 | std::vector eNumVec; 296 | 297 | //annotation cds 298 | std::vector ndCDSPos; 299 | std::vector cdsVec; 300 | std::vector cLayerVec; 301 | std::vector cNumVec; 302 | 303 | //annotation track in bed format 304 | std::vector tkNameVec; 305 | std::vector tkDesVec; 306 | std::vector tkColVec; 307 | std::vector tkCumVec; 308 | std::vector tkItem; 309 | std::vector rBedPos; 310 | std::vector rBedName; 311 | std::vector rBedLayer; 312 | std::vector rBedScore; 313 | std::vector rBedStrand; 314 | 315 | std::vector tickValue; 316 | std::vector tickPos; 317 | 318 | float figScale; 319 | 320 | //alignment 321 | std::vector qChr; 322 | std::vector qStart; 323 | std::vector qEnd; 324 | std::vector qPath; 325 | std::vector qCigar; 326 | 327 | void formatGraph(std::string &asmb,std::string &sChr,int sStart,int sEnd,int ex,int wStart,int wWidth,int wCut,int wY,int queryDep,int varLen,bool sim,bool refSim); 328 | void edgeWrite(std::string &spChrFile,int rangeSize,int ex,int nocross,int nthread,int storeDep); 329 | 330 | private: 331 | std::string nodeFile,edgeFile,pathDir,asmFile,chrFile,comChrFile,sepFile; 332 | std::string upDir; 333 | std::string sep; 334 | 335 | int indexFlag; 336 | // 337 | void conformEdge(NodeType &node1,NodeType &node2,char mark,std::unordered_map > &iedge,std::unordered_map > &oedge); 338 | //---------- read unindexed dada ------------------- 339 | void parseEdge(std::unordered_map > &iedge,std::unordered_map > &oedge); 340 | void parseNode(std::string &sChr,int sStart,int sEnd,int ex,std::vector &rangeNode,std::unordered_map &exNode,std::unordered_map &info,int &realLen); 341 | // 342 | void hAsmNode(std::string &taskDir,bool refSim,std::unordered_map &info,int asmNum,std::map &r_edge_dict,std::unordered_map &nid_dict); 343 | void hEdge2fig(std::unordered_map &nid_dict,std::map &h_edge_dict,std::map &r_edge_dict); 344 | void hAsmNode2(bool refSim,std::vector &asmNumVec,std::map &r_edge_dict,std::unordered_map &nid_dict); 345 | 346 | //---------- for sequence alignment ------------------- 347 | bool f_cigar2pos(int qStart,int rStart,std::unordered_map &info,std::string &cigar,std::vector &nodes,std::vector &qCigarVec,std::vector &qPosVec); 348 | bool f_path2pos(int rStart,std::unordered_map &info,std::vector &nodes,std::vector &qPosVec); 349 | 350 | //---------- intermediate functions used by functions to deal with both indexed and unindexed data ------------------- 351 | std::string subCigar(std::string &rgCigar,int ndStart,int ndEnd); 352 | /* 353 | Highlight assembly path: 354 | visCigar is true. 355 | 356 | Highlight query path (sequence alignment): 357 | visCigar is true if all nodes that composing the query sequence are in the indexed graph, 358 | or visCigar is false. 359 | */ 360 | void eAsmFind(bool visCigar,bool refSim,std::vector &orient,std::vector &nodes,std::vector &qPosVec,std::string &rgCigar,std::string &rgName,std::map &r_edge_dict,std::unordered_map &nid_dict,std::unordered_set &ndGroup); 361 | void eAsmFind2(bool refSim,int asmCode,std::vector &orient,std::vector &nodes,std::unordered_map &nid_dict,std::unordered_set &ndGroup,std::map &h_edge_dict); 362 | int getChrNum(std::string &sChr); 363 | int getAsmNum(std::string &asmb); 364 | // get code number for a list of assemblies 365 | void getAsmNum2(std::set &nameSet,std::vector &asmNumVec); 366 | 367 | //---------- read indexed data ------------------- 368 | // read indexed edge data 369 | void parseIndex(int chrNum,int sStart,int sEnd,std::unordered_map > &iedge,std::unordered_map > &oedge); 370 | // read indexed node data 371 | void getExNode(int chrNum,int sStart,int sEnd,int ex,std::vector &rangeNode,std::unordered_map &exNode,std::unordered_map &info,int &realLen,int &realStart); 372 | // read index path dada 373 | void queryDbPath(bool formR,int asmNum,int chrNum,int sStart,int sEnd,std::unordered_map &info,std::vector > &oriMulti,std::vector > &nodeMulti,std::vector > &qPosMulti,std::vector &cigarMulti,std::vector &pNumMulti); 374 | 375 | void dxAsmNode(bool refSim,int asmNum,int chrNum,int sStart,int sEnd,std::unordered_map &info,std::map &r_edge_dict,std::unordered_map &nid_dict); 376 | void dxAsmNode2(bool refSim,std::vector &asmVec,int chrNum,int sStart,int sEnd,std::unordered_map &info,std::map &r_edge_dict,std::unordered_map &nid_dict); 377 | 378 | //---------- create indexes for node, edge and path data ------------------- 379 | void splitRange(int rangeNum,std::unordered_map &chrMap,std::unordered_map &refChrMap,std::string &rndDxFile,std::string &rndFile,std::string &nspecFile,std::string &snFile); 380 | // 381 | void getNrefEdge(std::string &rndFile,std::string &nspecFile,std::vector &resEdge); 382 | void getChrRmEdge(std::unordered_set &ntNode,std::vector &chrRmEdge); 383 | 384 | void parseRange(std::vector &chrRnode,std::vector &arcVec,int sStart,int sEnd,int ex,std::vector &rangeNode,std::unordered_set &exNode); 385 | void edgeRange(std::vector &chrRnode,std::vector &arcVec,int sStart,int sEnd,int ex,int nocross,int storeDep,std::vector &chrRmEdge,std::unordered_map > &iedge,std::unordered_map > &oedge,std::set &r_edge_dict,std::unordered_set &nRefNode); 386 | 387 | void oneTask(std::unordered_map > &iedge,std::unordered_map > &oedge,std::vector &chrRnode,std::vector &acrVec,std::vector &chrRmEdge, 388 | int ex,int nocross,int frStart,int frEnd,int storeDep,std::ofstream &tndfh,std::ofstream &tbfh,int *frNrefNum,int *frEdgeNum 389 | ); 390 | // 391 | void fillNode(std::string &comChrFile,std::string &ndAsmLFile,std::string &nrNodeFile,std::string &nrNumFile,std::string &snFile,std::string &nrdFile); 392 | void mergeDx(std::string &rndDxFile,std::string &nrNumFile,std::string &mgDxFile); 393 | // 394 | void pthTask(bool formR,std::vector &allLen,std::unordered_map > &ndCutMap,std::vector &allpos,char *header,int dxByte,int frStart,int frEnd,std::vector &pthVec,std::vector &xpthVec,std::vector &wpthVec); 395 | void rangePath(bool formR,int num,std::vector &qCigarVec,std::vector &qPosVec,std::vector &orient,std::vector &nodes,std::unordered_map > &ndCutMap,std::list &allPaRa); 396 | void indexPath(bool formR,std::string &asmFile,std::string &eIndexFile,std::string &bEdgeFile,std::string &snFile,int nthread); 397 | 398 | void getAllLen(std::string &snFile,std::vector &allLen); 399 | void cigar2pos(int qStart,int rStart,std::vector &allLen,std::string &cigar,std::vector &nodes,std::vector &qCigarVec,std::vector &qPosVec); 400 | void path2pos(int rStart,std::vector &allLen,std::vector &nodes,std::vector &qPosVec); 401 | //---------- read indexed annotation data ------------------- 402 | void readRefGene(std::string &ovFile,std::string &gDxFile,int chrNum,int sStart,int sEnd,std::unordered_set &retainID,std::unordered_map &exNode,std::vector &refNodeGene,std::unordered_set midExon); 403 | void getFigGene(std::string &bwGeneFile,std::string &gDxFile,int chrNum,int sStart,int sEnd,float wPerK,std::unordered_map &exNode); 404 | 405 | void readRefBed(std::string &ovFile,std::string &gDxFile,int chrNum,int sStart,int sEnd,std::unordered_set &retainID,std::unordered_map &exNode,std::vector &refBedNode); 406 | void getFigBed(std::string &tkDesFile,std::string &bwGeneFile,std::string &gDxFile,int chrNum,int sStart,int sEnd,float wPerK,std::unordered_map &exNode); 407 | }; 408 | 409 | // 410 | class QueryNode{ 411 | 412 | public: 413 | QueryNode(std::string &t_dbDir); 414 | std::vector ndCov; 415 | std::string nodeAsm,nodeChr; 416 | int nodeStart,nodeEnd; 417 | std::vector > geneList; 418 | 419 | std::string nodeSeq; 420 | 421 | void fetchNdSeq(int node); 422 | void queryDbNode(int node); 423 | void queryDbCov(int node); 424 | void queryGene(int node,std::string &nodeAsm); 425 | void queryAsmCov(std::vector &nodeVec,std::string &asmb); 426 | private: 427 | int node; 428 | std::string dbDir,comChrFile,asmFile,sepFile,sep; 429 | std::vector header; 430 | void getHeader(); 431 | int countHeader(); 432 | }; 433 | 434 | 435 | 436 | 437 | 438 | 439 | 440 | -------------------------------------------------------------------------------- /module/gaf2rbed.cpp: -------------------------------------------------------------------------------- 1 | 2 | #include 3 | #include 4 | #include 5 | #include 6 | #include 7 | #include 8 | #include "vgraph.h" 9 | 10 | using namespace std; 11 | 12 | // 13 | // 14 | void readEdge(char *edgeFile,unordered_map > &oedge,unordered_map > &iedge){ 15 | ifstream in(edgeFile); 16 | if(! in){ 17 | cerr<<"Error: file open failed. "< >::iterator it; 24 | while(getline(in,strLine)){ 25 | strStream << strLine; 26 | NodeType node1,node2; 27 | char sign1,sign2; 28 | strStream >> node1; 29 | strStream >> node2; 30 | strStream >> sign1; 31 | strStream >> sign2; 32 | 33 | it = oedge.find(node1); 34 | if(it != oedge.end()){ 35 | (it->second).push_back(node2); 36 | }else{ 37 | vector tvec{node2}; 38 | oedge.emplace(node1,tvec); 39 | } 40 | 41 | it = iedge.find(node2); 42 | if(it != iedge.end()){ 43 | (it->second).push_back(node1); 44 | }else{ 45 | vector tvec{node1}; 46 | iedge.emplace(node2,tvec); 47 | } 48 | 49 | strStream.clear(); 50 | strStream.str(""); 51 | } 52 | in.close(); 53 | } 54 | 55 | // 56 | void readRnode(char *rndFile,char *rndDxFile,unordered_map &refNdMap){ 57 | ifstream dfh(rndDxFile); 58 | if(! dfh){ 59 | cerr<<"Error: file open failed. "< chrCnVec; 78 | chrCnVec.reserve(nchr); 79 | 80 | vector chrVec; 81 | chrVec.reserve(nchr); 82 | 83 | vector chrNdVec; 84 | chrNdVec.reserve(nchr); 85 | for(int x = 0; x < nchr; ++x){ 86 | dfh.read((char *)&refChr,intSize); 87 | dfh.read((char *)&crRange,crSize); 88 | chrVec.push_back(refChr); 89 | chrCnVec.push_back(crRange.ranNum); 90 | } 91 | 92 | int oneSize = sizeof(OneRange); 93 | for(int nc : chrCnVec){ 94 | int n = 0; 95 | for(int k = 0; k < nc; ++k){ 96 | OneRange tRange; 97 | dfh.read((char *)&tRange,oneSize); 98 | n += tRange.ranNum; 99 | } 100 | chrNdVec.push_back(n); 101 | } 102 | // 103 | int r_node,r_start,r_end; 104 | int j = 0; 105 | for(int nd : chrNdVec){ 106 | int tchr = chrVec[j]; 107 | for(int k = 0; k < nd; ++k){ 108 | nfh.read((char *)&r_node,intSize); 109 | nfh.read((char *)&r_start,intSize); 110 | nfh.read((char *)&r_end,intSize); 111 | ANode tn = {r_start,r_end,tchr}; 112 | refNdMap.emplace(r_node,tn); 113 | } 114 | ++j; 115 | } 116 | // 117 | dfh.close(); 118 | nfh.close(); 119 | } 120 | 121 | void readChr(char *rChrFile,vector &chrVec){ 122 | ifstream cfh(rChrFile); 123 | if(! cfh){ 124 | cerr<<"Error: file open failed. "< &refNdMap,unordered_map > &oedge,unordered_map > &iedge,vector &selNode,map &chrRanMap){ 138 | unordered_map >::iterator it; 139 | int queryDep = 500; 140 | 141 | for(NodeType tnode : selNode){ 142 | it = oedge.find(tnode); 143 | vector tNref; 144 | vector deep; 145 | bool findRef = false; 146 | 147 | set travNode; 148 | travNode.insert(tnode); 149 | if(it != oedge.end()){ 150 | for(NodeType &o_node : it->second){ 151 | if(travNode.find(o_node) == travNode.end()){ 152 | if(refNdMap.find(o_node) != refNdMap.end()){ 153 | ANode &tRef = refNdMap[o_node]; 154 | if(chrRanMap.find(tRef.achr) != chrRanMap.end()){ 155 | RanPos &tRan = chrRanMap[tRef.achr]; 156 | if(tRan.start < tRef.start){ 157 | tRan.pend = tRef.pend; 158 | }else{ 159 | tRan.start = tRef.start; 160 | } 161 | }else{ 162 | RanPos rp = {tRef.start,tRef.pend}; 163 | chrRanMap.emplace(tRef.achr,rp); 164 | } 165 | findRef = true; 166 | break; 167 | } 168 | tNref.push_back(o_node); 169 | deep.push_back(0); 170 | // 171 | travNode.insert(tnode); 172 | } 173 | } 174 | } 175 | 176 | if(findRef){ 177 | break; 178 | } 179 | it = iedge.find(tnode); 180 | if(it != iedge.end()){ 181 | for(NodeType &o_node : it->second){ 182 | if(travNode.find(o_node) == travNode.end()){ 183 | if(refNdMap.find(o_node) != refNdMap.end()){ 184 | ANode &tRef = refNdMap[o_node]; 185 | if(chrRanMap.find(tRef.achr) != chrRanMap.end()){ 186 | RanPos &tRan = chrRanMap[tRef.achr]; 187 | if(tRan.start < tRef.start){ 188 | tRan.pend = tRef.pend; 189 | }else{ 190 | tRan.start = tRef.start; 191 | } 192 | }else{ 193 | RanPos rp = {tRef.start,tRef.pend}; 194 | chrRanMap.emplace(tRef.achr,rp); 195 | } 196 | findRef = true; 197 | break; 198 | } 199 | tNref.push_back(o_node); 200 | deep.push_back(0); 201 | // 202 | travNode.insert(tnode); 203 | } 204 | } 205 | } 206 | if(findRef){ 207 | break; 208 | } 209 | if(! tNref.empty()){ 210 | size_t i = 0; 211 | while(i < tNref.size()){ 212 | if(deep[i] > queryDep){ 213 | break; 214 | } 215 | 216 | it = oedge.find(tNref[i]); 217 | for(NodeType &o_node : it->second){ 218 | if(travNode.find(o_node) == travNode.end()){ 219 | if(refNdMap.find(o_node) != refNdMap.end()){ 220 | ANode &tRef = refNdMap[o_node]; 221 | if(chrRanMap.find(tRef.achr) != chrRanMap.end()){ 222 | RanPos &tRan = chrRanMap[tRef.achr]; 223 | if(tRan.start < tRef.start){ 224 | tRan.pend = tRef.pend; 225 | }else{ 226 | tRan.start = tRef.start; 227 | } 228 | }else{ 229 | RanPos rp = {tRef.start,tRef.pend}; 230 | chrRanMap.emplace(tRef.achr,rp); 231 | } 232 | findRef = true; 233 | break; 234 | } 235 | tNref.push_back(o_node); 236 | deep.push_back(deep[i]+1); 237 | // 238 | travNode.insert(tnode); 239 | } 240 | } 241 | 242 | if(findRef){ 243 | break; 244 | } 245 | 246 | it = iedge.find(tNref[i]); 247 | for(NodeType &o_node : it->second){ 248 | if(travNode.find(o_node) == travNode.end()){ 249 | if(refNdMap.find(o_node) != refNdMap.end()){ 250 | ANode &tRef = refNdMap[o_node]; 251 | if(chrRanMap.find(tRef.achr) != chrRanMap.end()){ 252 | RanPos &tRan = chrRanMap[tRef.achr]; 253 | if(tRan.start < tRef.start){ 254 | tRan.pend = tRef.pend; 255 | }else{ 256 | tRan.start = tRef.start; 257 | } 258 | }else{ 259 | RanPos rp = {tRef.start,tRef.pend}; 260 | chrRanMap.emplace(tRef.achr,rp); 261 | } 262 | findRef = true; 263 | break; 264 | } 265 | tNref.push_back(o_node); 266 | deep.push_back(deep[i]+1); 267 | // 268 | travNode.insert(tnode); 269 | } 270 | } 271 | 272 | if(findRef){ 273 | break; 274 | } 275 | 276 | i += 1; 277 | } 278 | } 279 | } 280 | // 281 | } 282 | 283 | void writeBed(vector &chrVec,map &chrRanMap,char *rChrFile,ofstream &ofh){ 284 | int n = chrVec.size(); 285 | for(auto &aran : chrRanMap){ 286 | if(aran.first < n){ 287 | ofh< &refNdMap,string &path,vector &selNode,map &chrRanMap){ 293 | int snode = 0, firNode = 0, lastNode = 0; 294 | //int pmin = 0, pmax = 0; 295 | unordered_map::iterator it; 296 | bool findRef = false; 297 | bool fir = true; 298 | 299 | for(size_t i = 0; i < path.length(); ++i){ 300 | if(path[i] == '>' || path[i] == '<'){ 301 | if(i > 0){ 302 | if(fir){ 303 | firNode = snode; 304 | fir = false; 305 | } 306 | it = refNdMap.find(snode); 307 | if(it != refNdMap.end()){ 308 | findRef = true; 309 | // 310 | if(chrRanMap.find((it->second).achr) != chrRanMap.end()){ 311 | RanPos &tRan = chrRanMap[(it->second).achr]; 312 | if(tRan.start < (it->second).start){ 313 | tRan.pend = (it->second).pend; 314 | }else{ 315 | tRan.start = (it->second).start; 316 | } 317 | }else{ 318 | RanPos rp = {(it->second).start,(it->second).pend}; 319 | chrRanMap.emplace((it->second).achr,rp); 320 | } 321 | } 322 | snode = 0; 323 | } 324 | }else{ 325 | snode = snode * 10 + (path[i] - '0'); 326 | } 327 | } 328 | // 329 | if(fir){ 330 | firNode = snode; 331 | } 332 | lastNode = snode; 333 | it = refNdMap.find(snode); 334 | if(it != refNdMap.end()){ 335 | findRef = true; 336 | // 337 | if(chrRanMap.find((it->second).achr) != chrRanMap.end()){ 338 | RanPos &tRan = chrRanMap[(it->second).achr]; 339 | if(tRan.start < (it->second).start){ 340 | tRan.pend = (it->second).pend; 341 | }else{ 342 | tRan.start = (it->second).start; 343 | } 344 | }else{ 345 | RanPos rp = {(it->second).start,(it->second).pend}; 346 | chrRanMap.emplace((it->second).achr,rp); 347 | } 348 | } 349 | // 350 | if(! findRef){ 351 | selNode.push_back(firNode); 352 | if(lastNode != firNode){ 353 | selNode.push_back(lastNode); 354 | } 355 | } 356 | return findRef; 357 | 358 | } 359 | 360 | void gaf2rbed(char *rChrFile,char *rndFile,char *rndDxFile,char *edgeFile,char *pathFile,char *outFile){ 361 | ifstream pfh(pathFile); 362 | if(! pfh){ 363 | cerr<<"Error: file open failed. "< refNdMap; 376 | readRnode(rndFile,rndDxFile,refNdMap); 377 | vector chrVec; 378 | readChr(rChrFile,chrVec); 379 | unordered_map > oedge,iedge; 380 | // 381 | string pLine,pName,path; 382 | stringstream strStream; 383 | while(getline(pfh,pLine)){ 384 | strStream << pLine; 385 | strStream >> pName; 386 | strStream >> path; 387 | 388 | strStream.clear(); 389 | strStream.str(""); 390 | // 391 | vector selNode; 392 | map chrRanMap; 393 | bool findRef = pathPos(refNdMap,path,selNode,chrRanMap); 394 | if(! findRef){ 395 | if(useEdge){ 396 | readEdge(edgeFile,oedge,iedge); 397 | useEdge = false; 398 | } 399 | searchPos(refNdMap,oedge,iedge,selNode,chrRanMap); 400 | } 401 | // 402 | writeBed(chrVec,chrRanMap,rChrFile,ofh); 403 | } 404 | // 405 | pfh.close(); 406 | ofh.close(); 407 | } 408 | 409 | void ga2bd_usage(){ 410 | cout<<"Usage: gaf2bed --chr --rnode --dxnode --edge --path --out "< chromosome list of reference genome."< graph nodes from reference genome."< index of '--rnode' file."< edge file."< query path."< output file."<'): 97 | tarr = pat.split(faLine) 98 | preName = tarr[0].replace('>','>' + arr[0] + sep) 99 | tfh.write(str.encode(preName + "\n")) 100 | else: 101 | tfh.write(str.encode(faLine)) 102 | else: 103 | for faLine in fh: 104 | if faLine.startswith('>'): 105 | tarr = pat.split(faLine) 106 | preName = tarr[0].replace('>','>' + arr[0] + sep) 107 | tfh.write(preName + "\n") 108 | else: 109 | tfh.write(faLine) 110 | 111 | tfh.close() 112 | fh.close() 113 | return {"nameList":nameList,"dotList":dotList,"suffix":suffix} 114 | 115 | def rgraph(dotList,suffix,minigraph,graphOpt,thread,outDir): 116 | newAsmDir = os.path.join(outDir,"newAsmDir") 117 | newAsmList = [ os.path.join(newAsmDir, x + y) for x,y in zip(dotList,suffix)] 118 | allFa = " ".join(newAsmList) 119 | 120 | graphDir = os.path.join(outDir,"newGraph") 121 | os.mkdir(graphDir) 122 | gfaFile = os.path.join(graphDir,"out.gfa") 123 | 124 | command = "" 125 | if graphOpt is not None: 126 | command = minigraph + " -cxggs -t " + thread + " " + graphOpt + " " + allFa + " > " + gfaFile 127 | else: 128 | command = minigraph + " -cxggs -t " + thread + " " + allFa + " > " + gfaFile 129 | try: 130 | os.system(command) 131 | except: 132 | print("Error: Graph create failed. Please check the options!") 133 | 134 | def mapAsm(nameList,dotList,suffix,minigraph,thread,outDir): 135 | newAsmDir = os.path.join(outDir,"newAsmDir") 136 | newAsmList = [ os.path.join(newAsmDir, x + y) for x,y in zip(dotList,suffix)] 137 | 138 | mapDir = os.path.join(outDir,"mapDir") 139 | os.mkdir(mapDir) 140 | mapListFile = os.path.join(mapDir,"gaf.list") 141 | 142 | graphDir = os.path.join(outDir,"newGraph") 143 | gfaFile = os.path.join(graphDir,"out.gfa") 144 | 145 | with open(mapListFile,'w') as fh: 146 | #for faFile,name in zip(newAsmList,dotList): 147 | endPoint = len(dotList) 148 | for i in range(endPoint): 149 | outGAF = os.path.join(mapDir,dotList[i] + ".gaf") 150 | command = minigraph + " -cxasm -t " + thread + " --vc " + gfaFile + " " + newAsmList[i] + " -o " + outGAF 151 | os.system(command) 152 | 153 | fh.write(nameList[i] + "\t" + outGAF + "\n") 154 | 155 | def reduceGFA(sep,gfaFile,nodeFile,edgeFile,comChrFile,ndAsmLFile): 156 | flag = False 157 | if gfaFile.endswith(".gz"): 158 | fh = gzip.open(gfaFile,"rb") 159 | flag = True 160 | else: 161 | fh = open(gfaFile) 162 | 163 | comChrSet = set() 164 | asmSet = set() 165 | 166 | with open(nodeFile,'w') as nfh, open(edgeFile,'w') as efh, open(comChrFile,'w') as cfh, open(ndAsmLFile,'w') as afh: 167 | nfh.write("#Segment\tChr\tStart\tEnd\tLen\tRefOrNot\n") 168 | efh.write("#Source\tTarget\tOrigin1\tOrigin2\n") 169 | for line in fh: 170 | if flag: 171 | line = bytes.decode(line) 172 | line = line.strip() 173 | arr = line.split("\t") 174 | if arr[0] == 'S': 175 | segment = arr[1].replace("s","") 176 | slen = arr[3].split(":")[2] 177 | schr = arr[4].split(":")[2] 178 | 179 | tAsm = "" 180 | if sep not in schr: 181 | tAsm = "REF" + sep + "0" 182 | schr = tAsm + sep + schr 183 | else: 184 | asmArr = schr.split(sep) 185 | tAsm = asmArr[0] + sep + asmArr[1] 186 | 187 | start = arr[5].split(":")[2] 188 | off_start = str(int(start) + 1) 189 | end = str(int(start) + int(slen)) 190 | refOrNot = arr[6].split(":")[2] 191 | nfh.write(segment + "\t" + schr + "\t" + off_start + "\t" + end +"\t" + slen + "\t" + refOrNot +"\n") 192 | 193 | if schr not in comChrSet: 194 | comChrSet.add(schr) 195 | 196 | if tAsm not in asmSet: 197 | asmSet.add(tAsm) 198 | afh.write(tAsm + "\n") 199 | 200 | elif arr[0] == 'L': 201 | source = arr[1].replace("s","") 202 | origin1 = arr[2] 203 | target = arr[3].replace("s","") 204 | origin2 = arr[4] 205 | efh.write(source + "\t" + target + "\t" + origin1 + "\t" + origin2 + "\n") 206 | 207 | for tchr in sorted(comChrSet): 208 | cfh.write(tchr + "\n") 209 | 210 | #for asm in sorted(asmSet): 211 | # afh.write(asm + "\n") 212 | 213 | fh.close() 214 | 215 | 216 | def refChrList(sep,nodeFile,outChrList): 217 | with open(nodeFile) as nf,open(outChrList,'w') as cl: 218 | #cl.write("#Chr\tStart\tEnd\n") 219 | preChr = "" 220 | tStart = 1 221 | tEnd = 1 222 | for line in nf: 223 | if line.startswith('#'): 224 | continue 225 | line = line.strip() 226 | arr = line.split("\t") 227 | if arr[5] != "0": 228 | #tchr = preChr.split(sep)[-1] 229 | chrArr = preChr.split(sep) 230 | tchr = chrArr[-1] 231 | cl.write(tchr + "\t" + tStart + "\t" + tEnd + "\n") 232 | refAsm = chrArr[0] + sep + chrArr[1] 233 | return refAsm 234 | #break 235 | else: 236 | if preChr == "": 237 | tStart = arr[2] 238 | tEnd = arr[3] 239 | else: 240 | if arr[1] != preChr: 241 | tchr = preChr.split(sep)[-1] 242 | cl.write(tchr + "\t" + tStart + "\t" + tEnd + "\n") 243 | tStart = arr[2] 244 | tEnd = arr[3] 245 | else: 246 | tEnd = arr[3] 247 | preChr = arr[1] 248 | 249 | def simpPath(nodeFile,gafList,minMQ,asmLFile,pathDir): 250 | with open(nodeFile) as nf: 251 | nodeSize = {} 252 | for line in nf: 253 | if line.startswith('#'): 254 | continue 255 | line = line.strip() 256 | arr = line.split("\t") 257 | nodeSize[arr[0]] = int(arr[4]) 258 | 259 | #asmSet = set() 260 | covInfo = {} 261 | pattern = re.compile("[><]") 262 | pat = re.compile("\s+") 263 | pat2 = re.compile("^\s*$") 264 | nAsm = 0 265 | with open(gafList) as gl,open(asmLFile,'w') as af: 266 | for line in gl: 267 | if line.startswith('#'): 268 | continue 269 | if pat2.match(line): 270 | continue 271 | line = line.strip() 272 | arr = pat.split(line) 273 | covInfo[arr[0]] = {} 274 | af.write(arr[0] + "\n") 275 | pathFile = os.path.join(pathDir,str(nAsm) + ".path") 276 | pNameFile = os.path.join(pathDir,str(nAsm) + ".name") 277 | nAsm += 1 278 | with open(arr[1]) as gf,open(pathFile,'w') as pf,open(pNameFile,'w') as nf: 279 | pathDict = {} 280 | for mapinfo in gf: 281 | mapinfo = mapinfo.strip() 282 | mapArr = mapinfo.split("\t") 283 | if int(mapArr[11]) > minMQ: 284 | mapStr = mapArr[5].replace("s","") 285 | #pf.write(mapArr[0]+"\t"+mapStr+"\n") 286 | if mapArr[0] not in pathDict: 287 | pathDict[mapArr[0]] = {} 288 | pathDict[mapArr[0]][int(mapArr[2])] = mapStr + "\t" + mapArr[2] + "\t" + mapArr[7] + "\t" + mapArr[18].split(":")[2] 289 | 290 | for k in pathDict: 291 | sortPath = sorted(pathDict[k].items(),key=lambda x : x[0]) 292 | for tpath in sortPath: 293 | pf.write(k+"\t"+tpath[1]+"\n") 294 | nf.write(k+"\n") 295 | 296 | def nodeCov(nodeFile,gafList,minMQ,covFile,asmLFile,pathDir): 297 | with open(nodeFile) as nf: 298 | nodeSize = {} 299 | for line in nf: 300 | if line.startswith('#'): 301 | continue 302 | line = line.strip() 303 | arr = line.split("\t") 304 | nodeSize[arr[0]] = int(arr[4]) 305 | 306 | #asmSet = set() 307 | covInfo = {} 308 | pattern = re.compile("[><]") 309 | pat = re.compile("\s+") 310 | pat2 = re.compile("^\s*$") 311 | nAsm = 0 312 | with open(gafList) as gl,open(asmLFile,'w') as af: 313 | for line in gl: 314 | if line.startswith('#'): 315 | continue 316 | if pat2.match(line): 317 | continue 318 | line = line.strip() 319 | arr = pat.split(line) 320 | covInfo[arr[0]] = {} 321 | af.write(arr[0] + "\n") 322 | pathFile = os.path.join(pathDir,str(nAsm) + ".path") 323 | pNameFile = os.path.join(pathDir,str(nAsm) + ".name") 324 | nAsm += 1 325 | with open(arr[1]) as gf,open(pathFile,'w') as pf,open(pNameFile,'w') as nf: 326 | pathDict = {} 327 | for mapinfo in gf: 328 | mapinfo = mapinfo.strip() 329 | mapArr = mapinfo.split("\t") 330 | if int(mapArr[11]) > minMQ: 331 | mapStr = mapArr[5].replace("s","") 332 | #pf.write(mapArr[0]+"\t"+mapStr+"\n") 333 | if mapArr[0] not in pathDict: 334 | pathDict[mapArr[0]] = {} 335 | pathDict[mapArr[0]][int(mapArr[2])] = mapStr + "\t" + mapArr[2] + "\t" + mapArr[7] + "\t" + mapArr[18].split(":")[2] 336 | 337 | nodeArr = pattern.split(mapStr) 338 | nodeCount = len(nodeArr) 339 | if nodeCount < 3: 340 | #covInfo[arr[0]][nodeArr[1]] = (int(mapArr[8]) - int(mapArr[7]) + 1) / nodeSize[nodeArr[1]] 341 | covInfo[arr[0]][nodeArr[1]] = (int(mapArr[8]) - int(mapArr[7])) / nodeSize[nodeArr[1]] 342 | else: 343 | for i in range(1,nodeCount): 344 | if i == 1: 345 | if nodeArr[1] in covInfo[arr[0]]: 346 | covInfo[arr[0]][nodeArr[1]] += (nodeSize[nodeArr[1]] - int(mapArr[7])) / nodeSize[nodeArr[1]] 347 | else: 348 | covInfo[arr[0]][nodeArr[1]] = (nodeSize[nodeArr[1]] - int(mapArr[7])) / nodeSize[nodeArr[1]] 349 | elif i == nodeCount - 1: 350 | if nodeArr[i] in covInfo[arr[0]]: 351 | #covInfo[arr[0]][nodeArr[i]] += (nodeSize[nodeArr[i]] + int(mapArr[8]) + 1 - int(mapArr[6])) / nodeSize[nodeArr[i]] 352 | covInfo[arr[0]][nodeArr[i]] += (nodeSize[nodeArr[i]] + int(mapArr[8]) - int(mapArr[6])) / nodeSize[nodeArr[i]] 353 | else: 354 | #covInfo[arr[0]][nodeArr[i]] = (nodeSize[nodeArr[i]] + int(mapArr[8]) + 1 - int(mapArr[6])) / nodeSize[nodeArr[i]] 355 | covInfo[arr[0]][nodeArr[i]] = (nodeSize[nodeArr[i]] + int(mapArr[8]) - int(mapArr[6])) / nodeSize[nodeArr[i]] 356 | 357 | else: 358 | if nodeArr[i] in covInfo[arr[0]]: 359 | covInfo[arr[0]][nodeArr[i]] += 1.00 360 | else: 361 | covInfo[arr[0]][nodeArr[i]] = 1.00 362 | for k in pathDict: 363 | sortPath = sorted(pathDict[k].items(),key=lambda x : x[0]) 364 | for tpath in sortPath: 365 | pf.write(k+"\t"+tpath[1]+"\n") 366 | nf.write(k+"\n") 367 | 368 | allNodes = nodeSize.keys() 369 | allAsm = covInfo.keys() 370 | numLimit = len(allAsm) / 2 371 | with open(covFile,"w") as cf: 372 | #strAsm = "\t".join(allAsm) 373 | #cf.write("#Segid\t" + strAsm + "\n") 374 | for tnode in allNodes: 375 | oneList = [] 376 | thList = [] 377 | vthList = [] 378 | i = 0 379 | allValue = [] 380 | for asmb in allAsm: 381 | if tnode in covInfo[asmb]: 382 | if covInfo[asmb][tnode] > 0.99 and covInfo[asmb][tnode] < 1.01: 383 | oneList.append(i) 384 | elif covInfo[asmb][tnode] > 0: 385 | thList.append(i) 386 | vthList.append("%.2f" % covInfo[asmb][tnode]) 387 | allValue.append("%.2f" % covInfo[asmb][tnode]) 388 | else: 389 | allValue.append(0.00) 390 | i += 1 391 | 392 | strCov = "" 393 | num = len(oneList) + len(thList) 394 | if num < numLimit: 395 | if len(oneList) > 0: 396 | strCov += ",".join([str(x) for x in oneList]) 397 | else: 398 | strCov = "*" 399 | if len(thList) > 0: 400 | strCov += ("\t" + ",".join([str(x) for x in thList])) 401 | strCov += ("\t" + ",".join([str(x) for x in vthList])) 402 | else: 403 | strCov = ",".join([str(x) for x in allValue]) 404 | cf.write(tnode + "\t" + strCov + "\n") 405 | 406 | def wSep(sep,sepFile): 407 | with open(sepFile,'w') as sf: 408 | sf.write(sep + "\n") 409 | 410 | def fromScratch(ncalCov,sep,asmList,minigraph,graphOpt,thread,minMQ,outDir): 411 | asmInfo = changeHeader(sep,asmList,outDir) 412 | rgraph(asmInfo["dotList"],asmInfo["suffix"],minigraph,graphOpt,thread,outDir) 413 | mapAsm(asmInfo["nameList"],asmInfo["dotList"],asmInfo["suffix"],minigraph,thread,outDir) 414 | 415 | upDir = os.path.join(outDir,"upload") 416 | os.mkdir(upDir) 417 | 418 | nodeFile = os.path.join(upDir,"node.info") 419 | edgeFile = os.path.join(upDir,"edge.info") 420 | chrListFile = os.path.join(upDir,"chr.list") 421 | comChrFile = os.path.join(upDir,"complete.chr.list") 422 | asmListFile = os.path.join(upDir,"asm.list") 423 | ndAsmLFile = os.path.join(upDir,"node.asm.list") 424 | covFile = os.path.join(upDir,"cover.info") 425 | sepFile = os.path.join(upDir,"sep.info") 426 | pathDir = os.path.join(upDir,"path") 427 | os.mkdir(pathDir) 428 | 429 | graphDir = os.path.join(outDir,"newGraph") 430 | gfaFile = os.path.join(graphDir,"out.gfa") 431 | 432 | mapDir = os.path.join(outDir,"mapDir") 433 | mapListFile = os.path.join(mapDir,"gaf.list") 434 | 435 | wSep(sep,sepFile) 436 | reduceGFA(sep,gfaFile,nodeFile,edgeFile,comChrFile,ndAsmLFile) 437 | refAsm = refChrList(sep,nodeFile,chrListFile) 438 | if ncalCov: 439 | simpPath(nodeFile,mapListFile,minMQ,asmListFile,pathDir) 440 | else: 441 | nodeCov(nodeFile,mapListFile,minMQ,covFile,asmListFile,pathDir) 442 | # 443 | refGFA = os.path.join(upDir,"input.ref.gfa") 444 | mvCommand = "mv {} {}".format(gfaFile,refGFA) 445 | os.system(mvCommand) 446 | 447 | def fromGFA(ncalCov,sep,gfaFile,mapListFile,minMQ,outDir): 448 | upDir = os.path.join(outDir,"upload") 449 | os.mkdir(upDir) 450 | 451 | nodeFile = os.path.join(upDir,"node.info") 452 | edgeFile = os.path.join(upDir,"edge.info") 453 | chrListFile = os.path.join(upDir,"chr.list") 454 | comChrFile = os.path.join(upDir,"complete.chr.list") 455 | asmListFile = os.path.join(upDir,"asm.list") 456 | ndAsmLFile = os.path.join(upDir,"node.asm.list") 457 | covFile = os.path.join(upDir,"cover.info") 458 | sepFile = os.path.join(upDir,"sep.info") 459 | pathDir = os.path.join(upDir,"path") 460 | os.mkdir(pathDir) 461 | 462 | wSep(sep,sepFile) 463 | reduceGFA(sep,gfaFile,nodeFile,edgeFile,comChrFile,ndAsmLFile) 464 | refAsm = refChrList(sep,nodeFile,chrListFile) 465 | if mapListFile is not None: 466 | if ncalCov: 467 | simpPath(nodeFile,mapListFile,minMQ,asmListFile,pathDir) 468 | else: 469 | nodeCov(nodeFile,mapListFile,minMQ,covFile,asmListFile,pathDir) 470 | # 471 | refGFA = os.path.join(upDir,"input.ref.gfa") 472 | if gfaFile.endswith(".gz"): 473 | cpCommand = "zcat {} > {}".format(gfaFile,refGFA) 474 | else: 475 | cpCommand = "cp {} {}".format(gfaFile,refGFA) 476 | os.system(cpCommand) 477 | 478 | 479 | def indexGraph(outDir,nthread): 480 | upDir = os.path.join(outDir,"upload") 481 | mp = minipg.GraphRange(upDir,0) 482 | rangeSize = paras.range 483 | storeDep = paras.xDep 484 | ex = 1000000 485 | spChrFile = "00000000" 486 | mp.edgeWrite(spChrFile,rangeSize,ex,0,nthread,storeDep) 487 | 488 | def miniMain(): 489 | sep = paras.sep 490 | gfaFile = paras.rGFA 491 | mapListFile = paras.gafList 492 | minMQ = paras.minMQ 493 | outDir = paras.outDir 494 | asmList = paras.asmList 495 | minigraph = paras.minigraph 496 | graphOpt = paras.graphOpt 497 | thread = str(paras.thread) 498 | nthread = paras.thread 499 | ncalCov = paras.ncov 500 | 501 | checkOut(outDir) 502 | if gfaFile is not None: 503 | fromGFA(ncalCov,sep,gfaFile,mapListFile,minMQ,outDir) 504 | else: 505 | if asmList is not None and minigraph is not None: 506 | fromScratch(ncalCov,sep,asmList,minigraph,graphOpt,thread,minMQ,outDir) 507 | else: 508 | print("Error: lack of parameters!") 509 | exit(1) 510 | # 511 | if paras.index: 512 | indexGraph(outDir,nthread) 513 | 514 | if __name__ == '__main__': 515 | miniMain() 516 | 517 | 518 | 519 | 520 | 521 | 522 | 523 | 524 | 525 | 526 | 527 | -------------------------------------------------------------------------------- /vrpg/views.py: -------------------------------------------------------------------------------- 1 | from django.shortcuts import render 2 | from django.http import JsonResponse 3 | from django.views.decorators.csrf import csrf_exempt 4 | 5 | import os 6 | import re 7 | import time 8 | # 9 | from module import minipg 10 | 11 | BinDir = os.path.dirname(os.path.dirname(os.path.abspath(__file__))) 12 | # 13 | BinDir = BinDir 14 | def index(request): 15 | allFiles = os.listdir(os.path.join(BinDir,"upload")) 16 | dDir = {"path","anno","mapping"} 17 | allDir = [i for i in allFiles if os.path.isdir(os.path.join(BinDir,"upload",i)) and i not in dDir] 18 | return render(request,"vrpg/index.html",{"folder":allDir}) 19 | 20 | def showInfo(request): 21 | return render(request,"vrpg/info.html",{}) 22 | 23 | def showManual(request): 24 | return render(request,"vrpg/manual.html",{}) 25 | 26 | def getSep(sepFile): 27 | with open(sepFile) as sf: 28 | line = sf.read().strip() 29 | return line 30 | 31 | def readChr(chrFile): 32 | nameList = [] 33 | lenList = [] 34 | with open(chrFile) as fh: 35 | for line in fh: 36 | if line.startswith("#"): 37 | continue 38 | line = line.strip() 39 | arr = line.split("\t") 40 | nameList.append(arr[0]) 41 | lenList.append(arr[2]) 42 | return {"nameList":nameList,"lenList":lenList} 43 | 44 | def readAsm(asmFile): 45 | asmList = [] 46 | with open(asmFile) as fh: 47 | for line in fh: 48 | if line.startswith("#"): 49 | continue 50 | line = line.strip() 51 | asmList.append(line) 52 | return asmList 53 | 54 | def qNodesCov(covFile,nodeVec,covNameFile,asm): 55 | asmPos = 0 56 | pos = 0 57 | with open(covNameFile) as af: 58 | for line in af: 59 | line = line.strip() 60 | if line == asm: 61 | asmPos = pos 62 | pos += 1 63 | 64 | nodeDict = {} 65 | nodeSort = sorted(set(nodeVec)) 66 | nodePos = 0 67 | num = len(nodeSort) 68 | cov = [] 69 | flag = False 70 | with open(covFile) as cf: 71 | for line in cf: 72 | line = line.strip() 73 | arr = line.split("\t") 74 | if nodeSort[nodePos] == int(arr[0]): 75 | flag = False 76 | if arr[1] != "*": 77 | oneArr = arr[1].split(",") 78 | if len(oneArr) < pos: 79 | for i in oneArr: 80 | if int(i) == asmPos: 81 | nodeDict[nodeSort[nodePos]] = 1.00 82 | flag = True 83 | break 84 | else: 85 | nodeDict[nodeSort[nodePos]] = oneArr[asmPos] 86 | flag = True 87 | 88 | if not flag: 89 | if len(arr) > 2: 90 | mDx = arr[2].split(",") 91 | mArr = arr[3].split(",") 92 | for k,x in zip(mDx,mArr): 93 | if int(k) == asmPos: 94 | nodeDict[nodeSort[nodePos]] = x 95 | flag = True 96 | break 97 | if not flag: 98 | print("Error: node {} may be not in the assembly!".format(nodeSort[nodePos])) 99 | nodeDict[nodeSort[nodePos]] = 0.00 100 | nodePos += 1 101 | if nodePos == num: 102 | break 103 | 104 | for nd in nodeVec: 105 | cov.append(nodeDict[nd]) 106 | return cov 107 | 108 | @csrf_exempt 109 | def showGraph(request): 110 | 111 | para = request.POST 112 | #print(para) 113 | species = para.get("species") 114 | upDir = "" 115 | if species == '0': 116 | upDir = os.path.join(BinDir,"upload") 117 | else: 118 | upDir = os.path.join(BinDir,"upload",species) 119 | 120 | bEdgeFile = os.path.join(upDir,"edge.bw") 121 | eIndexFile = os.path.join(upDir,"edge.bdx") 122 | rNdDxFile = os.path.join(upDir,"node.ref.bdx") 123 | rNdFile = os.path.join(upDir,"node.ref.bw") 124 | nrNdFile = os.path.join(upDir,"node.nonref.bw") 125 | mNdDxFile = os.path.join(upDir,"node.merge.bdx") 126 | 127 | formFile = os.path.join(upDir,"form.info") 128 | chrListFile = os.path.join(upDir,"load.chr.list") 129 | asmListFile = os.path.join(upDir,"asm.list") 130 | 131 | 132 | sChr = para.get("tchr") 133 | sStart = int(para.get("start")) 134 | sEnd = int(para.get("end")) 135 | buFilt = int(para.get("buFilt")) 136 | ex = 1000000 137 | wStart = 50 138 | wWidth = 800 139 | wCut = 2000 140 | y = 250 141 | 142 | asm = para.get("asm") 143 | vseq = para.get("vseq") 144 | if vseq == "1": 145 | taskID = para.get("taskID") 146 | asm = "!" + taskID 147 | chrList = {} 148 | if os.path.exists(chrListFile): 149 | chrList = readChr(chrListFile) 150 | else: 151 | chrListFile = os.path.join(upDir,"chr.list") 152 | chrList = readChr(chrListFile) 153 | 154 | asmList = {} 155 | if os.path.exists(asmListFile): 156 | asmList = readAsm(asmListFile) 157 | indexFlag = 0 158 | if os.path.exists(bEdgeFile) and os.path.exists(eIndexFile) and os.path.exists(rNdDxFile) and os.path.exists(rNdFile) and os.path.exists(nrNdFile) and os.path.exists(mNdDxFile): 159 | indexFlag = 1 160 | 161 | wsim = para.get("sim") 162 | sim = False 163 | refSim = False 164 | if wsim == "mnr": 165 | sim = True 166 | refSim = True 167 | elif wsim == "mr": 168 | #refSim = False 169 | sim = True 170 | queryDep = int(para.get("shdep")) 171 | 172 | ''' 173 | depFile = os.path.join(upDir,"index.dep") 174 | if os.path.exists(depFile): 175 | with open(depFile) as dh: 176 | queryDep = int(dh.read().strip()) 177 | ''' 178 | 179 | mp = minipg.GraphRange(upDir,indexFlag) 180 | mp.formatGraph(asm,sChr,sStart,sEnd,ex,wStart,wWidth,wCut,y,queryDep,buFilt,sim,refSim) 181 | 182 | draw_node = mp.draw_node 183 | draw_pos = mp.draw_pos 184 | # 185 | rNodeNum = len(draw_pos) 186 | 187 | layout = para.get("lay") 188 | 189 | if layout == "cosq" or layout == "coex": 190 | for i in range(rNodeNum ): 191 | draw_node[i]["fx"] = draw_pos[i]; 192 | draw_node[i]["fy"] = 20; 193 | else: 194 | for i in range(rNodeNum ): 195 | draw_node[i]["fx"] = draw_pos[i]; 196 | draw_node[i]["fy"] = y; 197 | 198 | draw_edge = mp.draw_edge 199 | neStart = rNodeNum - 1 200 | dnode_len = mp.dnode_len 201 | for k in dnode_len: 202 | draw_edge[neStart]["dis"] = k 203 | neStart += 1 204 | # 205 | hnCov = [] 206 | if vseq != '1' and asm != "" and ',' not in asm: 207 | if len(mp.hnGroup) > 0: 208 | nodeVec = [mp.nnames[x] for x in mp.hnGroup] 209 | covFile = os.path.join(upDir,"cover.info") 210 | covNameFile = os.path.join(upDir,"asm.list") 211 | if os.path.exists(covFile): 212 | hnCov = qNodesCov(covFile,nodeVec,covNameFile,asm) 213 | else: 214 | covFile = os.path.join(upDir,"cover.bw") 215 | if os.path.exists(covFile): 216 | mm = minipg.QueryNode(upDir) 217 | mm.queryAsmCov(nodeVec,asm) 218 | hnCov = mm.ndCov 219 | 220 | graphInfo = {'nodes':draw_node,'links':draw_edge, 'genome':mp.genome,'nnames':mp.nnames,'hnGroup':mp.hnGroup,'hLinks':mp.hLinks,'hDir':mp.hDir,'hnCov':hnCov,'nameList':chrList['nameList'],'lenList':chrList['lenList'],'asm':asmList, 221 | 'genePos':mp.ndGenePos,'geneVec':mp.geneVec,'layerVec':mp.layerVec,'strand':mp.strandVec,'mgFlagVec':mp.mgFlagVec,'figScale':mp.figScale,'tickValue':mp.tickValue,'tickPos':mp.tickPos,'ndExonPos':mp.ndExonPos,'rnaVec':mp.rnaVec, 222 | 'eLayerVec':mp.eLayerVec,'eStrandVec':mp.eStrandVec,'eNumVec':mp.eNumVec,'eFlagVec':mp.eFlagVec,'ndCDSPos':mp.ndCDSPos,'cdsVec':mp.cdsVec,'cLayerVec':mp.cLayerVec,'cNumVec':mp.cNumVec,'hEdgeAsm':mp.hEdgeAsm,'qChr':mp.qChr,'qStart':mp.qStart,'qEnd':mp.qEnd, 223 | 'qPath':mp.qPath,'qCigar':mp.qCigar,'tkNameVec':mp.tkNameVec,'tkDesVec':mp.tkDesVec,'tkColVec':mp.tkColVec,'tkCumVec':mp.tkCumVec,'tkItem':mp.tkItem,'rBedPos':mp.rBedPos,'rBedName':mp.rBedName,'rBedLayer':mp.rBedLayer, 224 | 'rBedScore':mp.rBedScore,'rBedStrand':mp.rBedStrand 225 | } 226 | #print(mp.rBedScore) 227 | return JsonResponse(graphInfo) 228 | 229 | @csrf_exempt 230 | def initGraph(request): 231 | para = request.POST 232 | species = para.get("species") 233 | upDir = "" 234 | if species == '0': 235 | upDir = os.path.join(BinDir,"upload") 236 | else: 237 | upDir = os.path.join(BinDir,"upload",species) 238 | 239 | 240 | bEdgeFile = os.path.join(upDir,"edge.bw") 241 | eIndexFile = os.path.join(upDir,"edge.bdx") 242 | rNdDxFile = os.path.join(upDir,"node.ref.bdx") 243 | rNdFile = os.path.join(upDir,"node.ref.bw") 244 | nrNdFile = os.path.join(upDir,"node.nonref.bw") 245 | mNdDxFile = os.path.join(upDir,"node.merge.bdx") 246 | 247 | formFile = os.path.join(upDir,"form.info") 248 | chrListFile = os.path.join(upDir,"load.chr.list") 249 | asmListFile = os.path.join(upDir,"asm.list") 250 | 251 | chrList = {} 252 | if os.path.exists(chrListFile): 253 | chrList = readChr(chrListFile) 254 | else: 255 | chrListFile = os.path.join(upDir,"chr.list") 256 | chrList = readChr(chrListFile) 257 | 258 | asmList = {} 259 | if os.path.exists(asmListFile): 260 | asmList = readAsm(asmListFile) 261 | 262 | sChr = chrList["nameList"][0] 263 | sStart = 1 264 | sEnd = 10000 265 | #if os.path.exists(formFile): 266 | # sEnd = 1000 267 | 268 | buFilt = 50 269 | ex = 1000000 270 | wStart = 50 271 | wWidth = 800 272 | wCut = 2000 273 | y = 250 274 | 275 | asm = "" 276 | indexFlag = 0 277 | if os.path.exists(bEdgeFile) and os.path.exists(eIndexFile) and os.path.exists(rNdDxFile) and os.path.exists(rNdFile) and os.path.exists(nrNdFile) and os.path.exists(mNdDxFile): 278 | indexFlag = 1 279 | 280 | sim = True 281 | refSim = False 282 | 283 | queryDep = 10 284 | ''' 285 | depFile = os.path.join(upDir,"index.dep") 286 | if os.path.exists(depFile): 287 | with open(depFile) as dh: 288 | queryDep = int(dh.read().strip()) 289 | ''' 290 | mp = minipg.GraphRange(upDir,indexFlag) 291 | mp.formatGraph(asm,sChr,sStart,sEnd,ex,wStart,wWidth,wCut,y,queryDep,buFilt,sim,refSim) 292 | draw_node = mp.draw_node 293 | draw_pos = mp.draw_pos 294 | # 295 | rNodeNum = len(draw_pos) 296 | 297 | for i in range(rNodeNum ): 298 | draw_node[i]["fx"] = draw_pos[i]; 299 | draw_node[i]["fy"] = y; 300 | draw_edge = mp.draw_edge 301 | neStart = rNodeNum - 1 302 | dnode_len = mp.dnode_len 303 | for k in dnode_len: 304 | draw_edge[neStart]["dis"] = k 305 | neStart += 1 306 | ###################### 307 | hnCov = [] 308 | 309 | graphInfo = {'nodes':draw_node,'links':draw_edge,'genome':mp.genome,'nnames':mp.nnames,'hnGroup':mp.hnGroup,'hLinks':mp.hLinks,'hDir':mp.hDir,'hnCov':hnCov,'nameList':chrList['nameList'],'lenList':chrList['lenList'],'asm':asmList, 310 | 'genePos':mp.ndGenePos,'geneVec':mp.geneVec,'layerVec':mp.layerVec,'strand':mp.strandVec,'mgFlagVec':mp.mgFlagVec,'figScale':mp.figScale,'tickValue':mp.tickValue,'tickPos':mp.tickPos,'ndExonPos':mp.ndExonPos,'rnaVec':mp.rnaVec, 311 | 'eLayerVec':mp.eLayerVec,'eStrandVec':mp.eStrandVec,'eNumVec':mp.eNumVec,'eFlagVec':mp.eFlagVec,'ndCDSPos':mp.ndCDSPos,'cdsVec':mp.cdsVec,'cLayerVec':mp.cLayerVec,'cNumVec':mp.cNumVec,'tkNameVec':mp.tkNameVec,'tkDesVec':mp.tkDesVec, 312 | 'tkColVec':mp.tkColVec,'tkCumVec':mp.tkCumVec,'tkItem':mp.tkItem,'rBedPos':mp.rBedPos,'rBedName':mp.rBedName,'rBedLayer':mp.rBedLayer,'rBedScore':mp.rBedScore,'rBedStrand':mp.rBedStrand 313 | } 314 | # 315 | return JsonResponse(graphInfo) 316 | 317 | @csrf_exempt 318 | def nodeGene(request): 319 | para = request.POST 320 | species = para.get("species") 321 | node = para.get('seg') 322 | dbDir = "" 323 | if species == '0': 324 | dbDir = os.path.join(BinDir,"upload") 325 | else: 326 | dbDir = os.path.join(BinDir,"upload",species) 327 | dbNodeFile = os.path.join(dbDir,"node.sort.bw") 328 | 329 | sepFile = os.path.join(dbDir,"sep.info") 330 | 331 | nodeAsm = '' 332 | nodeChr = '' 333 | nodeStart = '' 334 | nodeEnd = '' 335 | sep = getSep(sepFile) 336 | 337 | geneList = [] 338 | mp = minipg.QueryNode(dbDir) 339 | if os.path.exists(dbNodeFile): 340 | mp.queryDbNode(int(node)) 341 | nodeAsm = mp.nodeAsm 342 | nodeChr = mp.nodeChr 343 | nodeStart = mp.nodeStart 344 | nodeEnd = mp.nodeEnd 345 | else: 346 | nodeFile = os.path.join(dbDir,"node.info") 347 | with open(nodeFile) as nf: 348 | for line in nf: 349 | if line.startswith('#'): 350 | continue 351 | line = line.strip() 352 | arr = line.split('\t') 353 | if node == arr[0]: 354 | nodeAsmArr = arr[1].split(sep) 355 | nodeAsm = nodeAsmArr[0] + sep + nodeAsmArr[1] 356 | nodeChr = nodeAsmArr[-1] 357 | nodeStart = arr[2] 358 | nodeEnd = arr[3] 359 | #print(arr) 360 | break 361 | 362 | mp.queryGene(int(node),nodeAsm) 363 | annoList = mp.geneList 364 | for oneAnn in annoList: 365 | if oneAnn[1].endswith("gene"): 366 | geneList.append(oneAnn) 367 | 368 | return JsonResponse({'nodeAsm':nodeAsm,'nodeChr':nodeChr,'nodeStart':nodeStart,'nodeEnd':nodeEnd,'geneList':geneList}) 369 | 370 | @csrf_exempt 371 | def searchNode(request): 372 | para = request.POST 373 | species = para.get("species") 374 | node = para.get('seg') 375 | # 376 | dbDir = "" 377 | if species == '0': 378 | dbDir = os.path.join(BinDir,"upload") 379 | else: 380 | dbDir = os.path.join(BinDir,"upload",species) 381 | dbNodeFile = os.path.join(dbDir,"node.sort.bw") 382 | dbCovFile = os.path.join(dbDir,"cover.bw") 383 | asmListFile = os.path.join(dbDir,"asm.list") 384 | 385 | # 386 | sepFile = os.path.join(dbDir,"sep.info") 387 | 388 | nodeAsm = '' 389 | nodeChr = '' 390 | nodeStart = '' 391 | nodeEnd = '' 392 | nodeSeq = '' 393 | header = [] 394 | sep = getSep(sepFile) 395 | 396 | # 397 | with open(asmListFile) as af: 398 | for line in af: 399 | header.append(line.strip()) 400 | 401 | geneList = [[]] 402 | if os.path.exists(dbNodeFile): 403 | mp = minipg.QueryNode(dbDir) 404 | mp.queryDbNode(int(node)) 405 | 406 | nodeAsm = mp.nodeAsm 407 | nodeChr = mp.nodeChr 408 | nodeStart = mp.nodeStart 409 | nodeEnd = mp.nodeEnd 410 | 411 | mp.queryGene(int(node),nodeAsm) 412 | geneList = mp.geneList 413 | # 414 | mp.fetchNdSeq(int(node)) 415 | nodeSeq = mp.nodeSeq 416 | else: 417 | nodeFile = os.path.join(dbDir,"node.info") 418 | with open(nodeFile) as nf: 419 | for line in nf: 420 | if line.startswith('#'): 421 | continue 422 | line = line.strip() 423 | arr = line.split('\t') 424 | if node == arr[0]: 425 | nodeAsmArr = arr[1].split(sep) 426 | nodeAsm = nodeAsmArr[0] + sep + nodeAsmArr[1] 427 | nodeChr = nodeAsmArr[-1] 428 | nodeStart = arr[2] 429 | nodeEnd = arr[3] 430 | #print(arr) 431 | break 432 | mp = minipg.QueryNode(dbDir) 433 | mp.queryGene(int(node),nodeAsm) 434 | geneList = mp.geneList 435 | 436 | 437 | cov = [] 438 | if os.path.exists(dbCovFile): 439 | mp = minipg.QueryNode(dbDir) 440 | mp.queryDbCov(int(node)) 441 | cov = mp.ndCov 442 | else: 443 | covFile = os.path.join(dbDir,"cover.info") 444 | if os.path.exists(covFile): 445 | cov = [0 for i in header] 446 | with open(covFile) as cf: 447 | for line in cf: 448 | line = line.strip() 449 | arr = line.split("\t") 450 | if node == arr[0]: 451 | if arr[1] != "*": 452 | # 453 | oneArr = arr[1].split(",") 454 | if len(oneArr) < len(header): 455 | for i in oneArr: 456 | cov[int(i)] = 1.00; 457 | else: 458 | cov = oneArr 459 | break 460 | 461 | if len(arr) > 2: 462 | mDx = arr[2].split(",") 463 | mArr = arr[3].split(",") 464 | for k,x in zip(mDx,mArr): 465 | cov[int(k)] = x 466 | break 467 | 468 | 469 | return JsonResponse({'asm':header,'cov':cov,'nodeAsm':nodeAsm,'nodeChr':nodeChr,'nodeStart':nodeStart,'nodeEnd':nodeEnd,'geneList':geneList,'nodeSeq':nodeSeq}) 470 | 471 | def taskQuery(species): 472 | tkNum = 0 473 | upDir = os.path.join(BinDir,"upload") 474 | mDir = os.path.join(upDir,"mapping") 475 | if os.path.exists(mDir): 476 | taskList = os.listdir(mDir) 477 | for tdir in taskList: 478 | tkFile = os.path.join(mDir,tdir,"task.info") 479 | if not os.path.exists(tkFile): 480 | tkNum += 1 481 | 482 | dirList = os.listdir(upDir) 483 | for dir in dirList: 484 | if dir != "mapping": 485 | dirPath = os.path.join(upDir,dir) 486 | if os.path.isdir(dirPath): 487 | mDir = os.path.join(dirPath,"mapping") 488 | if os.path.exists(mDir): 489 | taskList = os.listdir(mDir) 490 | for tdir in taskList: 491 | tkFile = os.path.join(mDir,tdir,"task.info") 492 | if not os.path.exists(tkFile): 493 | tkNum += 1 494 | 495 | return tkNum 496 | 497 | def createTask(species): 498 | preDir = "" 499 | if species == "0": 500 | preDir = os.path.join(BinDir,"upload","mapping") 501 | else: 502 | preDir = os.path.join(BinDir,"upload",species,"mapping") 503 | if not os.path.exists(preDir): 504 | os.mkdir(preDir,0o755) 505 | 506 | taskID = "task_" + str(time.time()) + "_1" 507 | taskDir = os.path.join(preDir,taskID) 508 | num = 1 509 | while os.path.exists(taskDir): 510 | num += 1 511 | taskID = "task_" + str(time.time()) + "_" + str(num) 512 | taskDir = os.path.join(preDir,taskID) 513 | os.mkdir(taskDir,0o755) 514 | return taskID 515 | 516 | # 517 | def wQuerySeq(qSeq,taskDir): 518 | queryFile = os.path.join(taskDir,"query.fa") 519 | with open(queryFile,'w') as fh: 520 | fh.write(qSeq) 521 | 522 | def queryMap(minigraph,gfaFile,taskDir): 523 | queryFile = os.path.join(taskDir,"query.fa") 524 | outFile = os.path.join(taskDir,"query.gaf") 525 | # 526 | mType = " lr " 527 | secLine = "" 528 | with open(queryFile) as fh: 529 | firLine = fh.readline() 530 | if not firLine.startswith('>'): 531 | print("Error: query format error (not FASTA format)!") 532 | return 533 | try: 534 | secLine = fh.read() 535 | except: 536 | print("Error: query format error (not FASTA format)!") 537 | return 538 | 539 | if secLine != "": 540 | if len(secLine) < 200: 541 | 542 | mType = " sr " 543 | command = minigraph + " -cx" + mType + "--vc " + gfaFile + " " + queryFile + " -o " + outFile 544 | os.system(command) 545 | 546 | def map2path(taskDir): 547 | gafFile = os.path.join(taskDir,"query.gaf") 548 | pathFile = os.path.join(taskDir,"query.path") 549 | 550 | with open(gafFile) as gf,open(pathFile,'w') as pf: 551 | pathDict = {} 552 | for mapinfo in gf: 553 | mapinfo = mapinfo.strip() 554 | mapArr = mapinfo.split("\t") 555 | # 556 | mapStr = mapArr[5].replace("s","") 557 | if mapArr[0] not in pathDict: 558 | pathDict[mapArr[0]] = {} 559 | pathDict[mapArr[0]][int(mapArr[2])] = mapStr + "\t" + mapArr[2] + "\t" + mapArr[7] + "\t" + mapArr[18].split(":")[2] 560 | 561 | for k in pathDict: 562 | sortPath = sorted(pathDict[k].items(),key=lambda x : x[0]) 563 | for tpath in sortPath: 564 | pf.write(k+"\t"+tpath[1]+"\n") 565 | 566 | 567 | def map2Loci(preDir,gaf2rbed,edgeFile,taskDir): 568 | pathFile = os.path.join(taskDir,"query.path") 569 | outFile = os.path.join(taskDir,"query.bed") 570 | 571 | rChrFile = os.path.join(preDir,"chr.list") 572 | rnodeFile = os.path.join(preDir,"node.ref.bw") 573 | dxFile = os.path.join(preDir,"node.ref.bdx") 574 | edgeFile = os.path.join(preDir,"edge.info") 575 | 576 | command = gaf2rbed + " --chr " + rChrFile + " --rnode " + rnodeFile + " --dxnode " + dxFile + " --edge " + edgeFile + " --path " + pathFile + " --out " + outFile 577 | os.system(command) 578 | 579 | def readQyLoci(taskDir): 580 | bedFile = os.path.join(taskDir,"query.bed") 581 | locInfo = [] 582 | with open(bedFile) as fh: 583 | for line in fh: 584 | line = line.strip() 585 | arr = line.split("\t") 586 | locInfo.append(arr) 587 | return locInfo 588 | 589 | @csrf_exempt 590 | def seqQuery(request): 591 | para = request.POST 592 | querySeq = para.get("qSeq") 593 | species = para.get("species") 594 | 595 | upDir = os.path.join(BinDir,"upload") 596 | preDir = upDir 597 | if species != "0": 598 | preDir = os.path.join(upDir,species) 599 | 600 | gfaFile = "" 601 | edgeFile = "" 602 | taskDir = "" 603 | gfaFile = os.path.join(preDir,"input.ref.gfa") 604 | if not os.path.exists(gfaFile): 605 | return JsonResponse({"tkNum": -1}) 606 | 607 | tkNum = taskQuery(species) 608 | if tkNum > 2: 609 | return JsonResponse({"tkNum": tkNum}) 610 | taskID = createTask(species) 611 | 612 | edgeFile = os.path.join(preDir,"edge.info") 613 | taskDir = os.path.join(preDir,"mapping",taskID) 614 | wQuerySeq(querySeq,taskDir) 615 | minigraph = os.path.join(BinDir,"bin","minigraph") 616 | 617 | queryMap(minigraph,gfaFile,taskDir) 618 | map2path(taskDir) 619 | gaf2rbed = os.path.join(BinDir,"module","gaf2rbed") 620 | #gaf2rbed = os.path.join(BinDir,"bin","gaf2rbed") 621 | map2Loci(preDir,gaf2rbed,edgeFile,taskDir) 622 | #return taskDir 623 | locInfo = readQyLoci(taskDir) 624 | # 625 | tkFile = os.path.join(taskDir,"task.info") 626 | ofh = open(tkFile,'w') 627 | ofh.close() 628 | return JsonResponse({"taskID":taskID,"locInfo":locInfo,"tkNum":1}) 629 | 630 | 631 | 632 | 633 | 634 | 635 | 636 | 637 | 638 | -------------------------------------------------------------------------------- /module/anno.cpp: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | #include 5 | #include 6 | #include 7 | #include 8 | //#include 9 | //#include 10 | #include 11 | #include 12 | #include 13 | 14 | #include "gz.h" 15 | #include "vgraph.h" 16 | 17 | using namespace std; 18 | /* 19 | typedef struct{ 20 | char seqid[FIELDSIZE]; 21 | int start; 22 | int end; 23 | char geneID[FIELDSIZE]; 24 | char geneName[FIELDSIZE]; 25 | char strand; 26 | } AnnoLine; 27 | 28 | typedef struct{ 29 | long long offset; 30 | int num; 31 | } AnnoDx; 32 | */ 33 | //---------------------- 34 | typedef struct{ 35 | string seqid; 36 | int start; 37 | int end; 38 | string geneID; 39 | string geneName; 40 | string strand; 41 | } SimAnno; 42 | typedef struct{ 43 | int start; 44 | int end; 45 | } NdPos; 46 | 47 | // NdPos ChrAnnoPos 48 | 49 | bool byPos(SimAnno &a,SimAnno &b){ 50 | if(a.seqid < b.seqid){ 51 | return true; 52 | } 53 | if(a.seqid > b.seqid){ 54 | return false; 55 | } 56 | 57 | if(a.start < b.start){ 58 | return true; 59 | } 60 | if(a.start > b.start){ 61 | return false; 62 | } 63 | 64 | if(a.end < b.end){ 65 | return true; 66 | } 67 | return false; 68 | } 69 | 70 | string getSep(string &sepFile){ 71 | ifstream in(sepFile.c_str()); 72 | if(! in){ 73 | cerr<<"Error: file open failed. "<> sep; 78 | in.close(); 79 | return sep; 80 | } 81 | 82 | // asmChrPos 83 | void getAsmNode(string &nodeFile,string &sep,map > > &nodeChrPos,string &refStr){ 84 | ifstream in(nodeFile.c_str()); 85 | string line; 86 | string node,r_chr; 87 | int start,end,len,refOr; 88 | stringstream strStream; 89 | string preAsm = ""; 90 | string preChr = ""; 91 | bool isRef = false; 92 | getline(in,line); 93 | while(getline(in,line)){ 94 | strStream << line; 95 | strStream >> node; 96 | strStream >> r_chr; 97 | strStream >> start; 98 | strStream >> end; 99 | strStream >> len; 100 | strStream >> refOr; 101 | 102 | strStream.clear(); 103 | strStream.str(""); 104 | // 105 | string tName,t_hap,tchr; 106 | asmSplit(r_chr,sep,tName,t_hap,tchr); 107 | string asmStr = tName + sep + t_hap; 108 | 109 | if(asmStr != preAsm){ 110 | if(refOr == 0){ 111 | refStr = asmStr; 112 | isRef = true; 113 | }else{ 114 | isRef = false; 115 | } 116 | // 117 | if(! isRef){ 118 | map > tChrPos; 119 | vector tvec; 120 | NdPos tndpos{start,end}; 121 | tvec.push_back(tndpos); 122 | tChrPos.emplace(tchr,tvec); 123 | nodeChrPos.emplace(asmStr,tChrPos); 124 | } 125 | }else{ 126 | if(! isRef){ 127 | NdPos tndpos{start,end}; 128 | if(tchr != preChr){ 129 | vector tvec; 130 | tvec.push_back(tndpos); 131 | nodeChrPos[asmStr].emplace(tchr,tvec); 132 | }else{ 133 | nodeChrPos[asmStr][tchr].push_back(tndpos); 134 | } 135 | } 136 | } 137 | // 138 | preChr = tchr; 139 | preAsm = asmStr; 140 | } 141 | in.close(); 142 | } 143 | 144 | void rgSearch(vector &chrNode,vector &asmAnnoVec,int usize,size_t searchStart,size_t searchEnd,ofstream &out,int &total){ 145 | size_t sPos = 0; 146 | //bool flag = true; 147 | for(size_t i = searchStart; i < searchEnd; ++i){ 148 | size_t tPos = sPos; 149 | for(size_t s = sPos; s < chrNode.size(); ++s){ 150 | if(asmAnnoVec[i].end < chrNode[s].start){ 151 | tPos = s; 152 | break; 153 | }else{ 154 | if(asmAnnoVec[i].start <= chrNode[s].end){ 155 | AnnoLine anno; 156 | anno.seqid[FIELDSIZE-1] = '\0'; 157 | anno.geneID[FIELDSIZE-1] = '\0'; 158 | anno.geneName[FIELDSIZE-1] = '\0'; 159 | 160 | strncpy(anno.seqid,asmAnnoVec[i].seqid.c_str(),FIELDSIZE-1); 161 | strncpy(anno.geneID,asmAnnoVec[i].geneID.c_str(),FIELDSIZE-1); 162 | strncpy(anno.geneName,asmAnnoVec[i].geneName.c_str(),FIELDSIZE-1); 163 | anno.strand = asmAnnoVec[i].strand[0]; 164 | anno.start = asmAnnoVec[i].start; 165 | anno.end = asmAnnoVec[i].end; 166 | 167 | out.write((char *)&anno,usize); 168 | 169 | ++total; 170 | // 171 | tPos = s; 172 | break; 173 | } 174 | } 175 | } 176 | sPos = tPos; 177 | } 178 | } 179 | 180 | void simpGFF(string &gffFile,string &chrNameFile,map > &asmChrPos,bool isRef,string &outFile){ 181 | igzstream in(gffFile.c_str()); 182 | ofstream out(outFile.c_str()); 183 | if(! in){ 184 | cerr<<"Error: file open failed. "< chrMap; 197 | 198 | bool flag = false; 199 | if(chrNameFile != "NA"){ 200 | ifstream rn(chrNameFile.c_str()); 201 | if(! rn){ 202 | cerr<<"Error: file open failed. "<> pName; 208 | strStream >> gName; 209 | chrMap.emplace(pName,gName); 210 | strStream.clear(); 211 | strStream.str(""); 212 | } 213 | rn.close(); 214 | // 215 | flag = true; 216 | } 217 | // 218 | regex pat0("\\t"); 219 | regex pat1("ID=(.+?)(;|$)"); 220 | regex pat2("Name=(.+?)(;|$)"); 221 | regex pat3("gene$"); 222 | 223 | 224 | string preSeqID = ""; 225 | string gSeqID = ""; 226 | string preGid = ""; 227 | int usize = sizeof(AnnoLine); 228 | string seqID,type,strand,attr,geneID,geneName; 229 | 230 | bool chrFind = false; 231 | 232 | vector asmAnnoVec; 233 | //map > allGene; 234 | 235 | int total = 0; 236 | out.write((char *)&total,sizeof(int)); 237 | // 238 | while(getline(in,line)){ 239 | if(line[0] != '#'){ 240 | sregex_token_iterator pos(line.begin(),line.end(),pat0,-1); 241 | sregex_token_iterator pend; 242 | int i = 0; 243 | int start = 0,end = 0; 244 | while(pos != pend){ 245 | switch(i){ 246 | case 0: 247 | seqID = *pos; 248 | break; 249 | case 2: 250 | type = *pos; 251 | break; 252 | case 3: 253 | start = atoi((*pos).str().c_str()); 254 | break; 255 | case 4: 256 | end = atoi((*pos).str().c_str()); 257 | break; 258 | case 6: 259 | strand = *pos; 260 | break; 261 | case 8: 262 | attr = *pos; 263 | } 264 | // 265 | ++pos; 266 | ++i; 267 | } 268 | // 269 | if(start < 1 || end < 1){ 270 | cerr<<"Warning: failed to get feature position. "< 0 && gSeqID != preGid){ 349 | searchEnd = k; 350 | // 351 | vector &chrNode = asmChrPos[preGid]; 352 | rgSearch(chrNode,asmAnnoVec,usize,searchStart,searchEnd,out,total); 353 | // 354 | searchStart = k; 355 | } 356 | // 357 | preGid = gSeqID; 358 | } 359 | // 360 | searchEnd = asmAnnoVec.size(); 361 | if(searchEnd > 0){ 362 | vector &chrNode = asmChrPos[preGid]; 363 | rgSearch(chrNode,asmAnnoVec,usize,searchStart,searchEnd,out,total); 364 | } 365 | } 366 | // 367 | out.seekp(0,ios::beg); 368 | out.write((char *)&total,sizeof(int)); 369 | // 370 | in.close(); 371 | out.close(); 372 | } 373 | 374 | void simAllGFF(const char *asmListFile,const char *gffListFile,string &nodeFile,string &sep,string &annoDir){ 375 | ifstream af(asmListFile); 376 | ifstream gf(gffListFile); 377 | 378 | string line; 379 | stringstream strStream; 380 | int i = 0; 381 | map asmMap; 382 | while(getline(af,line)){ 383 | asmMap.emplace(line,i); 384 | ++i; 385 | } 386 | 387 | map > > nodeChrPos; 388 | string refStr = ""; 389 | getAsmNode(nodeFile,sep,nodeChrPos,refStr); 390 | 391 | string asmb,gffFile,chrNameFile; 392 | while(getline(gf,line)){ 393 | strStream << line; 394 | 395 | strStream >> asmb; 396 | strStream >> gffFile; 397 | strStream >> chrNameFile; 398 | 399 | string outFile = annoDir + "/" + to_string(asmMap[asmb]) + ".anno.bw"; 400 | cout<<"Process -- "< > asmChrPos; 404 | bool isRef = true; 405 | simpGFF(gffFile,chrNameFile,asmChrPos,isRef,outFile); 406 | }else{ 407 | if(nodeChrPos.find(asmb) == nodeChrPos.end()){ 408 | cout<<"Warning: "< > &asmChrPos = nodeChrPos[asmb]; 412 | simpGFF(gffFile,chrNameFile,asmChrPos,isRef,outFile); 413 | } 414 | } 415 | strStream.clear(); 416 | strStream.str(""); 417 | } 418 | // 419 | af.close(); 420 | gf.close(); 421 | 422 | } 423 | 424 | // anno.bdx, anno.bw 425 | 426 | int getMax(const char *nodeFile){ 427 | ifstream nfh(nodeFile); 428 | stringstream strStream; 429 | string line; 430 | getline(nfh,line); 431 | int node; 432 | int i = 0; 433 | int maxNode = 0; 434 | while(getline(nfh,line)){ 435 | strStream << line; 436 | strStream >> node; 437 | strStream.clear(); 438 | strStream.str(""); 439 | // 440 | if(i == 0){ 441 | maxNode = node; 442 | }else{ 443 | if(node > maxNode){ 444 | maxNode = node; 445 | } 446 | } 447 | ++i; 448 | } 449 | nfh.close(); 450 | return maxNode; 451 | } 452 | 453 | void findGene(const char *nodeFile,const char *asmListFile,string &annoDir,string &sep,const char *bdxFile,const char *numFile){ 454 | ifstream af(asmListFile); 455 | ifstream nfh(nodeFile); 456 | ofstream bdx(bdxFile); 457 | ofstream mfh(numFile); 458 | 459 | int i = 0; 460 | map asmMap; 461 | string line; 462 | while(getline(af,line)){ 463 | asmMap.emplace(line,i); 464 | ++i; 465 | } 466 | af.close(); 467 | // 468 | stringstream strStream; 469 | getline(nfh,line); 470 | string r_chr; 471 | int node,start,end; 472 | string preSeq = ""; 473 | AnnoLine *asmAnno = nullptr; 474 | bool flag = false; 475 | int maxNode = getMax(nodeFile); 476 | AnnoDx *dxArr = new AnnoDx[maxNode]; 477 | //AnnoDx zero = {0LL,0}; 478 | long long offset = 0LL; 479 | int intSize = sizeof(int); 480 | int uSize = sizeof(AnnoDx); 481 | string tName = "",t_hap = "",tchr = ""; 482 | int nline = 0; 483 | //int pSearchStart = 0; 484 | int searchStart = 0,searchEnd = 0,moveStart = 0; 485 | 486 | map sChrMap; 487 | string p_tchr = ""; 488 | bool asmFind = false; 489 | while(getline(nfh,line)){ 490 | strStream << line; 491 | strStream >> node; 492 | strStream >> r_chr; 493 | strStream >> start; 494 | strStream >> end; 495 | strStream.clear(); 496 | strStream.str(""); 497 | // 498 | tName = "",t_hap = "",tchr = ""; 499 | asmSplit(r_chr,sep,tName,t_hap,tchr); 500 | string asmStr = tName + sep + t_hap; 501 | if(asmStr != preSeq){ 502 | if(flag){ 503 | delete []asmAnno; 504 | flag = false; 505 | sChrMap.clear(); 506 | } 507 | // 508 | string annoFile = annoDir + "/" + to_string(asmMap[asmStr]) + ".anno.bw"; 509 | ifstream in(annoFile.c_str()); 510 | if(! in){ 511 | cerr<<"Warning: Annotation of "< 0){ 529 | if(sChrMap.find(preChr) == sChrMap.end()){ 530 | NdPos txpos = {tstart,k-1}; 531 | sChrMap.emplace(preChr,txpos); 532 | } 533 | //else cerr<<"Error: items in GFF was not ordered by chromosome/scaffold."< posVec; 552 | if(asmFind){ 553 | bool fir = true; 554 | bool chrFind = true; 555 | if(tchr != p_tchr){ 556 | if(sChrMap.find(tchr) != sChrMap.end()){ 557 | searchStart = sChrMap[tchr].start; 558 | searchEnd = sChrMap[tchr].end; 559 | chrFind = true; 560 | moveStart = searchStart; 561 | }else{ 562 | chrFind = false; 563 | } 564 | }else{ 565 | if(chrFind){ 566 | searchStart = moveStart; 567 | } 568 | } 569 | //for(int i = pSearchStart; i < nline; ++i){ 570 | if(chrFind){ 571 | for(int i = searchStart; i <= searchEnd; ++i){ 572 | //if(strcmp(asmAnno[i].seqid,tchr.c_str()) == 0){ 573 | if(asmAnno[i].end >= start){ 574 | if(asmAnno[i].start <= end){ 575 | posVec.push_back(i); 576 | if(fir){ 577 | moveStart = i; 578 | fir = false; 579 | } 580 | }else{ 581 | break; 582 | } 583 | } 584 | //}else{ 585 | // break; 586 | //} 587 | } 588 | } 589 | } 590 | // pSearchStart = searchStart; 591 | for(auto tpos : posVec){ 592 | mfh.write((char *)&tpos,intSize); 593 | } 594 | 595 | int tnum = posVec.size(); 596 | //AnnoDx tdx = {offset,tnum}; 597 | dxArr[node-1].offset = offset; 598 | dxArr[node-1].num = tnum; 599 | offset += posVec.size() * intSize; 600 | // 601 | preSeq = asmStr; 602 | p_tchr = tchr; 603 | } 604 | bdx.write((char *)dxArr,uSize*maxNode); 605 | delete []dxArr; 606 | if(flag){ 607 | delete []asmAnno; 608 | } 609 | // 610 | nfh.close(); 611 | mfh.close(); 612 | bdx.close(); 613 | } 614 | 615 | 616 | void ndg_usage(){ 617 | cout<<"Usage: GraphAnno nodeGene --gffList --upDir "< Input file containing three columns separated by whitespace(assemlby name, absolute file path of GFF file, absolute file path of translation table file)." 619 | "Translation table is used to make the chromosome/contig names in GFF file matching that in graph file. This file contain two columns " 620 | "separated by whitespace (chromosome/contig name in GFF file, chromosome/contig name in graph file). If chromosome name transformation " 621 | "is not needed the file path of the translation table can be set to NA in the gffList file."< 'upload' directory which including files generated by 'gfa2view' or 'vrpg_preprocess.py'."< 2 | #include 3 | #include 4 | #include 5 | #include 6 | #include 7 | #include 8 | #include "vgraph.h" 9 | //#include "gz.h" 10 | 11 | using namespace std; 12 | 13 | typedef struct{ 14 | int start; 15 | int end; 16 | 17 | string name; 18 | float score; 19 | char strand; 20 | } BedInfo; 21 | 22 | // BedInfo,SimBed 23 | template 24 | bool bedByPos(T &a, T &b){ 25 | if(a.start < b.start){ 26 | return true; 27 | } 28 | if(a.start == b.start){ 29 | if(a.end < b.end){ 30 | return true; 31 | } 32 | } 33 | return false; 34 | } 35 | 36 | typedef struct{ 37 | int start; 38 | int end; 39 | string name; 40 | float score; 41 | char strand; 42 | uint16_t layer; 43 | } SimBed; 44 | 45 | //By default, features with layer > 50 for each track will not be output. Change the default behaviour by '--layer' 46 | int getTrackLayer(int kLayer,int layStart,vector &chrVec,map > &allChrGene,ofstream &out){ 47 | int gLayMax = 1; 48 | for(string &tchr : chrVec ){ 49 | vector &chrGene = allChrGene[tchr]; 50 | sort(chrGene.begin(),chrGene.end(),bedByPos); 51 | // 52 | int maxLayer = 1; 53 | size_t sPos = 0; 54 | bool overlap = false; 55 | vector layVec; 56 | for(size_t k = 0; k < chrGene.size(); ++k){ 57 | // Does the focused gene have an overlap with previous genes ? 58 | // no overp; new start 59 | vector layerSta(maxLayer,0); 60 | overlap = false; 61 | for(size_t m = sPos; m < k; ++m){ 62 | //chrGene[k].start 63 | //chrGene[k].end 64 | if(chrGene[k].start <= chrGene[m].end && chrGene[k].end >= chrGene[m].start){ 65 | layerSta[layVec[m - sPos] - 1] = 1; 66 | overlap = true; 67 | } 68 | } 69 | if(overlap){ 70 | bool fir = true; 71 | for(int n = 0; n < maxLayer; ++n){ 72 | //if(fir){ 73 | if(layerSta[n] == 0){ 74 | layVec.push_back(n+1); 75 | fir = false; 76 | break; 77 | } 78 | //} 79 | } 80 | if(fir){ 81 | ++maxLayer; 82 | layVec.push_back(maxLayer); 83 | if(maxLayer > gLayMax){ 84 | gLayMax = maxLayer; 85 | } 86 | } 87 | }else{ 88 | // pile up genes 89 | // ------- --------- 90 | // --------------------- 91 | for(size_t j = sPos; j < k; ++j){ 92 | int tlayer = layVec[j - sPos] + layStart; 93 | if(tlayer <= kLayer){ 94 | out< kLayer){ 117 | gLayMax = kLayer; 118 | } 119 | return gLayMax; 120 | } 121 | 122 | void setTrackLayer(int layStart,vector &chrVec,map > &allChrGene,ofstream &out){ 123 | int tlayer = layStart + 1; 124 | for(string &tchr : chrVec ){ 125 | vector &chrGene = allChrGene[tchr]; 126 | for(size_t k = 0; k < chrGene.size(); ++k){ 127 | out< > allChrGene; 164 | vector chrVec; 165 | string trackName = "*"; 166 | string trackDes = "*"; 167 | int tnum = 0; 168 | int cumLayMax = 0; 169 | while(getline(in,line)){ 170 | if(line[0] == '#'){ 171 | continue; 172 | } 173 | if(regex_search(line,pat1)){ 174 | if(trackNum > 0){ 175 | int layStart = cumLayMax; 176 | if(tnum > 4){ 177 | setTrackLayer(layStart,chrVec,allChrGene,out); 178 | cumLayMax += 1; 179 | }else{ 180 | cumLayMax += getTrackLayer(kLayer,layStart,chrVec,allChrGene,out); 181 | } 182 | 183 | allChrGene.clear(); 184 | chrVec.clear(); 185 | //#nColumns layer track_name description 186 | kfh< vec; 207 | while(pos != pend){ 208 | vec.push_back(pos->str()); 209 | ++pos; 210 | } 211 | // 212 | tnum = vec.size(); 213 | if(tnum < 3){ 214 | cerr<<"Error: bed format error. The total number of columns should be >= 3. "< chromStart. "< tvec; 257 | tvec.push_back(tgene); 258 | allChrGene.emplace(vec[0],tvec); 259 | // 260 | chrVec.push_back(vec[0]); 261 | } 262 | }else{ 263 | allChrGene[vec[0]].push_back(tgene); 264 | } 265 | // 266 | preSeqID = vec[0]; 267 | } 268 | } 269 | // 270 | if(trackNum > 0){ 271 | int layStart = cumLayMax; 272 | if(tnum > 4){ 273 | setTrackLayer(layStart,chrVec,allChrGene,out); 274 | cumLayMax += 1; 275 | }else{ 276 | cumLayMax += getTrackLayer(kLayer,layStart,chrVec,allChrGene,out); 277 | } 278 | //#nColumns layer track_name description 279 | kfh< &refChrMap,map > &allGeneLay){ 291 | ifstream gf(geneFile.c_str()); 292 | if(! gf){ 293 | cerr<<"Error: file open failed. "< vec; 313 | vec.reserve(7); 314 | while(pos != pend){ 315 | vec.push_back(pos->str()); 316 | ++pos; 317 | } 318 | tchr = vec[0]; 319 | start = atoi(vec[1].c_str()); 320 | end = atoi(vec[2].c_str()); 321 | name = vec[3]; 322 | score = atof(vec[4].c_str()); 323 | strand = vec[5][0]; 324 | layer = atoi(vec[6].c_str()); 325 | 326 | if(layer > 65535){ 327 | cerr<<"Warning: layer > 65535. Too many RNA or too many overlapping genes. The item ("< tvec; 347 | tvec.push_back(tsim); 348 | allGeneLay.emplace(chrpos,tvec); 349 | } 350 | }else{ 351 | flag = false; 352 | } 353 | }else{ 354 | if(flag){ 355 | allGeneLay[chrpos].push_back(tsim); 356 | } 357 | } 358 | preChr = tchr; 359 | } 360 | gf.close(); 361 | // 362 | int tkNum = 0; 363 | ifstream kfh(tkDesFile.c_str()); 364 | while(getline(kfh,line)){ 365 | tkNum++; 366 | } 367 | kfh.close(); 368 | // 369 | if(tkNum > 1){ 370 | for(auto &tm : allGeneLay){ 371 | sort(tm.second.begin(),tm.second.end(),bedByPos); 372 | } 373 | } 374 | } 375 | 376 | void getBdDxRef(string &chrListFile,map &refChrMap){ 377 | ifstream in(chrListFile.c_str()); 378 | int i = 0; 379 | string chrLine; 380 | while(getline(in,chrLine)){ 381 | int tpos = chrLine.find("\t"); 382 | string tchr = chrLine.substr(0,tpos); 383 | refChrMap.emplace(tchr,i); 384 | ++i; 385 | } 386 | in.close(); 387 | if(refChrMap.empty()){ 388 | cerr<<"Error: file is empty. "< refChrMap; 406 | getBdDxRef(chrListFile,refChrMap); 407 | // 408 | map > allGeneLay; 409 | getGeneMap(tkDesFile,geneFile,refChrMap,allGeneLay); 410 | //--------------------------------------------- 411 | ifstream rxfh(rndDxFile.c_str()); 412 | if(! rxfh){ 413 | cerr<<"Error: file open failed. "< allchr; 428 | allchr.reserve(nchr); 429 | map chrRanMap; 430 | for(int t = 0; t < nchr; ++t){ 431 | int tchr; 432 | ChrRange cRange; 433 | rxfh.read((char *)&tchr,intSize); 434 | rxfh.read((char *)&cRange,crSize); 435 | 436 | odx.write((char *)&tchr,intSize); 437 | odx.write((char *)&cRange,crSize); 438 | 439 | allchr.push_back(tchr); 440 | chrRanMap.emplace(tchr,cRange); 441 | } 442 | // 443 | int oneSize = sizeof(OneRange); 444 | //int llSize = sizeof(long long); 445 | //int dxByte = intSize + (intSize + crSize) * refChrMap.size(); 446 | long long ndByte = 0,ndUnit = sizeof(BedNode); 447 | for(int xchr : allchr){ 448 | ChrRange cRange = chrRanMap[xchr]; 449 | // 450 | vector acrVec; 451 | acrVec.reserve(cRange.ranNum); 452 | int chrNdNum = 0; 453 | for(int k = 0; k < cRange.ranNum; ++k){ 454 | OneRange aRange; 455 | rxfh.read((char *)&aRange,oneSize); 456 | RanPos tpos = {aRange.ranStart,aRange.ranEnd}; 457 | acrVec.push_back(tpos); 458 | chrNdNum += aRange.ranNum; 459 | } 460 | // 461 | bool fdChr = false; 462 | map >::iterator it; 463 | it = allGeneLay.find(xchr); 464 | if(it != allGeneLay.end()){ 465 | fdChr = true; 466 | }else{ 467 | cout<<"Warning: reference chromosome in the 'chr.list' can't be found in the annotation file. "< ntNode; 470 | vector chrRnode; 471 | chrRnode.reserve(chrNdNum); 472 | for(int j = 0; j < chrNdNum; ++j){ 473 | int node,ndStart,ndEnd; 474 | rnfh.read((char *)&node,intSize); 475 | rnfh.read((char *)&ndStart,intSize); 476 | rnfh.read((char *)&ndEnd,intSize); 477 | // 478 | RNode trnode = {node,ndStart,ndEnd}; 479 | chrRnode.push_back(trnode); 480 | //ntNode.insert(node); 481 | } 482 | //-------------------------- 483 | int sPos = 0; 484 | size_t aPos = 0; 485 | if(fdChr){ 486 | map > gdCutMap; 487 | // 488 | for(size_t i = 0; i < (it->second).size(); ++i){ 489 | BedNode gnode; 490 | bool fnd1 = false, fnd2 = false; 491 | gnode.node1 = 0; 492 | gnode.node2 = 0; 493 | // feature boundary 494 | for(int x = sPos; x < chrNdNum; ++x){ 495 | if((it->second)[i].start >= chrRnode[x].start){ 496 | if((it->second)[i].start <= chrRnode[x].pend){ 497 | gnode.node1 = chrRnode[x].node; 498 | gnode.reStart1 = (it->second)[i].start - chrRnode[x].start; 499 | fnd1 = true; 500 | // 501 | sPos = x; 502 | // 503 | if((it->second)[i].end <= chrRnode[x].pend){ 504 | gnode.node2 = chrRnode[x].node; 505 | gnode.reStart2 = (it->second)[i].end - chrRnode[x].start; 506 | fnd2 = true; 507 | break; 508 | } 509 | }else{ 510 | // check 511 | sPos = x + 1; 512 | } 513 | }else{ 514 | if((it->second)[i].end >= chrRnode[x].start){ 515 | if((it->second)[i].end <= chrRnode[x].pend){ 516 | gnode.node2 = chrRnode[x].node; 517 | gnode.reStart2 = (it->second)[i].end - chrRnode[x].start; 518 | fnd2 = true; 519 | break; 520 | } 521 | }else{ 522 | break; 523 | } 524 | 525 | } 526 | } 527 | // 528 | if(fnd1 || fnd2){ 529 | gnode.name[FIELDSIZE-1] = '\0'; 530 | strncpy(gnode.name,(it->second)[i].name.c_str(),FIELDSIZE-1); 531 | gnode.layer = (it->second)[i].layer; 532 | gnode.strand = (it->second)[i].strand; 533 | gnode.score = (it->second)[i].score; 534 | // assign feature to segmentation 535 | for(size_t k = aPos; k < acrVec.size(); ++k){ 536 | if((it->second)[i].start < acrVec[k].start){ 537 | if((it->second)[i].end >= acrVec[k].start){ 538 | if(gdCutMap.find(k) == gdCutMap.end()){ 539 | vector gvec; 540 | gvec.push_back(gnode); 541 | gdCutMap.emplace(k,gvec); 542 | }else{ 543 | gdCutMap[k].push_back(gnode); 544 | } 545 | }else{ 546 | break; 547 | } 548 | }else{ 549 | if((it->second)[i].start <= acrVec[k].pend){ 550 | if(gdCutMap.find(k) == gdCutMap.end()){ 551 | vector gvec; 552 | gvec.push_back(gnode); 553 | gdCutMap.emplace(k,gvec); 554 | }else{ 555 | gdCutMap[k].push_back(gnode); 556 | } 557 | }else{ 558 | aPos = k + 1; 559 | } 560 | } 561 | } 562 | } 563 | } 564 | // 565 | 566 | for(size_t k = 0; k < acrVec.size(); ++k){ 567 | int num = 0; 568 | if(gdCutMap.find(k) != gdCutMap.end()){ 569 | for(BedNode &gd : gdCutMap[k]){ 570 | ov.write((char *)&gd,ndUnit); 571 | } 572 | num = gdCutMap[k].size(); 573 | } 574 | OneRange aRange; 575 | aRange.ranStart = acrVec[k].start; 576 | aRange.ranEnd = acrVec[k].pend; 577 | aRange.offByte = ndByte; 578 | aRange.ranNum = num; 579 | odx.write((char *)&aRange,oneSize); 580 | // 581 | ndByte += ndUnit * num; 582 | } 583 | }else{ 584 | for(size_t k = 0; k < acrVec.size(); ++k){ 585 | int num = 0; 586 | 587 | OneRange aRange; 588 | aRange.ranStart = acrVec[k].start; 589 | aRange.ranEnd = acrVec[k].pend; 590 | aRange.offByte = ndByte; 591 | aRange.ranNum = num; 592 | odx.write((char *)&aRange,oneSize); 593 | } 594 | } 595 | } 596 | 597 | rxfh.close(); 598 | rnfh.close(); 599 | ov.close(); 600 | odx.close(); 601 | } 602 | 603 | //outDir 604 | void dxRefNodeBed(int kLayer,char *inBed,char *chrMapFile,string &upDir){ 605 | 606 | string bedFile = upDir + "/simplify.bed"; 607 | string tkDesFile = upDir + "/track.info"; 608 | string rndFile = upDir + "/node.ref.bw"; 609 | string rndDxFile = upDir + "/node.ref.bdx"; 610 | 611 | string ovFile = upDir + "/bed.ref.bw"; 612 | string gDxFile = upDir + "/bed.ref.bdx"; 613 | 614 | string chrListFile = upDir + "/chr.list"; 615 | 616 | //if(access(geneFile.c_str(),F_OK) != 0){ 617 | reduceBed(kLayer,inBed,tkDesFile,bedFile); 618 | //} 619 | indexNodeBed(tkDesFile,rndFile,rndDxFile,bedFile,chrListFile,ovFile,gDxFile); 620 | } 621 | 622 | // Chr start end feature strand 623 | void addBed_usage(){ 624 | cout<<"Usage: GraphAnno addBed --inBed --upDir "< Input bed file."< 'upload' directory which including files generated by 'gfa2view' or 'vrpg_preprocess.py'"< maximum layers for each track, by default: 50"< 3 | #include 4 | #include 5 | #include 6 | #include 7 | #include 8 | #include "vgraph.h" 9 | #include "gz.h" 10 | 11 | using namespace std; 12 | 13 | // replace unknown with U 14 | 15 | typedef struct{ 16 | int start; 17 | int end; 18 | string id; 19 | string name; 20 | char strand; 21 | uint8_t num; 22 | } GeneInfo; 23 | // id name strand chr start end 24 | bool byPos(GeneInfo &a,GeneInfo &b){ 25 | if(a.start < b.start){ 26 | return true; 27 | } 28 | if(a.start == b.start){ 29 | if(a.end < b.end){ 30 | return true; 31 | } 32 | } 33 | return false; 34 | } 35 | // 36 | // chr start end id name strand layer 37 | // node gene_name gene_start gene_end gene_layer 38 | // ----------- 39 | // | a |--- 40 | // | b | 41 | // node gene_name relative_start gene_len gene_layer 42 | 43 | typedef struct{ 44 | int start; 45 | int end; 46 | string name; 47 | char type; 48 | char strand; 49 | uint8_t layer; 50 | uint8_t num; 51 | } SimGene; 52 | 53 | void chrNameTrans(char *chrMapFile,map &transMap){ 54 | ifstream in(chrMapFile); 55 | if(! in){ 56 | cerr<<"Error: file open failed. "<> gffChr; 68 | strStream >> graphChr; 69 | strStream.clear(); 70 | strStream.str(""); 71 | if(transMap.find(gffChr) == transMap.end()){ 72 | transMap.emplace(gffChr,graphChr); 73 | }else{ 74 | cout<<"Warning: duplicated chromosome. "< transMap; 97 | bool trans = false; 98 | if(chrMapFile != nullptr){ 99 | trans = true; 100 | chrNameTrans(chrMapFile,transMap); 101 | } 102 | // 103 | regex pat0("\\t"); 104 | regex pat1("ID=(.+?)(;|$)"); 105 | regex pat2("Name=(.+?)(;|$)"); 106 | regex pat3("gene$"); 107 | regex pat4("Parent=(.+?)(;|$)"); 108 | string seqID,type,strand,attr,geneID,geneName,parent; 109 | string preSeqID = ""; 110 | string preGeneID = ""; 111 | 112 | string line; 113 | map > allChrGene; 114 | vector chrVec; 115 | 116 | map > emap,cmap,gmap; 117 | vector rParent; 118 | int gStart = 0,gEnd = 0; 119 | map rnaCtmap; 120 | // no exon or no CDS 121 | int rnaCount = 0; 122 | while(getline(in,line)){ 123 | if(line[0] != '#' && line[0] != '\0'){ 124 | sregex_token_iterator pos(line.begin(),line.end(),pat0,-1); 125 | sregex_token_iterator pend; 126 | int i = 0; 127 | bool flag = false, eflag = false, cflag = false; 128 | int start = 0,end = 0; 129 | while(pos != pend){ 130 | switch(i){ 131 | case 0: 132 | seqID = *pos; 133 | break; 134 | case 2: 135 | type = *pos; 136 | break; 137 | case 3: 138 | start = atoi((*pos).str().c_str()); 139 | break; 140 | case 4: 141 | end = atoi((*pos).str().c_str()); 142 | break; 143 | case 6: 144 | strand = *pos; 145 | break; 146 | case 8: 147 | attr = *pos; 148 | break; 149 | default: 150 | break; 151 | } 152 | // 153 | if(i == 2){ 154 | if(regex_search(type,pat3)){ 155 | flag = true; 156 | }else if(type == "exon"){ 157 | eflag = true; 158 | }else if(type == "CDS"){ 159 | cflag = true; 160 | }else{ 161 | flag = false; 162 | break; 163 | } 164 | } 165 | ++pos; 166 | ++i; 167 | } 168 | // 169 | if(flag){ 170 | if(preGeneID != ""){ 171 | rnaCtmap.emplace(preGeneID,rParent.size()); 172 | // 173 | vector transcript; 174 | for(string &par : rParent){ 175 | int x = 1; 176 | if(emap.find(par) != emap.end()){ 177 | sort(emap[par].begin(),emap[par].end(),byPos); 178 | for(GeneInfo &bk : emap[par]){ 179 | bk.num = x; 180 | transcript.push_back(bk); 181 | // 182 | ++x; 183 | } 184 | } 185 | // 186 | x = 1; 187 | if(cmap.find(par) != cmap.end()){ 188 | sort(cmap[par].begin(),cmap[par].end(),byPos); 189 | for(GeneInfo &bk : cmap[par]){ 190 | bk.num = x; 191 | transcript.push_back(bk); 192 | // 193 | ++x; 194 | } 195 | } 196 | } 197 | // 198 | gmap.emplace(preGeneID,transcript); 199 | } 200 | // 201 | emap.clear(); 202 | cmap.clear(); 203 | rParent.clear(); 204 | rnaCount = 0; 205 | // 206 | if(start < 1 || end < 1){ 207 | cerr<<"Error: failed to get feature position. "< tvec; 234 | tvec.push_back(tgene); 235 | allChrGene.emplace(seqID,tvec); 236 | // 237 | chrVec.push_back(seqID); 238 | } 239 | }else{ 240 | allChrGene[seqID].push_back(tgene); 241 | } 242 | preSeqID = seqID; 243 | // 244 | preGeneID = geneID; 245 | gStart = start; 246 | gEnd = end; 247 | }else if(eflag){ 248 | if(start < 1 || end < 1){ 249 | cerr<<"Error: failed to get feature position. "<= gStart && end <= gEnd){ 254 | smatch s; 255 | if(regex_search(attr,s,pat4)){ 256 | parent = s[1]; 257 | if(emap.find(parent) == emap.end()){ 258 | if(rnaCount < rnaMax){ 259 | vector tg; 260 | tg.push_back({start,end,parent,"E",strand[0]}); 261 | emap.emplace(parent,tg); 262 | rParent.push_back(parent); 263 | ++rnaCount; 264 | } 265 | }else{ 266 | emap[parent].push_back({start,end,parent,"E",strand[0]}); 267 | } 268 | }else{ 269 | cerr<<"Error: attribute 'Parent' was not defined. "<= gStart && end <= gEnd){ 282 | smatch s; 283 | if(regex_search(attr,s,pat4)){ 284 | parent = s[1]; 285 | if(cmap.find(parent) == cmap.end()){ 286 | if(emap.find(parent) == emap.end()){ 287 | if(rnaCount < rnaMax){ 288 | vector tg; 289 | tg.push_back({start,end,parent,"C",strand[0]}); 290 | cmap.emplace(parent,tg); 291 | // 292 | rParent.push_back(parent); 293 | ++rnaCount; 294 | } 295 | }else{ 296 | vector tg; 297 | tg.push_back({start,end,parent,"C",strand[0]}); 298 | cmap.emplace(parent,tg); 299 | } 300 | }else{ 301 | cmap[parent].push_back({start,end,parent,"C",strand[0]}); 302 | } 303 | }else{ 304 | cerr<<"Error: attribute 'Parent' was not defined. "< transcript; 319 | for(string &par : rParent){ 320 | int x = 1; 321 | if(emap.find(par) != emap.end()){ 322 | sort(emap[par].begin(),emap[par].end(),byPos); 323 | for(GeneInfo &bk : emap[par]){ 324 | bk.num = x; 325 | transcript.push_back(bk); 326 | // 327 | ++x; 328 | } 329 | } 330 | // 331 | x = 1; 332 | if(cmap.find(par) != cmap.end()){ 333 | sort(cmap[par].begin(),cmap[par].end(),byPos); 334 | for(GeneInfo &bk : cmap[par]){ 335 | bk.num = x; 336 | transcript.push_back(bk); 337 | // 338 | ++x; 339 | } 340 | } 341 | } 342 | // 343 | gmap.emplace(preGeneID,transcript); 344 | } 345 | // 346 | emap.clear(); 347 | cmap.clear(); 348 | rParent.clear(); 349 | for(string &tchr : chrVec ){ 350 | vector &chrGene = allChrGene[tchr]; 351 | sort(chrGene.begin(),chrGene.end(),byPos); 352 | // 353 | int maxLayer = 1; 354 | size_t sPos = 0; 355 | bool overlap = false; 356 | vector layVec; 357 | for(size_t k = 0; k < chrGene.size(); ++k){ 358 | // Does current gene overlaps with previous genes ? 359 | // no overlap; new start 360 | vector layerSta(maxLayer,0); 361 | overlap = false; 362 | for(size_t m = sPos; m < k; ++m){ 363 | if(chrGene[k].start <= chrGene[m].end && chrGene[k].end >= chrGene[m].start){ 364 | layerSta[layVec[m - sPos] - 1] = 1; 365 | overlap = true; 366 | } 367 | } 368 | if(overlap){ 369 | bool fir = true; 370 | for(int n = 0; n < maxLayer; ++n){ 371 | //if(fir){ 372 | if(layerSta[n] == 0){ 373 | layVec.push_back(n+1); 374 | fir = false; 375 | break; 376 | } 377 | //} 378 | } 379 | if(fir){ 380 | ++maxLayer; 381 | layVec.push_back(maxLayer); 382 | } 383 | }else{ 384 | // ------- --------- 385 | // --------------------- 386 | // max rna count 387 | // glayer --------------> layCum 388 | if(k > 0){ 389 | map layMax, layCum; 390 | for(size_t p = sPos; p < k; ++p){ 391 | int rnaNum = rnaCtmap[chrGene[p].id]; 392 | int glayer = layVec[p - sPos]; 393 | 394 | if(layMax.find(glayer) == layMax.end()){ 395 | layMax.emplace(glayer,rnaNum); 396 | }else{ 397 | if(rnaNum > layMax[glayer]){ 398 | layMax[glayer] = rnaNum; 399 | } 400 | } 401 | } 402 | // 403 | if(! layMax.empty()){ 404 | int tnum = 0; 405 | for(auto &itm : layMax){ 406 | layCum.emplace(itm.first,tnum+1); 407 | tnum += itm.second + 1; 408 | } 409 | // 410 | for(size_t j = sPos; j < k; ++j){ 411 | int tlayer = layCum[layVec[j - sPos]]; 412 | out< layMax, layCum; 438 | for(size_t p = sPos; p < chrGene.size(); ++p){ 439 | int rnaNum = rnaCtmap[chrGene[p].id]; 440 | int glayer = layVec[p - sPos]; 441 | 442 | if(layMax.find(glayer) == layMax.end()){ 443 | layMax[glayer] = rnaNum; 444 | }else{ 445 | if(rnaNum > layMax[glayer]){ 446 | layMax[glayer] = rnaNum; 447 | } 448 | } 449 | } 450 | // 451 | if(! layMax.empty()){ 452 | int tnum = 0; 453 | for(auto &itm : layMax){ 454 | layCum.emplace(itm.first,tnum+1); 455 | tnum += itm.second + 1; 456 | } 457 | // 458 | for(size_t j = sPos; j < chrGene.size(); ++j){ 459 | int tlayer = layCum[layVec[j - sPos]]; 460 | out< &refChrMap){ 482 | ifstream in(chrListFile.c_str()); 483 | int i = 0; 484 | string chrLine; 485 | while(getline(in,chrLine)){ 486 | int tpos = chrLine.find("\t"); 487 | string tchr = chrLine.substr(0,tpos); 488 | refChrMap.emplace(tchr,i); 489 | ++i; 490 | } 491 | in.close(); 492 | if(refChrMap.empty()){ 493 | cerr<<"Error: file is empty. "< &refChrMap,map > &allGeneLay){ 502 | ifstream gf(geneFile.c_str()); 503 | if(! gf){ 504 | cerr<<"Error: file open failed. "<> tchr; 520 | strStream >> start; 521 | strStream >> end; 522 | strStream >> id; 523 | strStream >> name; 524 | strStream >> strand; 525 | strStream >> layer; 526 | strStream >> num; 527 | strStream.clear(); 528 | strStream.str(""); 529 | 530 | if(num > 255){ 531 | cerr<<"Warning: exon count > 255. The item ("< 255){ 535 | cerr<<"Warning: layer > 255. Too many RNA or too many overlapping genes. The item ("< tvec; 566 | tvec.push_back(tsim); 567 | allGeneLay.emplace(chrpos,tvec); 568 | } 569 | }else{ 570 | flag = false; 571 | } 572 | }else{ 573 | if(flag){ 574 | allGeneLay[chrpos].push_back(tsim); 575 | } 576 | } 577 | preChr = tchr; 578 | } 579 | gf.close(); 580 | } 581 | 582 | void indexNodeGene(string &rndFile,string &rndDxFile,string &geneFile,string &chrListFile,string &ovFile,string &gDxFile){ 583 | ofstream ov(ovFile.c_str()); 584 | if(! ov){ 585 | cerr<<"Error: file open failed. "< refChrMap; 595 | getDxRef(chrListFile,refChrMap); 596 | // 597 | map > allGeneLay; 598 | getGeneMap(geneFile,refChrMap,allGeneLay); 599 | //--------------------------------------------- 600 | ifstream rxfh(rndDxFile.c_str()); 601 | if(! rxfh){ 602 | cerr<<"Error: file open failed. "< allchr; 617 | allchr.reserve(nchr); 618 | map chrRanMap; 619 | for(int t = 0; t < nchr; ++t){ 620 | int tchr; 621 | ChrRange cRange; 622 | rxfh.read((char *)&tchr,intSize); 623 | rxfh.read((char *)&cRange,crSize); 624 | 625 | odx.write((char *)&tchr,intSize); 626 | odx.write((char *)&cRange,crSize); 627 | 628 | allchr.push_back(tchr); 629 | chrRanMap.emplace(tchr,cRange); 630 | } 631 | // 632 | int oneSize = sizeof(OneRange); 633 | long long ndByte = 0,ndUnit = sizeof(GeneNode); 634 | for(int xchr : allchr){ 635 | ChrRange cRange = chrRanMap[xchr]; 636 | // 637 | vector acrVec; 638 | acrVec.reserve(cRange.ranNum); 639 | int chrNdNum = 0; 640 | for(int k = 0; k < cRange.ranNum; ++k){ 641 | OneRange aRange; 642 | rxfh.read((char *)&aRange,oneSize); 643 | RanPos tpos = {aRange.ranStart,aRange.ranEnd}; 644 | acrVec.push_back(tpos); 645 | chrNdNum += aRange.ranNum; 646 | } 647 | // 648 | bool fdChr = false; 649 | map >::iterator it; 650 | it = allGeneLay.find(xchr); 651 | if(it != allGeneLay.end()){ 652 | fdChr = true; 653 | }else{ 654 | cout<<"Warning: reference chromosome in the 'chr.list' can't be found in the annotation file. "< ntNode; 657 | vector chrRnode; 658 | chrRnode.reserve(chrNdNum); 659 | for(int j = 0; j < chrNdNum; ++j){ 660 | int node,ndStart,ndEnd; 661 | rnfh.read((char *)&node,intSize); 662 | rnfh.read((char *)&ndStart,intSize); 663 | rnfh.read((char *)&ndEnd,intSize); 664 | // 665 | RNode trnode = {node,ndStart,ndEnd}; 666 | chrRnode.push_back(trnode); 667 | //ntNode.insert(node); 668 | } 669 | //-------------------------- 670 | int sPos = 0; 671 | size_t aPos = 0; 672 | if(fdChr){ 673 | map > gdCutMap; 674 | // 675 | for(size_t i = 0; i < (it->second).size(); ++i){ 676 | GeneNode gnode; 677 | bool fnd1 = false, fnd2 = false; 678 | gnode.node1 = -1; 679 | gnode.node2 = -1; 680 | // feature boundary 681 | for(int x = sPos; x < chrNdNum; ++x){ 682 | if((it->second)[i].start >= chrRnode[x].start){ 683 | if((it->second)[i].start <= chrRnode[x].pend){ 684 | gnode.node1 = chrRnode[x].node; 685 | gnode.reStart1 = (it->second)[i].start - chrRnode[x].start; 686 | fnd1 = true; 687 | // 688 | if((it->second)[i].type == 'G'){ 689 | sPos = x; 690 | } 691 | // 692 | if((it->second)[i].end <= chrRnode[x].pend){ 693 | gnode.node2 = chrRnode[x].node; 694 | gnode.reStart2 = (it->second)[i].end - chrRnode[x].start; 695 | fnd2 = true; 696 | break; 697 | } 698 | }else{ 699 | // check 700 | if((it->second)[i].type == 'G'){ 701 | sPos = x + 1; 702 | } 703 | } 704 | }else{ 705 | if((it->second)[i].end >= chrRnode[x].start){ 706 | if((it->second)[i].end <= chrRnode[x].pend){ 707 | gnode.node2 = chrRnode[x].node; 708 | gnode.reStart2 = (it->second)[i].end - chrRnode[x].start; 709 | fnd2 = true; 710 | break; 711 | } 712 | }else{ 713 | break; 714 | } 715 | 716 | } 717 | } 718 | // 719 | if(fnd1 || fnd2){ 720 | gnode.name[FIELDSIZE-1] = '\0'; 721 | strncpy(gnode.name,(it->second)[i].name.c_str(),FIELDSIZE-1); 722 | gnode.layer = (it->second)[i].layer; 723 | gnode.strand = (it->second)[i].strand; 724 | gnode.type = (it->second)[i].type; 725 | gnode.num = (it->second)[i].num; 726 | // assign feature to segmentation 727 | for(size_t k = aPos; k < acrVec.size(); ++k){ 728 | if((it->second)[i].start < acrVec[k].start){ 729 | if((it->second)[i].end >= acrVec[k].start){ 730 | if(gdCutMap.find(k) == gdCutMap.end()){ 731 | vector gvec; 732 | gvec.push_back(gnode); 733 | gdCutMap.emplace(k,gvec); 734 | }else{ 735 | gdCutMap[k].push_back(gnode); 736 | } 737 | }else{ 738 | break; 739 | } 740 | }else{ 741 | if((it->second)[i].start <= acrVec[k].pend){ 742 | if(gdCutMap.find(k) == gdCutMap.end()){ 743 | vector gvec; 744 | gvec.push_back(gnode); 745 | gdCutMap.emplace(k,gvec); 746 | }else{ 747 | gdCutMap[k].push_back(gnode); 748 | } 749 | }else{ 750 | if((it->second)[i].type == 'G'){ 751 | aPos = k + 1; 752 | } 753 | } 754 | } 755 | } 756 | } 757 | } 758 | // 759 | 760 | for(size_t k = 0; k < acrVec.size(); ++k){ 761 | int num = 0; 762 | if(gdCutMap.find(k) != gdCutMap.end()){ 763 | for(GeneNode &gd : gdCutMap[k]){ 764 | ov.write((char *)&gd,ndUnit); 765 | } 766 | num = gdCutMap[k].size(); 767 | } 768 | OneRange aRange; 769 | aRange.ranStart = acrVec[k].start; 770 | aRange.ranEnd = acrVec[k].pend; 771 | aRange.offByte = ndByte; 772 | aRange.ranNum = num; 773 | odx.write((char *)&aRange,oneSize); 774 | // 775 | ndByte += ndUnit * num; 776 | } 777 | }else{ 778 | for(size_t k = 0; k < acrVec.size(); ++k){ 779 | int num = 0; 780 | OneRange aRange; 781 | aRange.ranStart = acrVec[k].start; 782 | aRange.ranEnd = acrVec[k].pend; 783 | aRange.offByte = ndByte; 784 | aRange.ranNum = num; 785 | odx.write((char *)&aRange,oneSize); 786 | } 787 | } 788 | } 789 | 790 | rxfh.close(); 791 | rnfh.close(); 792 | ov.close(); 793 | odx.close(); 794 | } 795 | 796 | //outDir 797 | void dxRefNodeGene(int rnaMax,char *inGFF,char *chrMapFile,string &upDir){ 798 | 799 | string geneFile = upDir + "/simplify.gene"; 800 | string rndFile = upDir + "/node.ref.bw"; 801 | string rndDxFile = upDir + "/node.ref.bdx"; 802 | 803 | string ovFile = upDir + "/gene.ref.bw"; 804 | string gDxFile = upDir + "/gene.ref.bdx"; 805 | 806 | string chrListFile = upDir + "/chr.list"; 807 | 808 | //if(access(geneFile.c_str(),F_OK) != 0){ 809 | reduceGFF(rnaMax,inGFF,chrMapFile,geneFile); 810 | //} 811 | indexNodeGene(rndFile,rndDxFile,geneFile,chrListFile,ovFile,gDxFile); 812 | } 813 | 814 | // Chr start end feature strand 815 | void addRef_usage(){ 816 | cout<<"Usage: GraphAnno addRef --inGFF --chrTrans --upDir "< Input GFF file."< Input file containing two columns separated by whitespaces for each line (chromosome/contig name in GFF file and chromosome/contig name in Graph file)." 820 | "This file is used to make the chromosome/contig names in GFF file matching that in graph file." 821 | "If chromosome name transformation is not needed this file can be ignored."< Maximum number of RNA isoforms per gene to display. By default: 20."< 'upload' directory which including files generated by 'gfa2view' or 'vrpg_preprocess.py'."<