├── python
│   ├── msa_4d
│   │   ├── test_data
│   │   │   └── test
│   │   └── readme.md
│   └── zscore
│       ├── zscore.py
│       └── zscore2.py
├── deal_fasta
│   ├── fa2phy
│   │   ├── readme
│   │   ├── fa2phy.py
│   │   ├── fasta2phylip.pl
│   │   └── fa2phy.v2.py
│   ├── fasta_cut
│   │   └── readme
│   ├── agp2fa
│   │   ├── readme
│   │   ├── agp2fa.pl
│   │   └── ragtag_agp2fa.py
│   ├── split_fast_pilon
│   │   └── README
│   ├── filter_err_fasta
│   │   ├── find_err_dna.py
│   │   └── find_err_pep.py
│   ├── six_frame_translate
│   │   └── translate_seq.py
│   └── rename
│       └── rename.fa.py
├── deal_gff
│   ├── fish
│   │   └── readme
│   ├── rm_overlap
│   │   └── readme
│   ├── gff.simple
│   │   ├── readme
│   │   ├── gff.simple.pl
│   │   └── EVMtoBGI.py
│   ├── find_overlap
│   │   └── readme
│   ├── gene_rename
│   │   └── change.name.pl
│   ├── pick_longest_gene
│   │   ├── fix_mRNA_coordinate.pl
│   │   ├── fix_phase.py
│   │   ├── deal.sh
│   │   ├── pick_longest_ncbi.pl
│   │   └── toBGI.py
│   └── agp2gff
│       └── agp2gff.py
├── Comparative_genomics
│   ├── kaks
│   │   ├── blast.sh
│   │   ├── collinearity_kaks.sh
│   │   ├── genelist_kaks.sh
│   │   └── go_kaks.sh
│   ├── short_Peptide
│   │   ├── readme.md
│   │   └── short_Peptide_predict
│   ├── deal_tree_nwk
│   │   ├── step9.sh
│   │   ├── step7.reserve_species.txt
│   │   ├── step8.raw.tree.format_9.nwk
│   │   ├── step5.all.leaves.txt
│   │   ├── step8.raw.tree.format_6.nwk
│   │   ├── step8.raw.tree.format_8.nwk
│   │   ├── step1.sh
│   │   ├── step6.raw.tree.delete_Homo.nwk
│   │   ├── step8.raw.tree.format_4.nwk
│   │   ├── raw.tree.deal.nwk
│   │   ├── raw.tree.nwk
│   │   ├── step2.sh
│   │   ├── step5.sh
│   │   ├── step8.raw.tree.format_0.nwk
│   │   ├── step8.raw.tree.format_1.nwk
│   │   ├── step8.raw.tree.format_2.nwk
│   │   ├── step8.raw.tree.format_5.nwk
│   │   ├── step8.raw.tree.format_7.nwk
│   │   ├── raw.tree.reroot.Homo.nwk
│   │   ├── step6.old_new.change.txt
│   │   ├── step6.raw.tree.rename_Homo.nwk
│   │   ├── step8.sh
│   │   ├── step2.raw.tree.reroot.Homo.nwk
│   │   ├── step3.raw.tree.sort.decreasing.nwk
│   │   ├── step3.raw.tree.sort.increasing.nwk
│   │   ├── step4.raw.tree.cladogram_transform.nwk
│   │   ├── step8.raw.tree.format_3.nwk
│   │   ├── step4.sh
│   │   ├── step7.delete_Homo.txt
│   │   ├── raw.tree.nwk.pdf
│   │   ├── step1.txt
│   │   ├── step2.txt
│   │   ├── step3.decreasing.txt
│   │   ├── step3.increasing.txt
│   │   ├── step6.rename_Homo.txt
│   │   ├── step7.sh
│   │   ├── step6.sh
│   │   ├── readme.md
│   │   ├── step3.sh
│   │   └── draw.r
│   ├── get_gene_infomation
│   │   ├── get.sh
│   │   └── deal.sh
│   ├── gene_family
│   │   ├── pipline.png
│   │   ├── 05_phylogenetic.sh
│   │   ├── 01_blastp.sh
│   │   ├── 02_hmm.sh
│   │   ├── 04_final_gene_family.sh
│   │   ├── 03_miniprot.sh
│   │   └── 06_rna_seq.sh
│   ├── Dotplot_two_genomes
│   │   ├── hcs_fcs.paf.png
│   │   └── dotplot.sh
│   ├── Signal_peptide
│   │   ├── Razor.sh
│   │   └── DeepSig.sh
│   ├── gene_family_cluster
│   │   ├── sonicparanoid.sh
│   │   ├── orthofinder.sh
│   │   ├── sonicparanoid2.sh
│   │   └── broccoli.sh
│   ├── blast
│   │   ├── blast-costom-outformat.sh
│   │   ├── reciprocal_best_hits.sh
│   │   └── diamond_rbh.R
│   ├── gene_cluster
│   │   └── Galeon.sh
│   └── Domain_predict
│       └── rpsblast.sh
├── genome
│   ├── pseudogenes
│   │   ├── readme.md
│   │   ├── step2.sh
│   │   └── step1.sh
│   ├── Anno_RNA
│   │   ├── minimap2
│   │   │   ├── readme.md
│   │   │   ├── minimap2.sh
│   │   │   ├── step3.sh
│   │   │   └── step2.sh
│   │   └── GMAP
│   │       ├── index.sh
│   │       └── map.sh
│   ├── Hic
│   │   ├── rfy_hic2 Pipeline.pdf
│   │   ├── haphic
│   │   │   ├── split_haphic_step3.sh
│   │   │   ├── haphic_juicer_post.sh
│   │   │   ├── split_haphic_step4.sh
│   │   │   ├── split_haphic_step1.sh
│   │   │   ├── split_haphic_step2.sh
│   │   │   ├── bwa.sh
│   │   │   ├── split_haphic_step0.sh
│   │   │   ├── haphic.sh
│   │   │   ├── re_draw.sh
│   │   │   └── draw.sh
│   │   ├── yahs
│   │   │   ├── step2_juicer_post.sh
│   │   │   └── step1_ass.sh
│   │   └── all_hic.sh
│   ├── assess
│   │   ├── TeloExplorer.sh
│   │   ├── omark.sh
│   │   ├── busco-5.5.sh
│   │   ├── CRAQ.sh
│   │   ├── GCI_pb.sh
│   │   ├── compleasm.sh
│   │   └── LAI.sh
│   ├── Anno_EGAPx
│   │   ├── local2.yaml
│   │   ├── egapx.03.1.sh
│   │   ├── fix_mRNA_coordinate.pl
│   │   ├── fix_phase.py
│   │   ├── toBGI.py
│   │   └── deal_egapx.sh
│   ├── puerge_halp
│   │   ├── kmerdup
│   │   │   ├── readme.md
│   │   │   ├── bowtie2_demo.sh
│   │   │   ├── step4_refilter.sh
│   │   │   ├── miniprot.sh
│   │   │   ├── step3_filter.sh
│   │   │   └── step1_prepare.sh
│   │   ├── purge_haplotigs
│   │   │   ├── step2_custom_set.sh
│   │   │   └── step1_map.sh
│   │   └── purge_dups.sh
│   ├── TE
│   │   ├── TEsorter.sh
│   │   ├── NeuralTE.sh
│   │   ├── DeepTE.sh
│   │   └── HiTEv3.2.sh
│   ├── Anno_homology
│   │   ├── GeMoMa
│   │   │   └── GeMoMa.sh
│   │   ├── Spaln
│   │   │   └── Spaln.sh
│   │   ├── gth
│   │   │   └── gth.sh
│   │   └── miniprot
│   │       └── miniprot.sh
│   ├── quick_merge_genome
│   │   └── quickmerge.sh
│   ├── Mit_genome
│   │   ├── mitoz.sh
│   │   ├── fa2gb.py
│   │   └── check_species_by_mit_pep.sh
│   ├── Anno_integrate
│   │   ├── alignAssembly.config
│   │   ├── evm_auto.sh
│   │   ├── annotCompare.config
│   │   └── pasa.sh
│   ├── Anno_EviAnn
│   │   └── EviAnn.sh
│   ├── Segmental_duplication
│   │   ├── biser.sh
│   │   └── biser_split.sh
│   ├── Telomere
│   │   └── Telomere_tidk.sh
│   ├── Anno_denovo
│   │   ├── galba.sh
│   │   └── helixer.sh
│   ├── evaluate_orf_cds
│   │   └── evaluate_orf_cds.sh
│   ├── noncoding
│   │   └── noncoding_predict.sh
│   ├── ragtag
│   │   ├── ragtag.sh
│   │   └── filter.pl
│   ├── evaluate_genome_size
│   │   └── evaluate_genome_size.sh
│   ├── Genome_error_correction
│   │   └── Pilon&racon.sh
│   └── relernn
│       └── All.prediction.sh
├── transcriptome
│   ├── coding_potential_calculator
│   │   └── readme.md
│   ├── full_length_transcriptome
│   │   └── flair_analyze_NCBI_SRA_full_length_transcriptome.sh
│   └── Enrich
│       ├── make_Orgdb.sh
│       ├── AnnotationForge_20250117.R
│       ├── eggnog-2.1.9.sh
│       └── enrich.r
├── picture
│   ├── heatscatter
│   │   ├── example.tsv
│   │   └── heatscatter.r
│   ├── heatscatter2
│   │   ├── example.tsv
│   │   └── heatscatter.r
│   ├── DensityHeatmap
│   │   ├── huoli.r
│   │   └── huoli2.r
│   ├── QQplot
│   │   └── QQplot.R
│   ├── line
│   │   └── line.R
│   ├── Manhattan
│   │   ├── qqman_qq_mhd.r
│   │   └── Manhattan.R
│   ├── Normality.Test2
│   │   └── Normality.Test2.R
│   ├── box2
│   │   └── box.R
│   ├── genome_Circos
│   │   ├── ticks.conf
│   │   ├── circos.conf
│   │   └── fast_Circos.sh
│   ├── loess_fit
│   │   └── loess_fit.R
│   ├── syri_plotsv
│   │   ├── base.demo.config
│   │   └── syri_plotsv.sh
│   ├── synteny_circos
│   │   ├── simpletolink.py
│   │   └── circos_sys.sh
│   ├── box
│   │   └── box.R
│   └── GC_depth
│       └── depth_gc.r
├── other
│   ├── filter_fasta_non-ATCGN_characters.pl
│   ├── outlier2.py
│   ├── outlier.py
│   ├── check_pid_info.sh
│   └── count_directory_num_size.sh
└── readme.md

/python/msa_4d/test_data/test:
--------------------------------------------------------------------------------
test
--------------------------------------------------------------------------------
/deal_fasta/fa2phy/readme:
--------------------------------------------------------------------------------
Thanks to Wenjie Deng
--------------------------------------------------------------------------------
/deal_gff/fish/readme:
--------------------------------------------------------------------------------
Thanks to fanw@genomics.org.cn
--------------------------------------------------------------------------------
/deal_fasta/fasta_cut/readme:
--------------------------------------------------------------------------------
Thanks to fanw@genomics.org.cn
--------------------------------------------------------------------------------
/Comparative_genomics/kaks/blast.sh:
--------------------------------------------------------------------------------
../blast/blast-costom-outformat.sh
--------------------------------------------------------------------------------
/Comparative_genomics/short_Peptide/readme.md:
--------------------------------------------------------------------------------
GPU or CPU is ok
--------------------------------------------------------------------------------
/deal_gff/rm_overlap/readme:
--------------------------------------------------------------------------------
Thanks to lijianwen@genomics.org.cn
--------------------------------------------------------------------------------
/Comparative_genomics/deal_tree_nwk/step9.sh:
--------------------------------------------------------------------------------
./draw.r raw.tree.nwk raw.tree.nwk
--------------------------------------------------------------------------------
/deal_gff/gff.simple/readme:
--------------------------------------------------------------------------------
Thanks to hankai@genomics.cn
Thanks to songyue@genomics.cn
--------------------------------------------------------------------------------
/genome/pseudogenes/readme.md:
--------------------------------------------------------------------------------
Forked from https://github.com/kelkar/Discover_pseudogenes and modified.
--------------------------------------------------------------------------------
/transcriptome/coding_potential_calculator/readme.md:
--------------------------------------------------------------------------------
https://github.com/gao-lab/CPC2_standalone
--------------------------------------------------------------------------------
/Comparative_genomics/deal_tree_nwk/step7.reserve_species.txt:
--------------------------------------------------------------------------------
Pongo
Macaca
Ateles
Galago
--------------------------------------------------------------------------------
/Comparative_genomics/deal_tree_nwk/step8.raw.tree.format_9.nwk:
--------------------------------------------------------------------------------
((((Homo,Pongo),Macaca),Ateles),Galago);
--------------------------------------------------------------------------------
/Comparative_genomics/deal_tree_nwk/step5.all.leaves.txt:
--------------------------------------------------------------------------------
Homo
Pongo
Macaca
Ateles
Galago
--------------------------------------------------------------------------------
/Comparative_genomics/deal_tree_nwk/step8.raw.tree.format_6.nwk:
--------------------------------------------------------------------------------
((((Homo,Pongo):0.28,Macaca):0.13,Ateles):0.38,Galago);
--------------------------------------------------------------------------------
/genome/Anno_RNA/minimap2/readme.md:
--------------------------------------------------------------------------------
sam2gff.pl from https://github.com/gpertea/gscripts/blob/master/sam2gff.pl
--------------------------------------------------------------------------------
/Comparative_genomics/deal_tree_nwk/step8.raw.tree.format_8.nwk:
--------------------------------------------------------------------------------
((((Homo,Pongo)NoName,Macaca)NoName,Ateles)NoName,Galago);
--------------------------------------------------------------------------------
/Comparative_genomics/deal_tree_nwk/step1.sh:
--------------------------------------------------------------------------------
## print the tree shape on the terminal
tree_deal.py -s raw.tree.nwk > step1.txt
--------------------------------------------------------------------------------
/Comparative_genomics/deal_tree_nwk/step6.raw.tree.delete_Homo.nwk:
--------------------------------------------------------------------------------
(((Macaca:0.49,Pongo:0.21)1:0.13,Ateles:0.62)1:0.38,Galago:1);
--------------------------------------------------------------------------------
/Comparative_genomics/deal_tree_nwk/step8.raw.tree.format_4.nwk:
--------------------------------------------------------------------------------
((((Homo:0.21,Pongo:0.21),Macaca:0.49),Ateles:0.62),Galago:1);
--------------------------------------------------------------------------------
/Comparative_genomics/get_gene_infomation/get.sh:
--------------------------------------------------------------------------------
## NCBI datasets CLI
datasets summary gene accession --report product ${i}
--------------------------------------------------------------------------------
/Comparative_genomics/deal_tree_nwk/raw.tree.deal.nwk:
--------------------------------------------------------------------------------
((((Homo:0.21,Pongo:0.21)1:0.28,Macaca:0.49)1:0.13,Ateles:0.62)1:0.38,Galago:1);
--------------------------------------------------------------------------------
/Comparative_genomics/deal_tree_nwk/raw.tree.nwk:
--------------------------------------------------------------------------------
((((Homo:0.21,Pongo:0.21):0.28,Macaca:0.49):0.13,Ateles:0.62):0.38,Galago:1.00);
--------------------------------------------------------------------------------
/Comparative_genomics/deal_tree_nwk/step2.sh:
--------------------------------------------------------------------------------
## reroot the tree
tree_deal.py -r -o Homo raw.tree.nwk > step2.raw.tree.reroot.Homo.nwk
--------------------------------------------------------------------------------
/Comparative_genomics/deal_tree_nwk/step5.sh:
--------------------------------------------------------------------------------
## get all terminal leaves (tip names)
tree_deal.py -t raw.tree.nwk > step5.all.leaves.txt
--------------------------------------------------------------------------------
/Comparative_genomics/deal_tree_nwk/step8.raw.tree.format_0.nwk:
--------------------------------------------------------------------------------
((((Homo:0.21,Pongo:0.21)1:0.28,Macaca:0.49)1:0.13,Ateles:0.62)1:0.38,Galago:1);
--------------------------------------------------------------------------------
/Comparative_genomics/deal_tree_nwk/step8.raw.tree.format_1.nwk:
--------------------------------------------------------------------------------
((((Homo:0.21,Pongo:0.21):0.28,Macaca:0.49):0.13,Ateles:0.62):0.38,Galago:1);
--------------------------------------------------------------------------------
/Comparative_genomics/deal_tree_nwk/step8.raw.tree.format_2.nwk:
--------------------------------------------------------------------------------
((((Homo:0.21,Pongo:0.21)1:0.28,Macaca:0.49)1:0.13,Ateles:0.62)1:0.38,Galago:1);
--------------------------------------------------------------------------------
/Comparative_genomics/deal_tree_nwk/step8.raw.tree.format_5.nwk:
--------------------------------------------------------------------------------
((((Homo:0.21,Pongo:0.21):0.28,Macaca:0.49):0.13,Ateles:0.62):0.38,Galago:1);
--------------------------------------------------------------------------------
/Comparative_genomics/deal_tree_nwk/step8.raw.tree.format_7.nwk:
--------------------------------------------------------------------------------
((((Homo:0.21,Pongo:0.21)NoName,Macaca:0.49)NoName,Ateles:0.62)NoName,Galago:1);
--------------------------------------------------------------------------------
/deal_gff/find_overlap/readme:
--------------------------------------------------------------------------------
Thanks to fanw@genomics.org.cn
Thanks to huangqf@genomics.org.cn
Thanks to qiufeng@genomics.org.cn
--------------------------------------------------------------------------------
/genome/Hic/rfy_hic2 Pipeline.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Biols0208/Self-use-bioinformatics-scripts/HEAD/genome/Hic/rfy_hic2 Pipeline.pdf
--------------------------------------------------------------------------------
/Comparative_genomics/deal_tree_nwk/raw.tree.reroot.Homo.nwk:
--------------------------------------------------------------------------------
(Homo:0.105,(Pongo:0.21,(Macaca:0.49,(Ateles:0.62,Galago:1.38)1:0.13)1:0.28)1:0.105);
--------------------------------------------------------------------------------
/Comparative_genomics/deal_tree_nwk/step6.old_new.change.txt:
--------------------------------------------------------------------------------
Homo	ren_1a
Pongo	Pongo
Macaca	Macaca
Ateles	Ateles
Galago	Galago
--------------------------------------------------------------------------------
/Comparative_genomics/deal_tree_nwk/step6.raw.tree.rename_Homo.nwk:
--------------------------------------------------------------------------------
((((ren_1a:0.21,Pongo:0.21)1:0.28,Macaca:0.49)1:0.13,Ateles:0.62)1:0.38,Galago:1);
--------------------------------------------------------------------------------
/Comparative_genomics/deal_tree_nwk/step8.sh:
--------------------------------------------------------------------------------
for i in {0..9}; do tree_deal.py --ouf ${i} raw.tree.nwk > step8.raw.tree.format_${i}.nwk; done
--------------------------------------------------------------------------------
/Comparative_genomics/deal_tree_nwk/step2.raw.tree.reroot.Homo.nwk:
--------------------------------------------------------------------------------
(Homo:0.105,(Pongo:0.21,(Macaca:0.49,(Ateles:0.62,Galago:1.38)1:0.13)1:0.28)1:0.105);
--------------------------------------------------------------------------------
/Comparative_genomics/deal_tree_nwk/step3.raw.tree.sort.decreasing.nwk:
--------------------------------------------------------------------------------
(Galago:1,(Ateles:0.62,(Macaca:0.49,(Homo:0.21,Pongo:0.21)1:0.28)1:0.13)1:0.38);
--------------------------------------------------------------------------------
/Comparative_genomics/deal_tree_nwk/step3.raw.tree.sort.increasing.nwk:
--------------------------------------------------------------------------------
((((Pongo:0.21,Homo:0.21)1:0.28,Macaca:0.49)1:0.13,Ateles:0.62)1:0.38,Galago:1);
--------------------------------------------------------------------------------
/Comparative_genomics/deal_tree_nwk/step4.raw.tree.cladogram_transform.nwk:
--------------------------------------------------------------------------------
((((Homo:0.21,Pongo:0.21)1:0.28,Macaca:0.49)1:0.13,Ateles:0.62)1:0.38,Galago:1);
--------------------------------------------------------------------------------
/Comparative_genomics/deal_tree_nwk/step8.raw.tree.format_3.nwk:
--------------------------------------------------------------------------------
((((Homo:0.21,Pongo:0.21)NoName:0.28,Macaca:0.49)NoName:0.13,Ateles:0.62)NoName:0.38,Galago:1);
--------------------------------------------------------------------------------
/Comparative_genomics/deal_tree_nwk/step4.sh:
--------------------------------------------------------------------------------
## transform branches by the cladogram method
tree_deal.py -c raw.tree.nwk > step4.raw.tree.cladogram_transform.nwk
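
## optional sanity check (a sketch, not part of the original step): the
## cladogram transform only rescales branch lengths, so the ASCII shape
## printed by "tree_deal.py -s" should match step1.txt; plain GNU diff assumed.
#tree_deal.py -s step4.raw.tree.cladogram_transform.nwk > step4.shape.txt
#diff step1.txt step4.shape.txt && echo "topology unchanged"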
--------------------------------------------------------------------------------
/Comparative_genomics/gene_family/pipline.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Biols0208/Self-use-bioinformatics-scripts/HEAD/Comparative_genomics/gene_family/pipline.png
--------------------------------------------------------------------------------
/Comparative_genomics/deal_tree_nwk/step7.delete_Homo.txt:
--------------------------------------------------------------------------------

         /-Macaca
      /-|
   /-|   \-Pongo
  |  |
--|   \-Ateles
  |
   \-Galago
--------------------------------------------------------------------------------
/picture/heatscatter/example.tsv:
--------------------------------------------------------------------------------
gene	trans
1083	270
318	330
3573	3867
570	570
855	786
537	528
9540	9540
954	738
717	606
--------------------------------------------------------------------------------
/picture/heatscatter2/example.tsv:
--------------------------------------------------------------------------------
gene	trans
1083	270
318	330
3573	3867
570	570
855	786
537	528
9540	9540
954	738
717	606
--------------------------------------------------------------------------------
/Comparative_genomics/deal_tree_nwk/raw.tree.nwk.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Biols0208/Self-use-bioinformatics-scripts/HEAD/Comparative_genomics/deal_tree_nwk/raw.tree.nwk.pdf
--------------------------------------------------------------------------------
/deal_fasta/agp2fa/readme:
--------------------------------------------------------------------------------
Thanks to Sen Wang, wangsen1993@163.com
https://github.com/malonge/RagTag
https://mp.weixin.qq.com/s/QXDCZz88e6ubl4YgZKcfWQ
--------------------------------------------------------------------------------
/Comparative_genomics/Dotplot_two_genomes/hcs_fcs.paf.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Biols0208/Self-use-bioinformatics-scripts/HEAD/Comparative_genomics/Dotplot_two_genomes/hcs_fcs.paf.png
--------------------------------------------------------------------------------
/genome/Anno_RNA/GMAP/index.sh:
--------------------------------------------------------------------------------
mkdir reference

reference=$PWD/reference
species=HCS
genome=HCS.fa

/gmap/bin/gmap_build -D $reference -d ${species}_reference $genome
--------------------------------------------------------------------------------
/genome/assess/TeloExplorer.sh:
--------------------------------------------------------------------------------

genome=Juicer.FINAL.fa
type_use=animal
prefix=Telo

quartet.py TeloExplorer -i ${genome} -c ${type_use} -p ${prefix}
mv tmp ${prefix}_detail
--------------------------------------------------------------------------------
/genome/Anno_EGAPx/local2.yaml:
--------------------------------------------------------------------------------
genome: /test/data/genome.fa
taxid: 34787
annotation_provider: ABCD
annotation_name_prefix: test
locus_tag_prefix: test
# proteins: /test/data/train.pep
--------------------------------------------------------------------------------
/Comparative_genomics/deal_tree_nwk/step1.txt:
--------------------------------------------------------------------------------

            /-Homo
         /-|
      /-|   \-Pongo
     |  |
   /-|   \-Macaca
  |  |
--|   \-Ateles
  |
   \-Galago
--------------------------------------------------------------------------------
/Comparative_genomics/deal_tree_nwk/step2.txt:
--------------------------------------------------------------------------------

   /-Homo
--|
  |   /-Pongo
   \-|
     |   /-Macaca
      \-|
        |   /-Ateles
         \-|
            \-Galago
--------------------------------------------------------------------------------
/Comparative_genomics/deal_tree_nwk/step3.decreasing.txt:
--------------------------------------------------------------------------------

   /-Galago
--|
  |   /-Ateles
   \-|
     |   /-Macaca
      \-|
        |   /-Homo
         \-|
            \-Pongo
--------------------------------------------------------------------------------
/Comparative_genomics/deal_tree_nwk/step3.increasing.txt:
--------------------------------------------------------------------------------

            /-Pongo
         /-|
      /-|   \-Homo
     |  |
   /-|   \-Macaca
  |  |
--|   \-Ateles
  |
   \-Galago
--------------------------------------------------------------------------------
/Comparative_genomics/deal_tree_nwk/step6.rename_Homo.txt:
--------------------------------------------------------------------------------

            /-ren_1a
         /-|
      /-|   \-Pongo
     |  |
   /-|   \-Macaca
  |  |
--|   \-Ateles
  |
   \-Galago
--------------------------------------------------------------------------------
/Comparative_genomics/deal_tree_nwk/step7.sh:
--------------------------------------------------------------------------------
## prune the tree
tree_deal.py --prune step7.reserve_species.txt raw.tree.nwk > step6.raw.tree.delete_Homo.nwk
tree_deal.py -s step6.raw.tree.delete_Homo.nwk > step7.delete_Homo.txt
--------------------------------------------------------------------------------
/picture/DensityHeatmap/huoli.r:
--------------------------------------------------------------------------------
args <- commandArgs(T)

library(LSD)
pdf(paste(args[1], ".pdf", sep = "", collapse = ""))
rawcount <- read.table(args[1], header = T, sep="\t")
heatscatter(rawcount[,1],rawcount[,2])
--------------------------------------------------------------------------------
/picture/heatscatter/heatscatter.r:
--------------------------------------------------------------------------------
library(LSD)
args <- commandArgs (T)

pdf(paste(args[1], ".pdf", sep = "", collapse = ""))
rawcount <- read.table(args[1], header = T, sep="\t")
heatscatter(rawcount[,1],rawcount[,2])
--------------------------------------------------------------------------------
/Comparative_genomics/deal_tree_nwk/step6.sh:
--------------------------------------------------------------------------------
## rename leaves; TAB delimiter
tree_deal.py --rename step6.old_new.change.txt raw.tree.nwk > step6.raw.tree.rename_Homo.nwk
tree_deal.py -s step6.raw.tree.rename_Homo.nwk > step6.rename_Homo.txt
--------------------------------------------------------------------------------
/genome/puerge_halp/kmerdup/readme.md:
--------------------------------------------------------------------------------
1. Use protein sequences from closely related species for quick homology annotation, to help decide the blacklist and whitelist
2. Combine this with the Hi-C heatmap for a comprehensive judgment when removing duplicates
--------------------------------------------------------------------------------
/genome/TE/TEsorter.sh:
--------------------------------------------------------------------------------
#!/bin/bash
source activate /01_soft/mamba/envs/TEsorter

input=confident_TE.cons_valid.fa
cpu=5

TEsorter ${input} -p ${cpu} -rule 70-50-50
mkdir TEsorter_out
mv ${input}.rexdb* TEsorter_out
--------------------------------------------------------------------------------
/deal_fasta/split_fast_pilon/README:
--------------------------------------------------------------------------------
## each split needs at least 5 processes
python3 pilon_pipeline.py \
    --genome your_genome.fasta \
    --reads1 reads_1.fq \
    --reads2 reads_2.fq \
    --parts N \
    --fix all \
    --output pilon_output
--------------------------------------------------------------------------------
/genome/Hic/haphic/split_haphic_step3.sh:
--------------------------------------------------------------------------------
### https://github.com/zengxiaofei/HapHiC

micromamba activate haphic

genome=corrected_asm.fa
cpu=80
nchrs=23

haphic sort ${genome} HT_links.pkl split_clms final_groups/group*.txt --processes ${cpu}
--------------------------------------------------------------------------------
/Comparative_genomics/Dotplot_two_genomes/dotplot.sh:
--------------------------------------------------------------------------------
minimap2 -t 40 -x asm5 hcs_chr-genome.fa fcs_chr-genome.fa > hcs_fcs.paf

### pafCoordsDotPlotly.R needs the R packages optparse, ggplot2 and plotly

pafCoordsDotPlotly.R -i hcs_fcs.paf -o hcs_fcs.paf -m 1000 -q 1000 -s -t -l -p 16
--------------------------------------------------------------------------------
/Comparative_genomics/deal_tree_nwk/readme.md:
--------------------------------------------------------------------------------
1. print the tree shape on the terminal
2. reroot the tree
3. sort the tree nodes (0 for decreasing, 1 for increasing)
4. transform branches by the cladogram method
5. get all terminal leaves
6. rename leaves; TAB delimiter
7. prune the tree
--------------------------------------------------------------------------------
/genome/Anno_homology/GeMoMa/GeMoMa.sh:
--------------------------------------------------------------------------------
jar=GeMoMa-1.9.jar

target_genome=ABCD.fa
pep=all.pep.cdhit.fa
threads=5
out=GeMoMa

java -Xms5G -Xmx10G -jar ${jar} CLI GeMoMaPipeline threads=${threads} AnnotationFinalizer.r=NO p=false o=true t=$target_genome outdir=$out s=pre-extracted c=$pep
--------------------------------------------------------------------------------
/genome/Hic/haphic/haphic_juicer_post.sh:
--------------------------------------------------------------------------------
edited_assembly=out_JBAT.review.assembly
out_final_assembly_prefix=Juicer
liftover_agp=out_JBAT.liftover.agp
contig_genome=YZ.keep.fa

/dellfsqd2/ST_OCEAN/USER/lichen2/00_software/yahs/juicer post -o ${out_final_assembly_prefix} ${edited_assembly} ${liftover_agp} ${contig_genome}
--------------------------------------------------------------------------------
/genome/Hic/yahs/step2_juicer_post.sh:
--------------------------------------------------------------------------------
edited_assembly=out_YZ.review.assembly
out_final_assembly_prefix=out_YZ.review.final
liftover_agp=out_YZ.liftover.agp
contig_genome=YZ.asm.hic.p_ctg.fasta

/dellfsqd2/ST_OCEAN/USER/lichen2/00_software/yahs/juicer post -o ${out_final_assembly_prefix} ${edited_assembly} ${liftover_agp} ${contig_genome}
--------------------------------------------------------------------------------
/Comparative_genomics/deal_tree_nwk/step3.sh:
--------------------------------------------------------------------------------
## sort the tree nodes
## 0 for decreasing, 1 for increasing

tree_deal.py -l 0 raw.tree.nwk > step3.raw.tree.sort.decreasing.nwk
tree_deal.py -l 1 raw.tree.nwk > step3.raw.tree.sort.increasing.nwk
tree_deal.py -s step3.raw.tree.sort.increasing.nwk > step3.increasing.txt
tree_deal.py -s step3.raw.tree.sort.decreasing.nwk > step3.decreasing.txt
--------------------------------------------------------------------------------
/genome/TE/NeuralTE.sh:
--------------------------------------------------------------------------------
#!/bin/bash
source activate /micromamba/envs/NeuralTE

input=TIR.fa
outdir=$PWD/out_NeuralTE
threads_num=10

python /01_soft/NeuralTE-master/src/Classifier.py \
    --data ${input} \
    --model_path /01_soft/NeuralTE-master/models/NeuralTE_model.h5 \
    --outdir ${outdir} \
    --use_gpu_num 0 \
    --is_plant 0 \
    --thread ${threads_num}
--------------------------------------------------------------------------------
/picture/heatscatter2/heatscatter.r:
--------------------------------------------------------------------------------
library(xbox)
args <- commandArgs (T)

rawcount <- read.table(args[1], header = T, sep="\t")
pdf(paste(args[1], ".pdf", sep = "", collapse = ""))

heatpoint(rawcount[,1],rawcount[,2]) -> dat_result
str(dat_result)

head(dat_result$plot.data)
data.frame(dat_result$cor.result)
str(dat_result$lm.result)
xplot(dat_result)
--------------------------------------------------------------------------------
/genome/quick_merge_genome/quickmerge.sh:
--------------------------------------------------------------------------------
## This merge strategy is most useful for merging genomes assembled from different data sets,
## so that all of the available data is put to use;
## if every assembly was built from the same data, the improvement will be marginal.

ref_genome=$1
qur_genome=$2
threads=$3

nucmer -t ${threads} -l 100 --mum -p nd ${ref_genome} ${qur_genome}
delta-filter -r -q -l 10000 nd.delta > nd.rq.delta
quickmerge -d nd.rq.delta -q ${qur_genome} -r ${ref_genome} -hco 5.0 -c 1.5 -l 1600000 -ml 10000 -p nd
--------------------------------------------------------------------------------
/genome/Hic/haphic/split_haphic_step4.sh:
--------------------------------------------------------------------------------
### https://github.com/zengxiaofei/HapHiC

micromamba activate haphic

raw_genome=out_correct.fa
haphic_corrected_genome=corrected_asm.fa
cpu=50
nchrs=23
gap_size=500
prefix=haphic

haphic build ${haphic_corrected_genome} ${raw_genome} HiC.filtered.bam final_tours/group*.tour --corrected_ctgs corrected_ctgs.txt --Ns ${gap_size} --prefix ${prefix}
--------------------------------------------------------------------------------
/Comparative_genomics/Signal_peptide/Razor.sh:
--------------------------------------------------------------------------------
## https://github.com/Gardner-BinfLab/Razor
#micromamba activate py36

source activate /home_micromamba/envs/py36

input_pep=ABCD.pep
output_txt=Razor
cpu=10
max_scan_length=80

[ -d libs ] || ln -s /01_soft/Razor/libs/

python /01_soft/Razor/razor.py --fastafile ${input_pep} --output ${output_txt} --maxscan ${max_scan_length} --ncores ${cpu}

rm libs
--------------------------------------------------------------------------------
/genome/Mit_genome/mitoz.sh:
--------------------------------------------------------------------------------
source activate /01_software/miniconda3/envs/mitozEnv

mitoz all \
    --outprefix qingyi \
    --thread_number 20 \
    --clade Chordata \
    --species_name "Choerodon_schoenleinii" \
    --workdir AUTOPL2111250159 \
    --fq1 ABCD_1.clean.fq.gz \
    --fq2 ABCD_2.clean.fq.gz \
    --fastq_read_length 150 \
    --data_size_for_mt_assembly 2 \
    --assembler megahit \
    --memory 50 \
    --requiring_taxa Chordata
--------------------------------------------------------------------------------
/deal_gff/gene_rename/change.name.pl:
--------------------------------------------------------------------------------
#!/usr/bin/perl -w
use strict;
@ARGV||die "Usage: perl $0 <gff> <prefix> > rename.gff\n";
my ($file,$name)=@ARGV;
open IN,shift;
my $num='00000';
while(<IN>){
    chomp;
    my @a=split;
    if($a[2] eq "mRNA"){
        $num++;
        $a[8]="ID=$name$num;";
        print join "\t",@a;print "\n";
    }
    else{
        $a[8]="Parent=$name$num;";
        print join "\t",@a;print "\n";
    }
}
close IN;
--------------------------------------------------------------------------------
/Comparative_genomics/gene_family/05_phylogenetic.sh:
--------------------------------------------------------------------------------
muscle -align ./1_identify_gene_family/04_final_gene_family/final_gene_protein -output ./2_phylogenetic/gene_protein.muscle
trimal -in ./2_phylogenetic/gene_protein.muscle -out ./2_phylogenetic/gene_protein.muscle.trimal -automated1
grep '^>' ./2_phylogenetic/gene_protein.muscle | sed 's/>//' > ./2_phylogenetic/gene_protein.muscle.name
iqtree2 -s ./2_phylogenetic/gene_protein.muscle.trimal -m MFP -nt AUTO -B 1000 > ./2_phylogenetic/iqtree
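
## optional follow-up (a sketch, not part of the original step): iqtree2 writes
## the ML tree to ./2_phylogenetic/gene_protein.muscle.trimal.treefile by
## default; it can be inspected with the deal_tree_nwk helper, assuming
## tree_deal.py is on $PATH.
#tree_deal.py -s ./2_phylogenetic/gene_protein.muscle.trimal.treefile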
--------------------------------------------------------------------------------
/genome/puerge_halp/kmerdup/bowtie2_demo.sh:
--------------------------------------------------------------------------------
export PATH="/01_soft/kmerDedup/:$PATH"

prefix=YZ
cpu=5
work_dir=$PWD/..
input_dir=${work_dir}/split
output_dir=${work_dir}/mapping

[ -d ${output_dir} ] || mkdir ${output_dir}
bowtie2 --very-sensitive -k 1000 --score-min L,-0.6,-0.2 --end-to-end --reorder -L 21 --rg-id ${prefix} --rg SM:${prefix} -p ${cpu} -f ${input_dir}/ABCD -x ${work_dir}/${prefix}.format | samtools view -@ ${cpu} -F 4 -bS - > ${output_dir}/ABCD.bam
--------------------------------------------------------------------------------
/genome/Hic/haphic/split_haphic_step1.sh:
--------------------------------------------------------------------------------
### https://github.com/zengxiaofei/HapHiC

micromamba activate haphic

genome=out_correct.fa
cpu=80
nchrs=23
inflation_step=0.1
min_inflation=1
max_inflation=1.5

haphic cluster --remove_concentrated_links --remove_allelic_links 2 --threads ${cpu} --correct_nrounds 2 --correct_resolution 250 ${genome} HiC.filtered.bam ${nchrs} --inflation_step ${inflation_step} --min_inflation ${min_inflation} --max_inflation ${max_inflation}
--------------------------------------------------------------------------------
/genome/Hic/haphic/split_haphic_step2.sh:
--------------------------------------------------------------------------------
### https://github.com/zengxiaofei/HapHiC

micromamba activate haphic

genome=corrected_asm.fa
cpu=80
nchrs=23
best_inflation=1.5
ambiguous_cutoff=0.4
remove_allelic_links=2

haphic reassign --ambiguous_cutoff ${ambiguous_cutoff} --remove_allelic_links ${remove_allelic_links} --nclusters ${nchrs} --threads ${cpu} ${genome} full_links.pkl inflation_${best_inflation}/mcl_inflation_${best_inflation}.clusters.txt paired_links.clm
--------------------------------------------------------------------------------
/genome/puerge_halp/purge_haplotigs/step2_custom_set.sh:
--------------------------------------------------------------------------------
#!/bin/bash
#https://blog.csdn.net/u012110870/article/details/100171429

micromamba activate purge_haplotigs

genome=YZ_CRAQ.fa
input=hifi_aln_sorted.bam.200.gencov
low_cut=0
mid_cut=90
high_cut=200
cpu=70

purge_haplotigs contigcov -i ${input} -o coverage_stats.csv -l ${low_cut} -m ${mid_cut} -h ${high_cut}
purge_haplotigs purge -g ${genome} -c coverage_stats.csv -b hifi_aln_sorted.bam -t $cpu -a 60 -v -d
--------------------------------------------------------------------------------
/picture/QQplot/QQplot.R:
--------------------------------------------------------------------------------
library(CMplot)
library(qqman)

args <- commandArgs (T)

results_log <- read.table(args[1], header=T)
p_value=results_log$P
z = qnorm(p_value/ 2)
lambda = round(median(z^2, na.rm = TRUE) / 0.454, 3)
lambda

pdf(args[2], width = 6, height = 6)

CMplot(results_log, plot.type = "q", threshold = 0.05, signal.cex=0.5, conf.int.col="grey", file="jpg", dpi=600, file.name=args[2], file.output=TRUE, verbose=F,cex=c(0.3,0.3))

dev.off()
--------------------------------------------------------------------------------
/picture/line/line.R:
--------------------------------------------------------------------------------
library(ggplot2)
args <- commandArgs (T)

# define the plotting function
plot_line_chart <- function(input_file, output_file, pos, data_column) {
  # read the data
  data <- read.table(input_file, header = TRUE)

  # draw the line chart; print() is needed so ggplot renders inside a function
  pdf(output_file, width = 15, height = 6)
  print(ggplot(data, aes_string(x = pos, y = data_column)) + geom_line() + theme_minimal())
  dev.off()
}

# call the function to draw the line chart
plot_line_chart(input_file = args[1], output_file = args[2], pos = args[3], data_column = args[4])
--------------------------------------------------------------------------------
/genome/puerge_halp/purge_haplotigs/step1_map.sh:
--------------------------------------------------------------------------------
#!/bin/bash

#source activate home_micromamba/envs/purge_haplotigs/

genome=YZ_CRAQ.fa
ccs_fa=yz.fasta.gz
cpu=70

minimap2 -ax map-hifi -t $cpu $genome $ccs_fa --secondary=no -o hifi_aln.sam
samtools faidx ${genome}
samtools view -@ $cpu -t ${genome}.fai -bS hifi_aln.sam -o hifi_aln.bam
samtools sort -@ $cpu hifi_aln.bam > hifi_aln_sorted.bam
rm hifi_aln.sam hifi_aln.bam

purge_haplotigs readhist -b hifi_aln_sorted.bam -g $genome -t ${cpu}
--------------------------------------------------------------------------------
/genome/Anno_homology/Spaln/Spaln.sh:
--------------------------------------------------------------------------------
genome=ABCD.fa
pep=all.pep.rmdup.fa
out=spaln.gff
cpu=10
db=$PWD/seqdb/genome
sif=/singularity_all/spaln3.sif

[ -d seqdb ] || mkdir seqdb
cp ${genome} seqdb/genome.gf

cd seqdb
/usr/bin/singularity run --bind $PWD/:$PWD/ ${sif} /spaln_data/bin/spaln -W -KP -g genome.gf
cd ../

/usr/bin/singularity run --bind $PWD/:$PWD/ ${sif} /spaln_data/bin/spaln -Q7 -LS -pw -S3 -O0 -pi -yE 10 -yL 30 -t ${cpu} -D ${db} ${pep} > ${out} 2> Log.spaln

rm -rf $PWD/seqdb
--------------------------------------------------------------------------------
/picture/Manhattan/qqman_qq_mhd.r:
--------------------------------------------------------------------------------
library(qqman)
args <- commandArgs(T)

results_log <- read.table(args[1], header=T)
p_value=results_log$P
z = qnorm(p_value/ 2)
lambda = round(median(z^2, na.rm = TRUE) / 0.454, 3)
lambda

jpeg(paste(args[2], ".jpeg", sep = "", collapse = ""))

qq(results_log$P, main = "Q-Q plot of GWAS p-values : log", xlim = c(0, 7), ylim = c(0, 12), pch = 18, col = "blue4", cex = 0.5, las = 1)
manhattan(results_log,chr="CHR",bp="BP",p="P",snp="SNP", main = "Manhattan plot")

dev.off()
--------------------------------------------------------------------------------
/Comparative_genomics/gene_family_cluster/sonicparanoid.sh:
--------------------------------------------------------------------------------
#!/bin/bash

source activate /micromamba/envs/sonicparanoid

cwd=$PWD
indir=./input
outdir=$PWD/result
cpu=35
prefix=fish51
inflation=2
MIN_BITSCORE=100

sonicparanoid -i $indir -o $outdir -p $prefix -t $cpu -m sensitive -I $inflation -op -bs $MIN_BITSCORE

#mv $outdir/runs/$prefix/* $outdir
#mv $outdir/ortholog_groups $outdir/$prefix

### remove tmp files !!!!!!!!!!!!!!!!!!!!!
#rm -rf $outdir/runs $outdir/orthologs_db $outdir/alignments
--------------------------------------------------------------------------------
/genome/Mit_genome/fa2gb.py:
--------------------------------------------------------------------------------
#!/01_software/miniconda3/bin/python3

import sys
import getopt
sys.path.append('/01_software/miniconda3/lib/python3.7/site-packages/')
from Bio import SeqIO

input_handle = open(sys.argv[1], "r")
output_handle = open(sys.argv[2], "w")

sequences = list(SeqIO.parse(input_handle, "fasta"))

# assign molecule type
for seq in sequences:
    seq.annotations['molecule_type'] = 'DNA'

count = SeqIO.write(sequences, output_handle, "genbank")

output_handle.close()
input_handle.close()
--------------------------------------------------------------------------------
/genome/Hic/haphic/bwa.sh:
--------------------------------------------------------------------------------
### https://github.com/zengxiaofei/HapHiC

micromamba activate haphic
export PATH="/01_soft/HapHiC/:$PATH"
export PATH="/01_soft/HapHiC/utils/:$PATH"

genome=$PWD/chr.fa
hic1=$PWD/hic_getreads.R1.fq.gz
hic2=$PWD/hic_getreads.R2.fq.gz
cpu=60

bwa index ${genome}
bwa mem -5SP -t ${cpu} ${genome} ${hic1} ${hic2} | samblaster | samtools view - -@ ${cpu} -S -h -b -F 3340 -o HiC.bam
filter_bam HiC.bam 1 --nm 3 --threads ${cpu} | samtools view - -b -@ ${cpu} -o HiC.filtered.bam

mock_agp_file.py ${genome} > ${genome}.agp
--------------------------------------------------------------------------------
/Comparative_genomics/Signal_peptide/DeepSig.sh:
--------------------------------------------------------------------------------
### https://github.com/BolognaBiocomp/deepsig
# micromamba activate DeepPeptide

source activate /home_micromamba/envs/DeepPeptide
export DEEPSIG_ROOT=/01_software/deepsig

input_pep=FBG.correct.gff.pep
output_txt=FBG.correct.gff.pep.deepsig
organism=euk ## euk, gramp, or gramn: the organism the sequences belong to
outfmt=gff3 ## json or gff3

/usr/bin/singularity exec --bind $PWD/:$PWD/ /01_soft/singularity_all/deepsig.sif deepsig.py -f ${input_pep} -o ${output_txt} -k ${organism} -m ${outfmt}
--------------------------------------------------------------------------------
/genome/Anno_integrate/alignAssembly.config:
--------------------------------------------------------------------------------

## templated variables to be replaced exist as <__var_name__>

# database settings
DATABASE=/workdir/new.db

#######################################################
# Parameters to specify to specific scripts in pipeline
# create a key = "script_name" + ":" + "parameter"
# assign a value as done above.

#script validate_alignments_in_db.dbi
validate_alignments_in_db.dbi:--MIN_PERCENT_ALIGNED=80
validate_alignments_in_db.dbi:--MIN_AVG_PER_ID=80

#script subcluster_builder.dbi
subcluster_builder.dbi:-m=50
--------------------------------------------------------------------------------
/genome/assess/omark.sh:
--------------------------------------------------------------------------------
#https://github.com/DessimozLab/OMArk
### Attention!!! ###
## before running OMArk, create a .etetoolkit directory in your home dir.

input=EFGH
outdir=omark_assess
database=/06_database/OMArk_database/LUCA.h5

[ -d ${outdir} ] || mkdir -p ${outdir}

## source activate /micromamba/envs/OMArk
# omamer -h
# omark -h

/01_soft/mambaforge/bin/micromamba run -n OMArk omamer search --db ${database} --query ${input} --out ${outdir}/${input}.omamer
/01_soft/mambaforge/bin/micromamba run -n OMArk omark -f ${outdir}/${input}.omamer -d ${database} -o ${outdir}
--------------------------------------------------------------------------------
/genome/Anno_homology/gth/gth.sh:
--------------------------------------------------------------------------------
genome=ABCD.fa
pep=pep.fa
outgff=GTH_predict.gff

gth -intermediate -gff3out -genomic ${genome} -protein ${pep} > ${outgff}
rm ${genome}.dna.* ${pep}.protein.* *md5

awk '$3=="gene" || $3=="exon"' GTH_predict.gff | sed 's/exon/CDS/g;s/gene/mRNA/g' | awk -F ";" '{print $1";"}' > GTH_predict.bgi.gff
grep -v "#" GTH_predict.bgi.gff | gffread -C -G -K -Q -Y -M --cset -d dup -H -V -P -N -Z - -g ${genome} -o gth.deal.gff
awk '$3=="mRNA" || $3=="CDS"' gth.deal.gff | awk -F ";" '{print $1";"}' > gth.deal.bgi.gff
Covert_for_evm.pl gth.deal.bgi.gff gth > gth.gff.forevm.gff3
--------------------------------------------------------------------------------
/genome/Anno_EviAnn/EviAnn.sh:
--------------------------------------------------------------------------------
#!/bin/bash

genome=Stichopus_variegatus.fa
trans=transcripts.fa
pep=proteins.faa
cpu=15

/01_software/EviAnn-2.0.2/bin/eviann.sh -t $cpu -g $genome -e $PWD/$trans -p $PWD/$pep --partial --debug -l

[ -d out_final ] || mkdir out_final
mv ${genome}.pseudo_label.gff out_final
mv ${genome}.transcripts.fasta out_final
mv ${genome}.proteins.fasta out_final
cp *.sh.o* out_final/log.txt

#rm ${genome}.* tissue* *stringtie*.sh *success broken* makeblastdb.out blastp2.out combine.out makeblastdb.sex2mex.out blastp5.out proteins.faa.uniq miniprot.err check_cds.out
--------------------------------------------------------------------------------
/genome/Hic/haphic/split_haphic_step0.sh:
--------------------------------------------------------------------------------
### https://github.com/zengxiaofei/HapHiC

micromamba activate haphic

genome=merge.hap.fa
cpu=70
nchrs=23

# HapHiC will ignore the parameter "nchrs", it can be any integer
haphic pipeline ${genome} HiC.filtered.bam ${nchrs} --quick_view

# Correct input contigs before a quick view
#haphic pipeline ${genome} HiC.filtered.bam ${nchrs} --quick_view --correct_nrounds 2

# Partition contigs into different haplotypes in quick view mode
# haphic pipeline ${genome} HiC.filtered.bam ${nchrs} --quick_view --gfa "XXX.hap1.p_ctg.gfa,XXX.hap2.p_ctg.gfa" --correct_nrounds 2
--------------------------------------------------------------------------------
/Comparative_genomics/short_Peptide/short_Peptide_predict:
--------------------------------------------------------------------------------
source activate /home_micromamba/envs/DeepPeptide

###
ln -s /01_soft/DeepPeptide/predictor/* ./

fasta=FBG.correct.gff.pep
outdir=DeepPeptide
batch_size=100 ## a bigger batch_size needs more memory

mkdir ${outdir}

python /01_soft/DeepPeptide/predictor/predict.py --fastafile ${fasta} --output_dir ${outdir} --esm esm2 --esm_pt /01_soft/DeepPeptide/predictor/checkpoints/esm2_t33_650M_UR50D.pt
#python /01_soft/DeepPeptide/predictor/predict.py --fastafile ${fasta} --output_dir ${outdir} --esm esm1b --esm_pt /01_soft/DeepPeptide/predictor/checkpoints/esm1b_t33_650M_UR50S.pt
--------------------------------------------------------------------------------
/genome/Segmental_duplication/biser.sh:
--------------------------------------------------------------------------------
#!/bin/bash
set +o posix
set -eo pipefail

source activate /01_software/mamba/envs/biser

genome=Strongylocentrotus_purpuratus.fa
output=Strongylocentrotus_purpuratus
cpu=1
tempdir=$PWD/temp

samtools faidx hardmask.fa
biser --gc-heap 2G --hard --threads ${cpu} --output ${output}.SD.bed --keep-contigs --keep-temp --temp ${tempdir} hardmask.fa
#biser --resume ${tempdir}/biser.XXXXXXXX(change here) --gc-heap 2G --hard --threads ${cpu} --output ${output} --keep-contigs --keep-temp --no-decomposition --temp ${tempdir} hardmask.fa
rm hardmask.fa HiTE.gff HiTE.bed hardmask.fa.fai
--------------------------------------------------------------------------------
/picture/Normality.Test2/Normality.Test2.R:
--------------------------------------------------------------------------------
library("ggpubr")
args <- commandArgs (T)

indata <- read.table(args[1], header=T, sep="\t", quote="")

# define the test function
Normality_test <- function(input_file, type) {

  indata <- read.table(input_file, header=T, sep="\t", quote="")

  #pdf(output_file, width = 7, height = 6)

  #ggdensity(indata, x= type, main = "Density")

  #ggqqplot(indata, x= type) ### color = group , palette = c("#00AFBB", "#E7B800"))

  expression_to_eval <- paste0("indata$", type)
  shapiro.test(eval(parse(text = expression_to_eval)))

}

Normality_test(input_file = args[1], type = args[2])
--------------------------------------------------------------------------------
/genome/Anno_RNA/minimap2/minimap2.sh:
--------------------------------------------------------------------------------

genome=HCS_chr.fa
trans=trans.fa
cpu=40

minimap2 -ax splice:hq -uf ${genome} ${trans} -t ${cpu} > aln.sam
sam2gff.pl aln.sam > aln.sam.gff3
awk '{print $0";"}' aln.sam.gff3 | sed 's/exon/CDS/g' > aln.sam.gff3.tmp

##
export PERL5LIB=/01_software/TransDecoder-TransDecoder-v5.5.0/PerlLib:$PERL5LIB
sed 's/exon/CDS/g' aln.sam.gff3 > tmp.gff
gff2gtf_v2.pl tmp.gff aln.sam.gtf
rm tmp.gff
perl /01_software/TransDecoder-TransDecoder-v5.5.0/util/gtf_genome_to_cdna_fasta.pl aln.sam.gtf ${genome} > transcripts.fasta
/01_software/TransDecoder-TransDecoder-v5.5.0/TransDecoder.LongOrfs -m 50 -t transcripts.fasta
--------------------------------------------------------------------------------
/genome/Telomere/Telomere_tidk.sh:
--------------------------------------------------------------------------------
genome=Stichopus_variegatus.fa
out_dir=test1

## download tidk_database.csv to ~/.local/share/tidk
#/software/miniconda3/bin/tidk build

[ -d ${out_dir} ] || mkdir ${out_dir}

## explore
/software/miniconda3/bin/tidk explore --distance 0.05 --minimum 5 --maximum 7 -t 30 ${genome} > ${out_dir}/candicate_TR_unit.tsv

## find
for i in $(cat ${out_dir}/candicate_TR_unit.tsv | sed '1d' | awk '{print $1}'); do /software/miniconda3/bin/tidk search -s ${i} -o find_${i} -d ${out_dir} -w 50000 ${genome} ; done

## check
for i in $(ls ${out_dir}/*_windows.tsv) ; do awk '$3>50 || $4>50' ${i} > ${i}.check.tsv ; done
--------------------------------------------------------------------------------
/picture/Manhattan/Manhattan.R:
--------------------------------------------------------------------------------
library(CMplot)
args <- commandArgs (T)

indata <- read.table(args[1], header=T, sep="\t", quote="")
#pdf(args[2], width = 15, height = 6)

CMplot(indata, plot.type="m",
       # col=c("grey30","grey60"),
       LOG10=T,
       ylim=c(0,10),
       cex=c(0.0001,0.0001),
       threshold=c(as.numeric(args[3]),as.numeric(args[4])),
       threshold.lty=c(1,2), threshold.lwd=c(1,1), threshold.col=c("black","grey"),
       amplify=T, chr.den.col=NULL,
       signal.col=c("red","green"), signal.cex=c(0.5,0.5), signal.pch=c(19,19),
       file="jpg", dpi=600, file.name=args[2], file.output=TRUE, verbose=F)
--------------------------------------------------------------------------------
/Comparative_genomics/gene_family_cluster/orthofinder.sh:
--------------------------------------------------------------------------------
cwd=$PWD
indir=$cwd/00_data
cpu=10
prefix=Fish
### Options: blast, mmseqs, blast_gz, diamond
software=diamond

/usr/bin/singularity exec orthofinder.sif orthofinder -f $indir -t $cpu -a $cpu -S $software -n $prefix -p $cwd

## add species / remove species
# /usr/bin/singularity exec orthofinder.sif orthofinder -f $indir -t $cpu -a $cpu -S $software -n $prefix -p $cwd --fewer-files -X

### remove tmp files !!!!!!!!!!
#rm -rf $indir/Results_*/WorkingDirectory $indir/Results_*/Orthologues*/*_Trees $indir/Results_*/Orthologues*/WorkingDirectory

#mv $indir/Results_* $cwd/orthofinder_${prefix}
--------------------------------------------------------------------------------
/genome/Anno_denovo/galba.sh:
--------------------------------------------------------------------------------

Species=HCS
genome=HCS_chr.softmask.fa
pep=train.pep
thread=70

/usr/bin/singularity exec -B $PWD/:$PWD/ galba.sif galba.pl --species=${Species} --genome=${genome} --prot_seq=${pep} --AUGUSTUS_CONFIG_PATH=/0_soft/augustus/Augustus/config --threads ${thread}

## gtf2gff.pl is assumed to read the GALBA gtf on stdin and write gff3 on stdout
gtf2gff.pl <GALBA/galba.gtf >GALBA/galba.deal.gff3
change.name.pl GALBA/galba.deal.gff3 GALBA_ > GALBA/galba.bgi.gff
gffread GALBA/galba.bgi.gff -g ${genome} -x GALBA/galba.bgi.gff.cds -y GALBA/galba.bgi.gff.pep
--------------------------------------------------------------------------------
/python/zscore/zscore.py:
--------------------------------------------------------------------------------
## usage: python zscore.py input output

import sys
import pandas
from pandas import read_excel
from sklearn import preprocessing

input_file = sys.argv[1]
output_file = sys.argv[2]

dataset = pandas.read_csv(input_file, index_col=0)
# dataframe to array
values = dataset.values
# define data type
values = values.astype(float)
# stat zscore
data = preprocessing.scale(values)
# array to dataframe
df = pandas.DataFrame(data)
# name columns
df.columns = dataset.columns
# name rows
df.index = dataset.index
# output file ps: three decimal places
df.to_csv(output_file, float_format='%.3f', sep='\t')
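
## note (a sketch, not part of the original script): the same per-column
## z-score can be computed with pandas alone; ddof=0 matches sklearn's
## preprocessing.scale, which uses the population standard deviation.
# df2 = (dataset - dataset.mean()) / dataset.std(ddof=0)
# df2.to_csv(output_file, float_format='%.3f', sep='\t')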
--------------------------------------------------------------------------------
/picture/box2/box.R:
--------------------------------------------------------------------------------
args <- commandArgs (T)
library(rlang)
library(ggstatsplot)

indata <- read.table(args[1], header=T, sep="\t", quote="")
pdf(args[2], width = 6, height = 6)

ggbetweenstats(
  data = indata,
  x = !!sym(args[3]),
  y = !!sym(args[4]),
  plot.type = "boxviolin", ### "boxviolin" "box" "violin"
  p.adjust.method = "bonferroni", ### "bonferroni" "fdr" "BH" "hochberg"
  pairwise.comparisons = TRUE, ### "TRUE" "FALSE"
  pairwise.display = "significant", ### "significant" "non-significant" "everything"
  type = "nonparametric" ### "nonparametric" "parametric" "robust" "bayes"
)
--------------------------------------------------------------------------------
/Comparative_genomics/gene_family/01_blastp.sh:
--------------------------------------------------------------------------------
gffread ./00_data/C_albu.bgi.gff -g ./00_data/C_albu.fa -x ./00_data/cds.fa
seqkit translate --trim --clean ./00_data/cds.fa > ./00_data/pep.fa
makeblastdb -in ./00_data/TLR_protein.fasta -dbtype prot -out blastdb
blastp -query ./00_data/pep.fa -db blastdb -evalue 1e-05 -seg yes -outfmt '7 qseqid qstart qend sseqid sstart send qlen slen length pident evalue' -num_threads 5 > ./1_identify_gene_family/01_blastp/blastp.txt
grep -v '#' ./1_identify_gene_family/01_blastp/blastp.txt | awk '$9/$8 > 0.8 || $9/$7 > 0.8' > ./1_identify_gene_family/01_blastp/blastp.txt.filt
awk '{print $1}' ./1_identify_gene_family/01_blastp/blastp.txt.filt | sort -u > ./1_identify_gene_family/01_blastp/blast.filt.id
--------------------------------------------------------------------------------
/genome/evaluate_orf_cds/evaluate_orf_cds.sh:
--------------------------------------------------------------------------------
input=helixer.bgi.gff.cds

## CPC2
# https://github.com/gao-lab/CPC2_standalone
/01_software/CPC/CPC2_standalone-1.0.1/bin/CPC2.py -i ${input} -o ${input}.CPC.txt

## PSAURON
# https://github.com/salzberg-lab/PSAURON
# Note: internal stop codons are ignored by PSAURON. A high PSAURON score does not guarantee a sequence contains a valid ORF. This is intended behavior, as alternate frame scores are used by default to boost the power of the model.
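
## (sketch) pulling the IDs that CPC2 flags as noncoding; the label lives in
## the last column of the CPC2 table -- position assumed from CPC2 v1.0.1
## output, so check it against your version first.
# awk 'NR>1 && $NF=="noncoding"{print $1}' ${input}.CPC.txt > ${input}.noncoding.id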
10 | export MAMBA_EXE='/01_soft/mamba/bin/micromamba' 11 | export MAMBA_ROOT_PREFIX='/home_micromamba' 12 | micromamba activate 13 | source activate /home_micromamba/envs/psauron 14 | 15 | psauron -i ${input} -o ${input}.PSAURON.csv 16 | -------------------------------------------------------------------------------- /genome/puerge_halp/kmerdup/step4_refilter.sh: -------------------------------------------------------------------------------- 1 | export PATH="/01_soft/kmerDedup/:$PATH" 2 | 3 | prefix=YZ 4 | mer_len=19 5 | work_dir=$PWD 6 | bam_dir=${work_dir}/mapping 7 | cpu=30 8 | out_dir=kmerdedup_refilter 9 | mpr=0.3 10 | whitelist=white.list ## whitelist to keep 11 | blacklist=black.list ## blacklist to remove 12 | 13 | ## -mpr max duplication percentage [0.3] 14 | ## -mcv min k-mer coverage(%) [30] 15 | ## -mode <1/2> 1:ratio only; 2:ratio * cov [2] 16 | 17 | perl /01_soft/kmerDedup/kmerDedup/kmerDedup.pl -k ${prefix} -mpr ${mpr} -mcv 30 -kmer ${mer_len} -o ${out_dir} -f ${prefix}.format.fa -dum ${prefix}.dump.hash ./ -cov ${prefix}.cov.stat -s samtools -t ${cpu} -mode 2 -wtl ${whitelist} -bll ${blacklist} 18 | -------------------------------------------------------------------------------- /genome/noncoding/noncoding_predict.sh: -------------------------------------------------------------------------------- 1 | genome=genome.fa 2 | threads=48 3 | 4 | ## miscRNA from egapx 5 | 6 | ## barrnap for rRNA 7 | /01_software/barrnap-0.9/bin/barrnap --kingdom euk --threads ${threads} ${genome} > rRNA_barrnap.gff 8 | 9 | ## aragorn for tRNA 10 | #/soft/aragorn -mt -a -t -m -i ${genome} -o tRNA.tsv 11 | 12 | ## tRNAscan_SE for tRNA 13 | export PATH=$PATH:/noncoding_soft/tRNAscan-SE-2.0/bin 14 | export PERL5LIB=/noncoding_soft/tRNAscan-SE-2.0/lib:$PERL5LIB 15 | export PATH=$PATH:/noncoding_soft/tRNAscan-SE-2.0 16 | 17 | tRNAscan_SE_config=/noncoding_soft/tRNAscan-SE-2.0/tRNAscan-SE.conf 18 | ## for vert vertebrate 19 | tRNAscan-SE -q -o tRNA.tsv -m statistics.summary -f tRNA_secondary.structures -M vert -c ${tRNAscan_SE_config} ${genome} 20 | -------------------------------------------------------------------------------- /transcriptome/full_length_transcriptome/flair_analyze_NCBI_SRA_full_length_transcriptome.sh: -------------------------------------------------------------------------------- 1 | ## https://flair.readthedocs.io/en/latest/other_ways.html 2 | 3 | genome=fcs.fa 4 | gtf=fcs.gtf 5 | fq=SRR17056084.fastq 6 | cpu=50 7 | 8 | 9 | ### module numbers: align=1, correct=2, collapse=3, collapse-range=3.5, quantify=4, diffExp=5, diffSplice=6 10 | flair align -r ${fq} -g ${genome} -t ${cpu} --junction_bed out_junction.bed 11 | 12 | flair correct -q flair.aligned.bed -g ${genome} --threads ${cpu} -f ${gtf} 13 | 14 | [ -d temp_flair ] || mkdir temp_flair 15 | flair collapse -r ${fq} -q flair_all_corrected.bed -g ${genome} -o flair.output --temp_dir temp_flair -t ${cpu} --keep_intermediate -f ${gtf} --no_gtf_end_adjustment --max_ends 5 --check_splice --generate_map 16 | -------------------------------------------------------------------------------- /genome/puerge_halp/kmerdup/miniprot.sh: -------------------------------------------------------------------------------- 1 | genome=curated.fasta 2 | pep=YZ.pep 3 | prefix=EFGH_ ### prefix for IDs in GFF3 4 | cpu=70 5 | max_intron_size=20k ### max intron size [200k] 6 | splice_model=1 ### splice model: 2=mammal, 1=general, 0=none (see Detail) [1] 7 | weight_of_splice_penalty=1 ### weight of splice penalty; 0 to ignore splice signals [1] 8 | 9 | 
miniprot -G ${max_intron_size} -j ${splice_model} -t ${cpu} --gff -P ${prefix} -C ${weight_of_splice_penalty} ${genome} ${pep} --outs=0.99 > EFGH.gff 10 | 11 | grep -A 1 "##PAF" EFGH.gff | awk '$1!~/--/' | paste - - | awk '($5-$4)/$3>0.8' | awk '{print $2"\t"$3"\t"$4"\t"$5"\t"$(NF-10)"\t"$(NF-6)"\t"$(NF-5)"\t"$(NF-4)"\t"$(NF-2)}' | sed 's/;/\t/g;s/ID=//g' > filter.info 12 | -------------------------------------------------------------------------------- /genome/ragtag/ragtag.sh: -------------------------------------------------------------------------------- 1 | source activate /01_software/conda/envs/ragtag/ 2 | 3 | # scaffold with multiple references/maps 4 | ragtag.py scaffold -t 48 -o out_1 chr.rename.fa polish_contig.fa 5 | ragtag.py scaffold -t 20 -o out_2 GCF_963930695.1_fLabBer1.1_genomic.fna polish_contig.fa 6 | ragtag.py scaffold -t 20 -o out_3 GCF_963584025.1_fLabMix1.1_genomic.fna polish_contig.fa 7 | ragtag.py scaffold -t 20 -o out_4 GCF_009762535.1_fNotCel1.pri_genomic.fna polish_contig.fa 8 | 9 | ragtag.py merge out_correct.fa out_*/*.agp -o Merge1 10 | ragtag.py merge out_correct.fa out_1/*.agp out_2/*.agp out_4/*.agp out_6/*.agp -o Merge2 11 | ragtag.py merge --gap-func max out_correct.fa out_1/*.agp out_2/*.agp out_4/*.agp out_6/*.agp -o Merge3 12 | 13 | ## remove other characters 14 | # perl filter.pl ragtag.scaffold.fasta > chr.fa 15 | -------------------------------------------------------------------------------- /picture/DensityHeatmap/huoli2.r: -------------------------------------------------------------------------------- 1 | args <- commandArgs(T) 2 | 3 | library(MASS) 4 | library(LSD) 5 | library(ggplot2) 6 | library(ggthemes) 7 | 8 | #pdf(paste(args[1], ".huoli2.pdf", sep = "", collapse = "")) 9 | png(paste(args[1], ".huoli2.png", sep = "", collapse = "")) 10 | 11 | DF <- read.table(args[1], header = F, sep="\t") 12 | 13 | x <- DF$V1 14 | y <- DF$V2 15 | dens <- kde2d(x,y) 16 | 17 | gr <- data.frame(with(dens, expand.grid(x,y)), as.vector(dens$z)) 18 | names(gr) <- c("xgr", "ygr", "zgr") 19 | 20 | mod <- loess(zgr~xgr*ygr, data=gr) 21 | 22 | DF$pointdens <- predict(mod, newdata=data.frame(xgr=x, ygr=y)) 23 | 24 | p <- ggplot(DF, aes(x=x,y=y, color=pointdens)) + theme_base() + scale_colour_gradientn(colours = colorpalette('heat', 5)) 25 | p <- p + geom_point() 26 | p <- p + ggtitle('heatscatter') 27 | p 28 | -------------------------------------------------------------------------------- /genome/TE/DeepTE.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | source activate /01_soft/mamba/envs/DeepTE 3 | 4 | input_fasta=$PWD/unknow.fa 5 | species=M # P or M or F or O. P:Plants, M:Metazoans, F:Fungi, and O: Others. 
6 | tmp_dir=DeepTE_TmpDir
7 | output_dir=DeepTE_OutDir
8 | Model_dir=/01_soft/DeepTE/Model/Metazoans_model # Metazoans_model Fungi_model Others_model UNS_model
9 | script=/01_soft/DeepTE/DeepTE.py
10 | probability_threshold=0.6
11 | 
12 | mkdir ${tmp_dir}
13 | mkdir ${output_dir}
14 | 
15 | python3 /01_soft/DeepTE/DeepTE_domain.py -d ${tmp_dir} -o ${output_dir} -i ${input_fasta} -s /01_soft/DeepTE/supfile_dir --hmmscan /01_soft/bin/hmmscan
16 | 
17 | python3 ${script} -d ${tmp_dir} -o ${output_dir} -i ${input_fasta} -sp ${species} -m_dir ${Model_dir} -prop_thr ${probability_threshold} -modify ${output_dir}/opt_te_domain_pattern.txt
18 | 
-------------------------------------------------------------------------------- /genome/ragtag/filter.pl: --------------------------------------------------------------------------------
1 | use strict;
2 | use warnings;
3 | use Bio::SeqIO;
4 | 
5 | ## --help
6 | # check input
7 | my $infile = $ARGV[0] or die "Usage: perl script.pl <input_file> [output_file]\n";
8 | my $outfile = $ARGV[1];
9 | 
10 | # open input
11 | my $in = Bio::SeqIO->new(-file => $infile, -format => 'Fasta');
12 | 
13 | # output check
14 | my $out;
15 | if ($outfile) {
16 |     $out = Bio::SeqIO->new(-file => ">$outfile", -format => 'Fasta');
17 | } else {
18 |     $out = Bio::SeqIO->new(-fh => \*STDOUT, -format => 'Fasta');
19 | }
20 | 
21 | while (my $seq = $in->next_seq()) {
22 |     my $sequence = $seq->seq;
23 | 
24 |     # remove non-ATCGN
25 |     $sequence =~ s/[^ATCGNatcgn]//g;
26 | 
27 |     $seq->seq($sequence);
28 |     $out->write_seq($seq);
29 | }
30 | 
31 | $in->close();
32 | $out->close() if $outfile;
33 | 
-------------------------------------------------------------------------------- /genome/puerge_halp/purge_dups.sh: --------------------------------------------------------------------------------
1 | Use purge_dups to remove redundancy (haplotigs) from a genome assembly
2 | 1. Software installation
3 | ----------------------------------------------
4 | git clone https://github.com/dfguan/purge_dups.git
5 | cd purge_dups/src && make
6 | -----------------------------------------------
7 | 2. Run the pipeline
8 | -----------------------------------------------
9 | # Step 1
10 | minimap2 -t 5 -xasm5 -DP assembly.fa pacbio.fa.gz | gzip -c - > pb_aln.paf.gz
11 | pbcstat pb_aln.paf.gz
12 | calcuts PB.stat > cutoffs 2> calcults.log
13 | # Step 2
14 | split_fa assembly.fa > asm.split
15 | minimap2 -t 5 -xasm5 -DP asm.split asm.split | gzip -c - > assembly.fasta.split.self.paf.gz
16 | # Step 3
17 | purge_dups -2 -T cutoffs -c PB.base.cov assembly.fasta.split.self.paf.gz > dups.bed 2> purge_dups.log
18 | # Step 4
19 | get_seqs dups.bed assembly.fa
20 | -----------------------------------------------
21 | 
-------------------------------------------------------------------------------- /genome/Segmental_duplication/biser_split.sh: --------------------------------------------------------------------------------
1 | #!/bin/bash
2 | set +o posix
3 | set -eo pipefail
4 | 
5 | source activate /01_software/mamba/envs/biser
6 | 
7 | genome=Strongylocentrotus_purpuratus.fa
8 | output=Strongylocentrotus_purpuratus
9 | cpu=2
10 | tempdir=$PWD/temp
11 | 
12 | [ -d ${tempdir} ] || mkdir ${tempdir}
13 | 
14 | ## scaffold to contig and mask
15 | bedtools maskfasta -fi ${genome} -fo mask.fa -bed HiTE.bed -mc "_"
16 | scaffold_to_contig.pl mask.fa > mask_contig.fa
17 | scaffold_to_contig.pl -out contig_coor mask.fa > softmask_contig.fa.coor
18 | seqkit replace -p "_" -r "N" -s mask_contig.fa > hardmask.fa
19 | rm mask.fa mask_contig.fa
20 | 
21 | samtools faidx hardmask.fa
22 | biser --gc-heap 2G --hard --threads ${cpu} --output ${output}.SD.bed --keep-contigs --keep-temp --temp ${tempdir} hardmask.fa
23 | rm hardmask.fa HiTE.gff HiTE.bed hardmask.fa.fai
24 | 
-------------------------------------------------------------------------------- /genome/Hic/yahs/step1_ass.sh: --------------------------------------------------------------------------------
1 | #!/bin/bash
2 | set +o posix
3 | 
4 | genome=YZ_CRAQ.fa
5 | hic_fq1=YZ_clean.R1.fq.gz
6 | hic_fq2=YZ_clean.R2.fq.gz
7 | cpu=48
8 | 
9 | ### chromap map
10 | samtools faidx ${genome}
11 | chromap -i -r ${genome} -o genome.index
12 | chromap --preset hic -r ${genome} -x genome.index --remove-pcr-duplicates -1 ${hic_fq1} -2 ${hic_fq2} --SAM -o aligned.sam -t ${cpu}
13 | samtools view -bh aligned.sam | samtools sort -@ ${cpu} -n > aligned.bam
14 | rm aligned.sam
15 | 
16 | ### yahs
17 | /00_software/yahs/yahs ${genome} aligned.bam
18 | 
19 | ### juicer
20 | /00_software/yahs/juicer pre -a -o out_Juicer yahs.out.bin yahs.out_scaffolds_final.agp ${genome}.fai > Log.txt 2>&1
21 | 
22 | juicer=/00_software/juicer_tools_1.19.02.jar
23 | /01_soft/mambaforge/bin/java -Xmx36G -jar $juicer pre out_Juicer.txt out_Juicer.hic <(cat Log.txt | grep "PRE_C_SIZE" | awk '{print $2" "$3}')
24 | 
-------------------------------------------------------------------------------- /other/filter_fasta_non-ATCGN_characters.pl: --------------------------------------------------------------------------------
1 | use strict;
2 | use warnings;
3 | use Bio::SeqIO;
4 | 
5 | # check that an input file was provided
6 | my $infile = $ARGV[0] or die "Usage: perl script.pl <input_file> [output_file]\n";
7 | my $outfile = $ARGV[1]; # optional output file
8 | 
9 | # open the input file
10 | my $in = Bio::SeqIO->new(-file => $infile, -format => 'Fasta');
11 | 
12 | # write to the given output file, otherwise to standard output
13 | my $out;
14 | if ($outfile) {
15 |     $out = Bio::SeqIO->new(-file => ">$outfile", -format => 'Fasta');
16 | } else {
17 |     $out = Bio::SeqIO->new(-fh => \*STDOUT, -format => 'Fasta');
18 | }
19 | 
20 | # iterate over every sequence
21 | while (my $seq = $in->next_seq()) {
22 |     my $sequence = $seq->seq;
23 | 
24 |     # remove non-ATCGN characters
25 |     $sequence =~ s/[^ATCGNatcgn]//g;
26 | 
27 |     # update the sequence and write it to the file or standard output
28 |     $seq->seq($sequence);
29 |     $out->write_seq($seq);
30 | }
31 | 
32 | # close the input handle
33 | $in->close();
34 | $out->close() if $outfile; # close the output handle only when writing to a file
35 | 
-------------------------------------------------------------------------------- /deal_fasta/filter_err_fasta/find_err_dna.py: --------------------------------------------------------------------------------
1 | 
2 | import sys
3 | from Bio import SeqIO
4 | 
5 | def check_dna_sequences(fasta_file):
6 |     # Define legal DNA/RNA character sets
7 |     valid_chars = set("ATCGatcg")
8 | 
9 |     with open(fasta_file, "r") as file:
10 |         for record in SeqIO.parse(file, "fasta"):
11 |             sequence = str(record.seq)
12 |             invalid_chars = [char for char in sequence if char not in valid_chars]
13 |             if invalid_chars:
14 |                 print(f">{record.id}")
15 |                 print(f"{sequence}")
16 |                 # print(f"Illegal_character: {set(invalid_chars)}")
17 |                 # print("-" * 40)
18 | 
19 | if __name__ == "__main__":
20 |     if len(sys.argv) != 2:
21 |         print("Usage: python script.py <fasta_file>")
22 |         sys.exit(1)
23 | 
24 |     fasta_file_path = sys.argv[1]
25 |     check_dna_sequences(fasta_file_path)
26 | 
-------------------------------------------------------------------------------- /Comparative_genomics/blast/blast-costom-outformat.sh: --------------------------------------------------------------------------------
1 | #!/bin/bash
2 | set +o posix
3 | set -eo pipefail
4 | #export BLAST_USAGE_REPORT=false
5 | 
6 | if [[ $# == '0' ]]; then
"usage: bidui target query fasta_type soft output_name cpu" 8 | echo "note: fasta_type: nucl; prot" 9 | echo "note: soft: blastn; blastp" 10 | echo "example: bidui db.fa test.fa nucl blastn result.txt 10" 11 | echo " bidui db.fa test.fa prot blastp result.txt 10" 12 | exit 1 13 | fi 14 | 15 | target=$1 16 | query=$2 17 | fasta_type=$3 18 | soft=$4 19 | output_name=$5 20 | cpu=$6 21 | soft_path=ncbi-blast-2.13.0+/bin 22 | 23 | ${soft_path}/makeblastdb -in ${target} -dbtype ${fasta_type} -out ./blastdb/${target} -parse_seqids 24 | ${soft_path}/${soft} -task ${soft} -db ./blastdb/${target} -query ${query} -out ${output_name} -outfmt '7 qseqid qstart qend sseqid sstart send qlen slen length pident evalue' -num_threads ${cpu} 25 | -------------------------------------------------------------------------------- /picture/genome_Circos/ticks.conf: -------------------------------------------------------------------------------- 1 | # 是否显示 ticks 2 | show_ticks = yes 3 | # 是否显示 ticks 的 lables 4 | show_tick_labels = yes 5 | ## 设定 ticks 6 | 7 | ## ticks 的设置 8 | # 设定 ticks 的位置 9 | radius = 1r 10 | # 设定 ticks 的颜色 11 | color = black 12 | # 设定 ticks 的厚度 13 | thickness = 2p 14 | # 设定 ticks' label 的值的计算。将该刻度对应位置的值 * multiplier 得到能展示到圈图上的 label 值。 15 | multiplier = 1e-6 16 | # label 值的格式化方法。%d 表示结果为整数;%f 结果为浮点数; %.1f 结果为小数点后保留1位; %.2f 结果为小数点后保留2位。 17 | format = %d 18 | ## 以下设置了 2 个 ticks,前者是小刻度,后者是大刻度。 19 | 20 | # 设置每个刻度代表的长度。若其单位为u,则必须要设置chromosomes_units参数。比如设置chromosomes_units=1000000,则如下5u表示每个刻度代表5M长度 21 | spacing = 1u 22 | # 设置 tick 的长度 23 | size = 5p 24 | 25 | 26 | spacing = 5u 27 | size = 15p 28 | # 由于设置的是大刻度,以下用于设置展示 ticks' label。 29 | show_label = yes 30 | # 设置 ticks' label 的字体大小 31 | label_size = 20p 32 | # 设置 ticks' label 离 ticks 的距离 33 | label_offset = 10p 34 | format = %d 35 | 36 | 37 | -------------------------------------------------------------------------------- /deal_fasta/filter_err_fasta/find_err_pep.py: -------------------------------------------------------------------------------- 1 | 2 | import sys 3 | from Bio import SeqIO 4 | 5 | def check_protein_sequences(fasta_file): 6 | # Define legal protein character sets 7 | valid_chars = set("ACDEFGHIKLMNPQRSTVWYacdefghiklmnpqrstvwy") 8 | 9 | with open(fasta_file, "r") as file: 10 | for record in SeqIO.parse(file, "fasta"): 11 | sequence = str(record.seq) 12 | invalid_chars = [char for char in sequence if char not in valid_chars] 13 | if invalid_chars: 14 | print(f">{record.id}") 15 | print(f"{sequence}") 16 | # print(f"Illegal_character: {set(invalid_chars)}") 17 | # print("-" * 40) 18 | 19 | if __name__ == "__main__": 20 | if len(sys.argv) != 2: 21 | print("Usage: python script.py ") 22 | sys.exit(1) 23 | 24 | fasta_file_path = sys.argv[1] 25 | check_protein_sequences(fasta_file_path) 26 | -------------------------------------------------------------------------------- /readme.md: -------------------------------------------------------------------------------- 1 | The script sources are complex, and we thank the authors of some scripts in the corresponding directories. 2 | If there are any omissions, please inform us in the issue. 3 | 4 | The main creators include: 5 | 6 | 7 | 8 | 9 | 10 | 11 | 12 | 13 | 14 | 15 | 16 | 17 | 18 | 19 | 20 | 21 | 22 | 23 | 24 | 25 | 26 | 27 | 28 | 29 | 30 | 31 | 32 | 33 | 34 | 35 | 36 | 37 | 38 | 39 | 40 | 41 | 42 | 43 |
| Chinese Name | Name | E-mail | Blog / Other |
| ------------ | ---- | ------ | ------------ |
| 李硕 | Biols | shiyeyishang@outlook.com | https://bioinformls.com https://www.researchgate.net/profile/Shuo_Li37 |
| 十七岁天菜少年 | | 18137879861@163.com | |
| 韩圣磊 | hanshenglei | 17860712133@163.com | |
| 越来越好 | better | | |
-------------------------------------------------------------------------------- /picture/loess_fit/loess_fit.R: --------------------------------------------------------------------------------
1 | # load the required packages
2 | library(ggplot2)
3 | 
4 | args <- commandArgs(T)
5 | 
6 | # input and output paths
7 | input_file_path <- args[1]
8 | output_file_path <- args[2]
9 | 
10 | # read the data file
11 | data <- read.table(input_file_path, header = FALSE, col.names = c("Dist", "Mean_r"))
12 | 
13 | # locally weighted polynomial regression (LOESS)
14 | loess_fit <- loess(Mean_r ~ Dist, data = data, span = 0.75)
15 | 
16 | # fitted values
17 | data$Loess_Fit <- predict(loess_fit, newdata = data$Dist)
18 | 
19 | # save the LOESS fit to a CSV file
20 | write.csv(data, output_file_path, row.names = FALSE)
21 | 
22 | # draw the plot
23 | plot <- ggplot(data, aes(x = Dist, y = Mean_r)) +
24 | geom_point(size = 1) + # raw data points
25 | geom_line(aes(y = Loess_Fit), color = 'blue', linewidth = 1) + # LOESS smooth
26 | labs(title = "LD Decay with LOESS",
27 | x = "Distance",
28 | y = expression(Mean~r)) +
29 | theme_minimal()
30 | 
31 | # save the plot as a PNG file
32 | ggsave(args[3], plot = plot, width = 10, height = 6)
33 | 
-------------------------------------------------------------------------------- /genome/Mit_genome/check_species_by_mit_pep.sh: --------------------------------------------------------------------------------
1 | input_mit_pep=test.pep
2 | cpu=48
3 | evalue=0.0001
4 | matrix=BLOSUM62
5 | query_cover=30
6 | subject_cover=30
7 | database=mitochondrion
8 | 
9 | ## https://ftp.ncbi.nlm.nih.gov/refseq/release/mitochondrion/
10 | ## https://ftp.ncbi.nlm.nih.gov/pub/taxonomy/
11 | ## https://ftp.ncbi.nlm.nih.gov/pub/taxonomy/accession2taxid/
12 | 
13 | ### index
14 | # diamond makedb --in mitochondrion.1.protein.faa -d mitochondrion --taxonnodes nodes.dmp --taxonnames names.dmp --taxonmap prot.accession2taxid.gz
15 | 
16 | diamond blastp -d ${database} -q ${input_mit_pep} -o ${input_mit_pep}.guess_sp.txt -p ${cpu} --ultra-sensitive --evalue ${evalue} --quiet --matrix ${matrix} --masking 1 --comp-based-stats 1 --max-hsps 0 --query-cover ${query_cover} --subject-cover ${subject_cover} --outfmt 6 qseqid sseqid evalue pident staxids sscinames sphylums
17 | 
18 | sort -k 1,1 -k 4nr,4 ${input_mit_pep}.guess_sp.txt | awk '!a[$1]++{print $0}' > ${input_mit_pep}.guess_sp2.txt
19 | 
-------------------------------------------------------------------------------- /Comparative_genomics/gene_family/02_hmm.sh: --------------------------------------------------------------------------------
1 | hmmsearch --domtblout ./1_identify_gene_family/02_hmm/TIR.hmm.out ./00_data/TIR.hmm ./00_data/pep.fa
2 | grep -v '#' ./1_identify_gene_family/02_hmm/TIR.hmm.out | awk '($7 + 0) < 1e-05'|cut -f1 -d ' ' |sort -u > ./1_identify_gene_family/02_hmm/TIR.hmm_gene.id
3 | hmmsearch --domtblout ./1_identify_gene_family/02_hmm/LRR.hmm.out ./00_data/LRR.hmm ./00_data/pep.fa
4 | grep -v '#' ./1_identify_gene_family/02_hmm/LRR.hmm.out | awk '($7 + 0) < 1e-05'|cut -f1 -d ' ' |sort -u > ./1_identify_gene_family/02_hmm/LRR.hmm_gene.id
5 | find ./1_identify_gene_family/02_hmm -name '*_gene.id' -exec cat {} \; |sort |uniq -c |awk '$1 == 2 {print $2}' > ./1_identify_gene_family/02_hmm/hmm.id
6 | comm -12 ./1_identify_gene_family/02_hmm/hmm.id ./1_identify_gene_family/01_blastp/blast.filt.id > ./1_identify_gene_family/result/01_target_gene.id
7 | seqkit grep -f ./1_identify_gene_family/result/01_target_gene.id ./00_data/pep.fa > ./1_identify_gene_family/result/02_target_gene.pep
8 | 
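## Hedged sanity-check sketch: comm -12 above needs both id lists lexicographically
## sorted (they are, via sort -u); counting the lists shows how many candidates pass
## each filter and how many survive the final intersection.
# wc -l ./1_identify_gene_family/02_hmm/hmm.id ./1_identify_gene_family/01_blastp/blast.filt.id ./1_identify_gene_family/result/01_target_gene.id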
-------------------------------------------------------------------------------- /genome/Anno_EGAPx/egapx.03.1.sh: --------------------------------------------------------------------------------
1 | #!/bin/bash
2 | ## source a python env with the PyYAML module installed
3 | source /01_soft/egapx/egapx/bin/activate
4 | 
5 | ## You can use singularity to pull the image from https://hub.docker.com/r/ncbi/egapx/tags, then put the ncbi-egapx-0.2-alpha.img in NXF_SINGULARITY_CACHEDIR
6 | export NXF_SINGULARITY_CACHEDIR=/01_soft/egapx-0.3.1-alpha/NXF_SINGULARITY_CACHEDIR
7 | export JAVA_HOME=/01_software/jdk-11.0.1
8 | export TMPDIR=$PWD
9 | 
10 | file_path=local2.yaml
11 | outdir=Output
12 | main_script=/01_soft/egapx-0.3.1-alpha/ui/egapx.py
13 | workdir=$PWD/workdir
14 | 
15 | [ -d egapx_config ] || mkdir -p egapx_config && cp /01_soft/egapx/egapx_config/singularity.config egapx_config/singularity.config
16 | 
17 | ### the cache file you can download from https://ftp.ncbi.nlm.nih.gov/genomes/TOOLS/EGAP/support_data/
18 | python3 ${main_script} ${file_path} -e singularity -w ${workdir} -o ${outdir} -lc /01_soft/egapx-0.3.0-alpha/support_data
19 | 
20 | ## rm tmp
21 | rm -rf ${workdir} .nextflow
22 | 
-------------------------------------------------------------------------------- /picture/syri_plotsv/base.demo.config: --------------------------------------------------------------------------------
1 | /* -------------------------------------------------
2 |  * Nextflow config file
3 |  * -------------------------------------------------
4 |  */
5 | 
6 | process {
7 |     errorStrategy = { ( task.exitStatus == 143 || task.exitStatus == 137 ) ? 'retry' : 'finish' }
8 |     maxRetries = 3
9 |     maxErrors = '-1'
10 | 
11 |     withName: 'ALIGN.*' {
12 |         cpus = {ABCD * task.attempt }
13 |         memory = { EFGH.GB * task.attempt }
14 |         time = { 24.h * task.attempt }
15 |     }
16 |     withName: 'FIXCHR.*|SYRI.*|PLOTSR.*' {
17 |         cpus = {1 * task.attempt }
18 |         memory = { 4.GB * task.attempt }
19 |         time = { 24.h * task.attempt }
20 |     }
21 |     withName: 'SEQTK.*' {
22 |         cpus = {1 * task.attempt }
23 |         memory = { 2.GB * task.attempt }
24 |         time = { 1.h * task.attempt }
25 |     }
26 |     /*
27 |     withName: SEQKIT_GET_LENGTH {
28 |         cpus = {1 * task.attempt }
29 |         memory = { 1.GB * task.attempt }
30 |         time = { 30.min * task.attempt }
31 | 
32 |     }
33 |     */
34 | }
35 | 
-------------------------------------------------------------------------------- /genome/Anno_RNA/minimap2/step3.sh: --------------------------------------------------------------------------------
1 | export PATH=/01_software/TransDecoder-TransDecoder-v5.5.0/util/:$PATH
2 | export PERL5LIB=/01_software/TransDecoder-TransDecoder-v5.5.0/PerlLib:$PERL5LIB
3 | 
4 | ###
5 | cat pfam.qsub/pfam.1.domtblout > pfam.domtblout
6 | for i in `seq 2 200`
7 | do
8 | less pfam.qsub/pfam.${i}.domtblout | grep -v '^#' >> pfam.domtblout
9 | done
10 | cat blast.qsub/*outfmt6 | awk '$3>60' > blastp.outfmt6
11 | 
12 | ###
13 | trans=transcripts.fasta
14 | gtf=transcripts.gtf ## assumed name: the GTF matching ${trans} from the alignment/assembly step; adjust to your file
15 | /01_software/TransDecoder-TransDecoder-v5.5.0/TransDecoder.Predict -t ${trans} --retain_pfam_hits pfam.domtblout --retain_blastp_hits blastp.outfmt6
16 | 
17 | gtf_to_alignment_gff3.pl ${gtf} > ${gtf}.gff3
18 | cdna_alignment_orf_to_genome_orf.pl ${trans}.transdecoder.gff3 ${gtf}.gff3 ${trans} > ${trans}.transdecoder.genome.gff3
19 | awk '$3=="CDS" || $3=="mRNA" {print $0";"} ' ${trans}.transdecoder.genome.gff3 > ${trans}.transdecoder.genome.gff3.tmp
20 | Covert_for_evm.pl ${trans}.transdecoder.genome.gff3.tmp TransDecoder | awk '!a[$1"\t"$4"\t"$5]++{print $0}' > ${trans}.transdecoder.genome.gff3.forevm.gff3
21 | 
-------------------------------------------------------------------------------- /genome/assess/busco-5.5.sh: --------------------------------------------------------------------------------
1 | #!/bin/bash
2 | 
3 | #### attention ####
4 | 
5 | # The lineage database must be odb10 (stored under ~/06_database/)
6 | # augustus_species: ~/01_software/minimamba/envs/augustus/config/species
7 | # species list: ~/06_database/specie.txt
8 | 
9 | #### attention ####
10 | 
11 | ### version 4.1.2
12 | #source activate ~/conda/envs/busco-4/
13 | ### version 5.3
14 | #source activate ~/miniconda3/envs/busco5
15 | ### version 5.5.0
16 | source activate ~/home_micromamba/envs/busco5.5.0
17 | 
18 | input=pep.fa
19 | cpu=5
20 | model=prot ##trans prot geno
21 | output=BUSCO
22 | evalue=1e-03
23 | species_model=zebrafish
24 | db=actinopterygii_odb10
25 | database=~/06_database
26 | 
27 | busco --offline -l ${database}/${db} -e ${evalue} -m ${model} -c ${cpu} -i ${input} -o ${output} --augustus_species ${species_model}
28 | 
29 | rm -rf ${output}/logs/ ${output}/run_${db}/busco_sequences/ ${output}/run_${db}/hmmer_output/ ${output}/short_summary.specific.${db}.BUSCO.json ${output}/run_${db}/short_summary.json busco_downloads
30 | mv ${output}/run_${db}/* ${output}/
31 | rm -rf ${output}/run_${db}
32 | 
-------------------------------------------------------------------------------- /genome/Anno_RNA/GMAP/map.sh: --------------------------------------------------------------------------------
1 | 
2 | reference=$PWD/reference
3 | species=HCS
4 | transcript=all.rename.cdhit99.trans.fa
5 | cpu=10
6 | min_identity=0.7
7 | max_intronlength_middle=20000
8 | total_intron_length=100000
9 | output_format=gff3_gene
10 | output_name=GMAP
11 | 
12 | ### To handle the "program received signal SIGSEGV" error in isolation, split the transcript fasta
13 | 
14 | fastaDeal.pl --cuts 100 ${transcript}
15 | ls ${transcript}.cut > id
16 | for i in $(cat id)
17 | do
18 | echo "/gmap-2021-08-25/bin/gmap -D $reference -d ${species}_reference --min-identity ${min_identity} --canonical-mode 2 --max-intronlength-middle ${max_intronlength_middle} --totallength ${total_intron_length} -t $cpu --input-buffer-size=20 --output-buffer-size=20 --allow-close-indels=2 --tolerant --truncate --split-large-introns --suboptimal-score=0.9 -f $output_format $PWD/${transcript}.cut/${i} > $PWD/${transcript}.cut/${output_name}.${i}.gff 2> $PWD/${transcript}.cut/${output_name}.${i}.log" >> all.run.sh
19 | done
20 | 
21 | ### Do not deliver tasks in parallel
22 | qsub -cwd -l vf=10G,p=10 -binding linear:10 -q XXX -P XXX all.run.sh
23 | 
-------------------------------------------------------------------------------- /Comparative_genomics/kaks/collinearity_kaks.sh: --------------------------------------------------------------------------------
1 | ### https://github.com/reubwn/collinearity/tree/v1.0
2 | export PATH="/01_soft/MCScanX/:$PATH"
3 | export PATH="/01_soft/collinearity:$PATH"
4 | 
5 | pep=2.gff.pep
6 | cds=2.gff.cds
7 | gff=2.gff
8 | cpu=30
9 | 
10 | ##
11 | diamond makedb --in ${pep} -d ${pep}
12 | diamond blastp -e 1e-2 -p 8 -q ${pep} -d ${pep} -a ${pep}.vs.self
13 | diamond view -a ${pep}.vs.self.daa -o Xyz.blast
14 | awk '$3=="mRNA"' ${gff} | awk '{print $1"\t"$9"\t"$4"\t"$5}' | sed 's/ID=//g;s/;//g' > Xyz.gff
15 | 
16 | ##
17 | [ -d result ] || mkdir result
18 | cp Xyz* result
19 | MCScanX result/Xyz
20 | duplicate_gene_classifier result/Xyz
21 | 
22 | ##
23 | add_kaks_to_MCScanX.pl -i result/Xyz.collinearity -p ${pep} -c ${cds} -t ${cpu}
24 | calculate_collinearity_metric.pl -i result/Xyz.collinearity -g Xyz.gff -k result/Xyz.collinearity.kaks
25 | calculate_collinearity_breakpoints.pl -i result/Xyz.collinearity -g Xyz.gff -s result/Xyz.collinearity.score -k result/Xyz.collinearity.kaks -b
26 | calculate_collinearity_palindromes.pl -i result/Xyz.collinearity -g Xyz.gff -k result/Xyz.collinearity.kaks
27 | 
-------------------------------------------------------------------------------- /picture/synteny_circos/simpletolink.py: --------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 | # simple2links
3 | 
4 | from sys import argv
5 | 
6 | simple_file = argv[1]
7 | 
8 | ref_bed = simple_file.split(".")[0] + ".bed"
9 | qry_bed = simple_file.split(".")[1] + ".bed"
10 | 
11 | ref_dict = {line.split("\t")[3]:line.split("\t")[0:3] for line in open(ref_bed)}
12 | qry_dict = {line.split("\t")[3]:line.split("\t")[0:3] for line in open(qry_bed)}
13 | 
14 | fo = open(simple_file + "_link.txt", "w")
15 | 
16 | for line in open(simple_file):
17 |     if line.startswith("#"):
18 |         continue
19 |     items = line.strip().split("\t")
20 |     ref_start_gene = items[0]
21 |     ref_end_gene = items[1]
22 |     qry_start_gene = items[2]
23 |     qry_end_gene = items[3]
24 | 
25 |     ref_chr, ref_start = ref_dict[ref_start_gene][0:2]
26 |     ref_end = ref_dict[ref_end_gene][2]
27 |     qry_chr, qry_start = qry_dict[qry_start_gene][0:2]
28 |     qry_end = qry_dict[qry_end_gene][2]
29 | 
30 |     circos_input = [ref_chr, ref_start, ref_end, qry_chr, qry_start, qry_end]
31 |     fo.writelines('\t'.join(circos_input) + '\n')
32 | 
33 | fo.close()
34 | 
-------------------------------------------------------------------------------- /genome/Anno_homology/miniprot/miniprot.sh: --------------------------------------------------------------------------------
1 | genome=EFGH.fa
2 | pep=pep.fa
3 | prefix=EFGH_ ### prefix for IDs in GFF3
4 | cpu=20
5 | max_intron_size=10k ### max intron size [200k]
6 | splice_model=1 ### splice model: 2=mammal, 1=general, 0=none (see Detail) [1]
7 | weight_of_splice_penalty=1 ### weight of splice penalty; 0 to ignore splice signals [1]
8 | 
9 | miniprot -G ${max_intron_size} -j ${splice_model} -t ${cpu} --gff -P ${prefix} -C ${weight_of_splice_penalty} ${genome} ${pep} --outs=0.99 > EFGH.gff
10 | 
11 | grep -A 1 "##PAF" EFGH.gff | awk '$1!~/--/' | paste - - | awk '($5-$4)/$3>0.9' | awk '{print $2"\t"$3"\t"$4"\t"$5"\t"$(NF-10)"\t"$(NF-6)"\t"$(NF-5)"\t"$(NF-4)"\t"$(NF-2)}' | sed 's/;/\t/g;s/ID=//g' > filter.info
12 | cat EFGH.gff | grep -v -e "^#" -e stop_codon | gffread -C -G -K -Q -Y -M --cset -d dup.log -H -V -P -N -Z - -g ${genome} -o EFGH.gff.gffread
13 | fishInWinter.pl -bf table -ff gff - EFGH.gff.gffread | awk '{print $0";"}' | sed "s/miniprot/miniprot_EFGH/1" > EFGH.gff.gffread.gff
14 | Covert_for_evm.pl EFGH.gff.gffread.gff miniprot_EFGH > EFGH.gff.gffread.gff.forevm.gff3
15 | 
-------------------------------------------------------------------------------- /genome/Anno_EGAPx/fix_mRNA_coordinate.pl: --------------------------------------------------------------------------------
1 | #!/usr/bin/perl
2 | 
3 | ### By Sunshai (sunhai@genomic.cn)
4 | 
5 | use strict;
6 | use warnings;
7 | 
8 | my ($gff, $out) = @ARGV;
9 | 
10 | open FL, $gff;
11 | my %end;
12 | while (<FL>) {
13 |     chomp;
14 |     my @tmp = split;
15 | 
16 |     if ($tmp[2] eq 'CDS') {
17 |         my $id = $1 if ($tmp[8] =~ /Parent=([^;\s]+)/);
18 |         #print "$id\n";
19 |         if (!exists $end{$id}{'start'} or $tmp[3] <= $end{$id}{'start'}) {
20 |             $end{$id}{'start'} = $tmp[3];
21 |         }
22 |         if (!exists $end{$id}{'end'} or $tmp[4] >= $end{$id}{'end'}) {
23 |             $end{$id}{'end'} = $tmp[4];
24 |         };
25 |     };
26 | };
27 | close FL;
28 | 
29 | open FL, $gff;
30 | open FLS, ">$out";
31 | while (<FL>) {
32 |     chomp;
33 |     my @tmp = split;
34 |     if ($tmp[2] eq 'mRNA') {
35 |         my $id = $1 if ($tmp[8] =~ /ID=([^;\s]+)/);
36 |         next if (!exists $end{$id}{'start'});
37 |         $tmp[3] = $end{$id}{'start'};
38 |         $tmp[4] = $end{$id}{'end'};
39 |     };
40 |     print FLS join("\t", @tmp), "\n";
41 | };
42 | close FLS;
43 | close FL;
44 | 
-------------------------------------------------------------------------------- /genome/Anno_integrate/evm_auto.sh: --------------------------------------------------------------------------------
1 | #!/bin/sh
2 | 
3 | species=S_japo
4 | genome=genome.fa
5 | cpu=48
6 | weight=weights.txt
7 | denovo=denovo.gff
8 | pep=pep.gff
9 | RNA=RNA.gff
10 | repeats=all.repeat.gff
11 | segmentSize=1000000
12 | overlapSize=200000
13 | min_intron_length=20
14 | 
15 | export PERL5LIB=/01_software/TransDecoder-TransDecoder-v5.5.0/PerlLib:$PERL5LIB
16 | 
17 | cat ${pep} ${RNA} ${denovo} | grep -v "^#" | cut -f 1 | sed '/^\s*$/d' | awk '!a[$1]++' | seqtk subseq -l 100 ${genome} - > ${genome}.filter.fa
18 | 
19 | /01_software/EVidenceModeler-v2.1.0/EVidenceModeler \
20 |     --sample_id ${species} \
21 |     --genome ${genome}.filter.fa \
22 |     --weights ${weight} \
23 |     --gene_predictions ${denovo} \
24 |     --protein_alignments ${pep} \
25 |     --transcript_alignments ${RNA} \
26 |     --segmentSize ${segmentSize} \
27 |     --overlapSize ${overlapSize} \
28 |     --CPU ${cpu} \
29 |     --repeats ${repeats} \
30 |     --min_intron_length ${min_intron_length}
31 | 
-------------------------------------------------------------------------------- /deal_gff/pick_longest_gene/fix_mRNA_coordinate.pl: --------------------------------------------------------------------------------
1 | #!/usr/bin/perl
2 | 
3 | ### By Sunshai (sunhai@genomic.cn)
4 | 
5 | use strict;
6 | use warnings;
7 | 
8 | my ($gff, $out) = @ARGV;
9 | 
10 | open FL, $gff;
11 | my %end;
12 | while (<FL>) {
13 |     chomp;
14 |     my @tmp = split;
15 | 
16 |     if ($tmp[2] eq 'CDS') {
17 |         my $id = $1 if ($tmp[8] =~ /Parent=([^;\s]+)/);
18 |         #print "$id\n";
19 |         if (!exists $end{$id}{'start'} or $tmp[3] <= $end{$id}{'start'}) {
20 |             $end{$id}{'start'} = $tmp[3];
21 |         }
22 |         if (!exists $end{$id}{'end'} or $tmp[4] >= $end{$id}{'end'}) {
23 |             $end{$id}{'end'} = $tmp[4];
24 |         };
25 |     };
26 | };
27 | close FL;
28 | 
29 | open FL, $gff;
30 | open FLS, ">$out";
31 | while (<FL>) {
32 |     chomp;
33 |     my @tmp = split;
34 |     if ($tmp[2] eq 'mRNA') {
35 |         my $id = $1 if ($tmp[8] =~ /ID=([^;\s]+)/);
36 |         next if (!exists $end{$id}{'start'});
37 |         $tmp[3] = $end{$id}{'start'};
38 |         $tmp[4] = $end{$id}{'end'};
39 |     };
40 |     print FLS join("\t", @tmp), "\n";
41 | };
42 | close FLS;
43 | close FL;
44 | 
-------------------------------------------------------------------------------- /genome/Anno_RNA/minimap2/step2.sh: --------------------------------------------------------------------------------
1 | trans=transcripts.fasta
2 | cpu=2
3 | ### /06_database/SwissProt/uniprot_sprot
4 | blastp_db=/01_genome/Jipidongwu_db/jipidongwu
5 | pfam_db=/06_database/Pfam/Pfam-A.hmm
6 | blastp_soft=/01_soft/ncbi-blast-2.15.0+/bin/blastp
7 | diamond_soft=/00_tools/diamond
8 | evalue=1e-3
9 | hmmscan_soft=/01_soft/mambaforge/bin/hmmscan
10 | 
11 | ln -s ${trans}.transdecoder_dir/longest_orfs.pep .
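## longest_orfs.pep is produced by TransDecoder.LongOrfs on ${trans} in the preceding
## step (not shown here); it is split into 200 chunks below so the Pfam and blastp
## searches can run as parallel jobs.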
12 | fastaDeal.pl --cutf 200 longest_orfs.pep
13 | 
14 | mkdir blast.qsub pfam.qsub
15 | for i in `seq 1 200`
16 | do
17 | #echo "${blastp_soft} -query ../longest_orfs.pep.cut/longest_orfs.pep.${i} -db ${blastp_db} -max_target_seqs 1 -outfmt 6 -evalue ${evalue} -num_threads ${cpu} > blastp.${i}.outfmt6 ; echo done " > blast.qsub/blast.${i}.sh
18 | echo "diamond blastp --evalue ${evalue} --outfmt 6 -d ${blastp_db} -q ../longest_orfs.pep.cut/longest_orfs.pep.${i} -o blastp.${i}.outfmt6 --threads $cpu --max-target-seqs 5 --more-sensitive -b 0.5 " > blast.qsub/blast.${i}.sh
19 | echo "${hmmscan_soft} --cpu ${cpu} --domtblout pfam.${i}.domtblout ${pfam_db} ../longest_orfs.pep.cut/longest_orfs.pep.${i} ; echo done " > pfam.qsub/pfam.${i}.sh
20 | done
21 | 
-------------------------------------------------------------------------------- /Comparative_genomics/gene_family/04_final_gene_family.sh: --------------------------------------------------------------------------------
1 | cp ./00_data/final_gene_id ./1_identify_gene_family/04_final_gene_family/final_gene_id
2 | seqkit grep -f ./1_identify_gene_family/04_final_gene_family/final_gene_id ./00_data/pep.fa > ./1_identify_gene_family/04_final_gene_family/final_gene_protein
3 | seqkit grep -f ./1_identify_gene_family/04_final_gene_family/final_gene_id ./00_data/cds.fa > ./1_identify_gene_family/04_final_gene_family/final_gene_cds
4 | seqkit fx2tab --length --name ./1_identify_gene_family/04_final_gene_family/final_gene_cds | awk '{print $1, $2}'> ./1_identify_gene_family/04_final_gene_family/final_gene_cds_length
5 | grep 'CDS' ./00_data/C_albu.bgi.gff| cut -f9 | cut -d ';' -f1 | cut -d '=' -f2 | sort | uniq -c|awk '{print $2, $1}' > ./1_identify_gene_family/04_final_gene_family/gene_cds_number
6 | grep -f ./1_identify_gene_family/04_final_gene_family/final_gene_id ./1_identify_gene_family/04_final_gene_family/gene_cds_number |sort >./1_identify_gene_family/04_final_gene_family/final_gene_cds_number
7 | pfam_scan.pl -fasta ./1_identify_gene_family/04_final_gene_family/final_gene_protein -dir ./Pfam/ > ./1_identify_gene_family/04_final_gene_family/final_gene_domain
8 | 
-------------------------------------------------------------------------------- /other/outlier2.py: --------------------------------------------------------------------------------
1 | import sys
2 | import numpy as np
3 | import pandas as pd
4 | 
5 | def remove_outliers_iqr(data_series, m=1):
6 |     q1 = data_series.quantile(0.25)
7 |     q3 = data_series.quantile(0.75)
8 |     iqr = q3 - q1
9 |     lower_bound = q1 - m * iqr
10 |     upper_bound = q3 + m * iqr
11 |     return data_series[(data_series >= lower_bound) & (data_series <= upper_bound)]
12 | 
13 | def main(input_file, output_file):
14 |     # read the data file
15 |     data = pd.read_csv(input_file, sep='\t', header=None)
16 | 
17 |     # column 1 holds sample names, column 2 the values
18 |     sample_names = data[0]
19 |     values = data[1]
20 | 
21 |     # drop outliers
22 |     cleaned_values = remove_outliers_iqr(values)
23 | 
24 |     # collect the results in a new DataFrame
25 |     cleaned_data = pd.DataFrame({'Sample': sample_names, 'Value': cleaned_values})
26 | 
27 |     # write the results to a new file
28 |     cleaned_data.to_csv(output_file, sep='\t', index=False)
29 | 
30 | if __name__ == "__main__":
31 |     if len(sys.argv) != 3:
32 |         print("Usage: python script.py <input_file> <output_file>")
33 |         sys.exit(1)
34 | 
35 |     input_file_path = sys.argv[1]
36 |     output_file_path = sys.argv[2]
37 | 
38 |     main(input_file_path, output_file_path)
39 | 
-------------------------------------------------------------------------------- /picture/box/box.R: 
--------------------------------------------------------------------------------
1 | library(ggplot2)
2 | library(readr)
3 | 
4 | args <- commandArgs (T)
5 | 
6 | file_path <- args[1]
7 | x_column <- args[2]
8 | y_column <- args[3]
9 | 
10 | # read the data, using the first row as the header
11 | data <- read_delim(file_path, col_names = TRUE)
12 | 
13 | # check that both requested columns exist
14 | if (!(x_column %in% colnames(data)) || !(y_column %in% colnames(data))) {
15 | stop("One or both of the specified columns do not exist in the data.")
16 | }
17 | 
18 | # select the two columns to analyse
19 | selected_data <- data[, c(x_column, y_column)]
20 | 
21 | # inspect the selected data
22 | #head(selected_data)
23 | 
24 | # define a gradient color scheme
25 | color_scheme <- colorRampPalette(c("blue", "red"))(100) # generate 100 gradient colors
26 | 
27 | pdf(args[4], width = 15, height = 6)
28 | 
29 | # draw a combined box/violin plot with ggplot2
30 | p <- ggplot(selected_data, aes_string(x = x_column, y = y_column)) +
31 | # geom_violin(trim = FALSE, fill = color_scheme[1]) + # violin layer, gradient color
32 | stat_boxplot(geom = "errorbar",width=0.3) +
33 | geom_boxplot(width = 0.5, fill = color_scheme[2], outlier.fill = "grey", outlier.shape = 21) + # box layer, gradient color
34 | labs(x = x_column, y = y_column) +
35 | theme_classic() + # clean theme
36 | # theme_minimal() +
37 | theme(axis.text.x = element_text(angle = 45, hjust = 1)) # rotate x-axis labels for readability
38 | p
39 | 
-------------------------------------------------------------------------------- /genome/Hic/all_hic.sh: --------------------------------------------------------------------------------
1 | ALLHiC is designed mainly for Hi-C scaffolding of polyploid and highly heterozygous genomes; the steps below use ALLHiC to scaffold a simple diploid genome (notes for complex genomes will be added later).
2 | 1. Download ALLHiC
3 | -------------------------------------------------
4 | $ git clone https://github.com/tangerzhang/ALLHiC
5 | $ cd ALLHiC
6 | $ chmod +x bin/*
7 | $ chmod +x scripts/*
8 | -------------------------------------------------
9 | 2. Run ALLHiC_pip.sh from ALLHiC; it depends on samtools and bwa (the dependency paths can be edited inside ALLHiC_pip.sh)
10 | Note: your wrapper script must export PATH=/path/ALLHiC/scripts/:/path/ALLHiC/bin:$PATH
11 | -------------------------------------------------
12 | Usage: ALLHiC_pip.sh -r reference -1 R1.fq -2 R2.fq -k group_count [-e enzyme] [-t threads] [-b bin_size]
13 | -r: reference genome
14 | -1: Lib_R1.fq.gz
15 | -2: Lib_R2.fq.gz
16 | -k: group_count
17 | -e: enzyme_sites (HindIII: AAGCTT; MboI: GATC), default: HindIII
18 | -t: threads, default: 10
19 | -b: bin_size for hic heatmap, can be divided with comma, default: 500k
20 | -------------------------------------------------
21 | If the wrapper script fails, run the steps one by one.
22 | A common cause of failure is an extra @HD header line in sorted.bam; fix it with the command below, then continue with the remaining steps:
23 | samtools view -h sorted.bam | sed -e '/@HD\tVN:1.5\tSO:unsorted\tGO:query/d' | samtools view -b -o deal_sorted.bam
24 | 
-------------------------------------------------------------------------------- /transcriptome/Enrich/make_Orgdb.sh: --------------------------------------------------------------------------------
1 | 
2 | genus=Hypophthalmichthys
3 | species=nobilis
4 | taxid=7965
5 | name=lishuo
6 | gene2path=gene2pathway.txt
7 | 
8 | ## EggNog annotation file and gene2pathway.txt
9 | sed '1,4d' eggnog-result.emapper.annotations | sed 's/^#//g' | grep -v "#" | csvtk replace -t -F -f "GOs" -p "(-)" -r "NA" | csvtk replace -t -F -f "KEGG_ko" -p "(-)" -r "NA" | csvtk replace -t -F -f "KEGG_Pathway" -p "(-)" -r "NA" | csvtk replace -t -F -f "eggNOG_OGs" -p "(-)" -r "NA" > eggNog.anno.txt
10 | 
11 | /micromamba/envs/R_env/bin/Rscript AnnotationForge_20250117.R -i eggNog.anno.txt -a ${name} -m shiyeyishang@outlook.com -g ${genus} -s ${species} -d ${taxid}
12 | DB=$(ls | grep eg.db)
13 | sed -i 's/shiyeyishang\@outlook.com/lishuo/g' ${DB}/DESCRIPTION
14 | /dellfsqd2/ST_OCEAN/USER/lishuo11/09_test/zz_tmp/home_micromamba/envs/R_env/bin/R CMD build $DB
15 | 
16 | [ -d R_lib ] || mkdir R_lib
17 | DBgz=$(ls | grep eg.db | grep "tar.gz")
18 | /micromamba/envs/R_env/bin/R CMD INSTALL ${DBgz} --library=$PWD/R_lib
19 | 
20 | ## deal gene2pathway.txt
21 | sed '1d' ${gene2path} | csvtk -t add-header -n GID,Pathway,Name > gene2pathway_forClusterProfiler.txt
22 | 
23 | rm -rf ${DB} ${DBgz} eggNog.anno.txt
24 | 
-------------------------------------------------------------------------------- /Comparative_genomics/deal_tree_nwk/draw.r: --------------------------------------------------------------------------------
1 | #!/R-4.2/bin/Rscript
2 | 
3 | ### https://cn.bio-protocol.org/bio101/e1010674
4 | ### https://cran.r-project.org/web/packages/ape/ape.pdf
5 | ### choose libPaths ###
6 | .libPaths("/R-4.2/lib/R/library/")
7 | 
8 | args <- commandArgs (T)
9 | library(ape)
10 | 
11 | tree <- read.tree(args[1])
12 | pdf(paste(args[2], ".pdf", sep = "", collapse = ""), height = 15 )
13 | 
14 | par(mfrow = c(4, 2))
15 | plot(tree, type = "p", main = "phylogram with branch lengths", sub = "A", use.edge.length = TRUE, tip.color = rainbow(5))
16 | plot(tree, type = "p", main = "phylogram without branch lengths", sub = "B", use.edge.length = FALSE, edge.width = 1:27/2)
17 | plot(tree, type = "c", main = "cladogram", sub = "C", edge.color = rainbow(27))
18 | plot(tree, type = "f", main = "fan", sub = "D", font = 3)
19 | plot(tree, type = "u", main = "unrooted", sub = "E")
20 | plot(tree, type = "r", main = "radial", sub = "F")
21 | 
22 | plot(tree, type = "p", main = "phylogram with branch lengths", sub = "G", use.edge.length = TRUE, edge.width = 2)
23 | nodelabels(bg = "lightgray", frame = "c")
24 | 
25 | plot(tree, type = "p", main = "phylogram with branch lengths", sub = "G", use.edge.length = TRUE, edge.width = 2)
26 | edgelabels()
27 | 
-------------------------------------------------------------------------------- /Comparative_genomics/blast/reciprocal_best_hits.sh: --------------------------------------------------------------------------------
1 | ### https://github.com/glarue/reciprologs
2 | ### https://rdrr.io/github/drostlab/homologr/man/diamond_reciprocal_best_hits.html
3 | 
4 | ### !!!!!!!!!!
5 | ### Attention: cp pep.fa into the workdir !!!!!!! Do NOT symlink it !!!!!!!! Otherwise the blast db will be generated in the directory of the raw pep.fa !!!!!
6 | ### !!!!!!!!!!
7 | 8 | ### mmseqs 1:1 or 1:many 9 | mmseqs easy-rbh cse.pep dre.pep mmseqs.rbh.txt tmp --threads 5 10 | cut -f 1,2 mmseqs.rbh.txt | awk 'NR==FNR{a[$1]=$0}NR!=FNR{print $0"\t"a[$1]}' cse.gene - | csvtk -t cut -f 2,1,5 - | guanlian2 dre.gene - | csvtk -t cut -f 2,1,3,6- > mmseqs.rbh.txt.diff 11 | 12 | ### blast_rbh.py 13 | blast_rbh.py --threads=5 -c 30 -i 50 -a prot -t blastp -o blast_rbh.tsv cse.pep dre.pep 14 | 15 | ### reciprologs && diamond && networkx 16 | reciprologs -p 5 --chain -q 30 -o diamond.rbh.txt --one_to_one --logging cse.pep dre.pep diamondp 17 | #reciprologs -p 10 --chain -q 30 -o diamond_more.rbh.txt cse.pep dre.pep diamondp 18 | 19 | ### reciprologs && blastp && networkx 20 | reciprologs -p 10 --chain -q 30 -o BLASTP.rbh.txt --one_to_one --logging cse.pep dre.pep blastp 21 | 22 | ### diamond && evalue 23 | diamond_rbh.R -a cse.fa -b dre.fa -c 10 -e 1E-3 -m 5 -M ultra-sensitive -o diamond.R.rbh.csv 24 | -------------------------------------------------------------------------------- /Comparative_genomics/gene_family/03_miniprot.sh: -------------------------------------------------------------------------------- 1 | blastp -query ./00_data/TLR_protein.fasta -subject ./1_identify_gene_family/result/02_target_gene.pep -evalue 1e-05 -seg yes -outfmt "6 qseqid"| sort -u > ./1_identify_gene_family/03_miniprot/new_gene_protein_id 2 | seqkit grep -f ./1_identify_gene_family/03_miniprot/new_gene_protein_id ./00_data/TLR_protein.fasta > ./1_identify_gene_family/03_miniprot/new_gene_protein_fasta 3 | cat ./1_identify_gene_family/result/02_target_gene.pep >> ./1_identify_gene_family/03_miniprot/new_gene_protein_fasta 4 | miniprot --gff -I ./00_data/C_albu.fa ./1_identify_gene_family/03_miniprot/new_gene_protein_fasta > ./1_identify_gene_family/03_miniprot/miniprot.gff 5 | awk '{if($0 ~ /^##/ && $11 != "" && $12 != "" && ($11 + 0)/($12 + 0) > 0.8) {found=1; next} else {if ($0 ~ /^##/) {found = 0}}} /mRNA/ {if (found == 1) print} /CDS/ {if (found == 1) print}' ./1_identify_gene_family/03_miniprot/miniprot.gff > ./1_identify_gene_family/03_miniprot/miniprot.filt.gff 6 | gffread -C -G -K -Q -Y -M -d dup ./1_identify_gene_family/03_miniprot/miniprot.filt.gff > ./1_identify_gene_family/03_miniprot/miniprot.filt.gff2 7 | gffread ./1_identify_gene_family/03_miniprot/miniprot.filt.gff2 -g ./00_data/C_albu.fa -x ./1_identify_gene_family/03_miniprot/pre.cds -------------------------------------------------------------------------------- /genome/pseudogenes/step2.sh: -------------------------------------------------------------------------------- 1 | ## https://github.com/kelkar/Discover_pseudogenes 2 | 3 | protein=use.pep 4 | cds=use.cds 5 | genome=genome.mask.for_anno.fa.cut/ABCD 6 | output_prefix=pse_ABCD 7 | Percent=60 8 | 9 | [ -d TMP ] || mkdir TMP 10 | 11 | ### step1 12 | exonerate --percent ${Percent} --model protein2genome --showquerygff yes --showtargetgff yes -q ${protein} -t ${genome} --ryo "RYO\t%qi\t%ti\t%ql\t%tl\t%qal\t%qab\t%qae\t%tal\t%tab\t%tae\t%et\t%ei\t%es\t%em\t%pi\t%ps\t%g\nTransitionStart\n%V{%Pqs\t%Pts\t%Pqb\t%Pqe\t%Ptb\t%Pte\t%Pn\t%Pl\n}TransitionEnd\nTargetSeq\n%qs\nAligned Sequences\n>Q\n%qas\n>T\n%tas\nCoding Sequences\n>Q\n%qcs\n>T\n%tcs\n" > TMP/${output_prefix}.exonerate.txt 13 | 14 | ### step2 15 | grep -P "Query:|Target:|(Query range:)|(Target range:)|(^[A-Z]\tTAA\t)|(\sframeshift\s)|(^[A-Z]\tTAG\t)|(^[A-Z]\tTGA\t)|(^\*)" TMP/${output_prefix}.exonerate.txt > TMP/${output_prefix}.exonerate.erros.txt 16 | 17 | ### step3 18 | perl 
/exonerate-2.2.0-x86_64/bin/Exonerate_to_evm_gff3.pl TMP/${output_prefix}.exonerate.txt > TMP/${output_prefix}.exonerate.gff 19 | 20 | ### step4 21 | perl /exonerate-2.2.0-x86_64/bin/tabulate_stops_frameshifts.pl ${cds} TMP/${output_prefix}.exonerate.gff TMP/${output_prefix}.exonerate.erros.txt > TMP/${output_prefix}.pseudogenes.txt 22 | -------------------------------------------------------------------------------- /Comparative_genomics/gene_family_cluster/sonicparanoid2.sh: -------------------------------------------------------------------------------- 1 | source activate /home_micromamba/envs/sonicparanoid2/ 2 | 3 | cwd=$PWD 4 | indir=$cwd/data 5 | outdir=$cwd/sonicparanoid2_out 6 | cpu=30 7 | prefix=3spe 8 | MIN_ARCH_MERGING_COV=0.75 ## When merging graph- and arch-based orhtologs consider only new-orthologs with a protein coverage greater or equal than this value. 9 | INFLATION=1.5 10 | MIN_BITSCORE=40 ## Consider only alignments with bitscores above min-bitscore. 11 | ## Increasing this value can be a good idea when comparing very closely related species. 12 | ## Increasing this value will reduce the number of paralogs (and orthologs) generate. 13 | ## higher min-bitscore values reduce the execution time for all-vs-all. Default=40 14 | 15 | sonicparanoid -i $indir -o $outdir -p $prefix -t $cpu -m sensitive -at -ka -ca -op -d --min-arch-merging-cov ${MIN_ARCH_MERGING_COV} -bs ${MIN_BITSCORE} --inflation ${INFLATION} 16 | 17 | mv $outdir/runs/$prefix/* $outdir 18 | mv $outdir/ortholog_groups $outdir/$prefix 19 | 20 | ### remove tmp file !!!!!!!!!!!!!!!!!!!!! 21 | rm -rf $outdir/runs $outdir/orthologs_db $outdir/alignments $outdir/arch_orthology $outdir/merged_tables 22 | -------------------------------------------------------------------------------- /Comparative_genomics/get_gene_infomation/deal.sh: -------------------------------------------------------------------------------- 1 | cat All.RAW.txt | sed "s/\"//g" | grep -v "total_count: 0" | sed 's/description:/\tdescription:/1;s/,gene_id:/\tgene_id:/1;s/,protein_count:/\tprotein_count:/1;s/,symbol:/\tsymbol:/1;s/,tax_id:/\ttax_id:/1;s/,taxname:/\ttaxname:/1;s/,transcript_count:/\t/1' | cut -f 2- | sed 's/,cds:/\tcds:/1' | cut -f 1-6,8- | sed "s/:{accession_version:/:/1;s/\[//g;s/\]//g" | sed "s/:{begin:/:/1" | sed 's/}},/\t/1;s/,end:/___/1' | sed 's/genomic_accession_version:/\tgenomic_accession_version:/1' | sed "s/:{accession_version:/:/1;s/\[//g;s/\]//g" | sed "s/:{begin:/:/1" | sed 's/}},/\t/1;s/,end:/___/1' | sed 's/genomic_accession_version:/\tgenomic_accession_version:/1' | cut -f 1-7,9- | sed "s/:{begin:/:/1;s/,end:/___/1" | sed "s/},sequence_name/\tsequence_name/1" | sed 's/},/\t/1' | sed 's/,query:/\t/1;s/},total_count:/\t/g' | awk -F "\t" '{print $2"\t"$(NF-1)"\t"$4"\t"$1"\t"$3"\t"$5"\t"$6"\t"$7"\t"$8"\t"$9"\t"$10}' | sed 's/description://g;s/symbol://g;s/gene_id://g;s/protein_count://g;s/tax_id://g;s/taxname://g;s/cds://g;s/genomic_accession_version://g;s/sequence_name://g;s/\t\t/\t/g' | sed "s/ /________/g" | csvtk add-header -t -n gene_id,pep_id,symbol,description,Alternative_splicing_count,tax_id,species,cds_infomation,chromosome_information,chromosome_information2 | sed 's/________/ /g' > All.tsv 2 | -------------------------------------------------------------------------------- /genome/pseudogenes/step1.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | set +o posix 3 | set -eo pipefail 4 | 5 | protein=use.pep 6 | cds=use.cds 7 | genome=HCS.fa 8 | 
output_prefix=hcs
9 | cpu=48
10 | 
11 | ## find candidate regions
12 | miniprot -t ${cpu} --gff -P ${output_prefix} ${genome} ${protein} --outs=0.6 > miniprot.gff
13 | 
14 | ## make bed of candidate regions, flanked by 100k
15 | samtools faidx ${genome}
16 | cut -f 1,2 ${genome}.fai | sort -k 1,1 > ${genome}.genome.size
17 | awk '$3=="mRNA"' miniprot.gff | cut -f 1,4,5 | sort -k 1,1 -k 2n,2 | bedtools flank -i - -g ${genome}.genome.size -b 100000 | sort -k 1,1 -k 2n,2 | bedtools merge -i - | sort -k 1,1 -k 2n,2 | bedtools complement -i - -g ${genome}.genome.size | sort -k 1,1 -k 2n,2 > miniprot.flank_100k.complement.bed
18 | 
19 | ## mask genome
20 | bedtools maskfasta -fi ${genome} -fo ${genome}.mask.fa -bed miniprot.flank_100k.complement.bed
21 | 
22 | ## split genome
23 | cut -f 1 miniprot.gff | grep -v "#" | awk '!a[$0]++' | seqtk subseq ${genome}.mask.fa - | seqkit seq -w 60 > ${genome}.mask.for_anno.fa
24 | rm ${genome}.fai ${genome}.mask.fa
25 | fastaDeal.pl --cuts 1 ${genome}.mask.for_anno.fa
26 | ls ${genome}.mask.for_anno.fa.cut/ > chr.id
27 | 
28 | ## remove tmp (the .fai and .mask.fa were already removed above)
29 | rm miniprot.flank_100k.complement.bed ${genome}.genome.size ${genome}.mask.for_anno.fa
30 | pigz --best miniprot.gff
31 | 
-------------------------------------------------------------------------------- /genome/Anno_integrate/annotCompare.config: --------------------------------------------------------------------------------
1 | 
2 | ## templated variables to be replaced exist as <__var_name__>
3 | 
4 | # Pathname of an SQLite database
5 | # If the environment variable DSN_DRIVER=mysql then it is the name of a MySQL database
6 | DATABASE=/workdir/new.db
7 | 
8 | #######################################################
9 | # Parameters to specify to specific scripts in pipeline
10 | # create a key = "script_name" + ":" + "parameter"
11 | # assign a value as done above.
12 | 
13 | 
14 | #script cDNA_annotation_comparer.dbi
15 | cDNA_annotation_comparer.dbi:--MIN_PERCENT_OVERLAP=50
16 | cDNA_annotation_comparer.dbi:--MIN_PERCENT_PROT_CODING=30
17 | cDNA_annotation_comparer.dbi:--MIN_PERID_PROT_COMPARE=60
18 | cDNA_annotation_comparer.dbi:--MIN_PERCENT_LENGTH_FL_COMPARE=60
19 | cDNA_annotation_comparer.dbi:--MIN_PERCENT_LENGTH_NONFL_COMPARE=60
20 | cDNA_annotation_comparer.dbi:--MIN_FL_ORF_SIZE=<__MIN_FL_ORF_SIZE__>
21 | cDNA_annotation_comparer.dbi:--MIN_PERCENT_ALIGN_LENGTH=50
22 | cDNA_annotation_comparer.dbi:--MIN_PERCENT_OVERLAP_GENE_REPLACE=70
23 | cDNA_annotation_comparer.dbi:--STOMP_HIGH_PERCENTAGE_OVERLAPPING_GENE=<__STOMP_HIGH_PERCENTAGE_OVERLAPPING_GENE__>
24 | cDNA_annotation_comparer.dbi:--TRUST_FL_STATUS=<__TRUST_FL_STATUS__>
25 | cDNA_annotation_comparer.dbi:--MAX_UTR_EXONS=<__MAX_UTR_EXONS__>
26 | cDNA_annotation_comparer.dbi:--GENETIC_CODE=<__GENETIC_CODE__>
27 | 
-------------------------------------------------------------------------------- /genome/assess/CRAQ.sh: --------------------------------------------------------------------------------
1 | ### https://github.com/JiaoLaboratory/CRAQ.git
2 | ### https://mp.weixin.qq.com/s/Qqj6AlgyImW9U9tTKF-3Cw
3 | 
4 | ### AQI > 90, reference quality; AQI from 80-90, high quality; AQI from 60-80, draft quality; and AQI < 60, low quality.
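### Descriptive note: CRAQ reports two indicators under ${outdir}/runAQI_out, R-AQI (regional, from small clip-based errors, CREs) and S-AQI (structural, from CSEs); the quality bands above apply to both.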
5 | 
6 | genome=YZ.asm.hic.p_ctg.fasta
7 | sms_fq=yz.fastq.gz
8 | ngs_fq=YZ-C-3_R1.fq.gz,YZ-C-3_R2.fq.gz
9 | cpu=20
10 | outdir=Result_CRAQ
11 | 
12 | /01_soft/CRAQ/bin/craq -g ${genome} -sms ${sms_fq} -ngs ${ngs_fq} -x map-hifi --plot T --break T --thread ${cpu} --output_dir ${outdir}
13 | 
14 | ### Get user-specified regional (i.e. window=50000) AQI score.
15 | cat ${outdir}/runAQI_out/strER_out/out_final.CSE.bed ${outdir}/runAQI_out/locER_out/out_final.CRE.bed > ${outdir}/runAQI_out/CRE_CSE.bed
16 | window=50000
17 | perl /01_soft/CRAQ/src/regional_AQI.pl ${outdir}/seq.size ${window} ${window} ${outdir}/runAQI_out/CRE_CSE.bed > ${outdir}/runAQI_out/plot_AQI.out
18 | ### plot; presenting only the scaffold ids you want is ok, see CRAQcircos.py --help
19 | python /01_soft/CRAQ/src/CRAQcircos.py --genome_size ${outdir}/seq.size --genome_error_loc ${outdir}/runAQI_out/CRE_CSE.bed --genome_score ${outdir}/runAQI_out/plot_AQI.out --output ${outdir}/runAQI_out/plot_AQI.out.pdf
20 | 
21 | ### !!!! remove intermediate files !!!!
22 | # rm -rf ${outdir}/SRout/*sort* ${outdir}/SRout/*tmp ${outdir}/SRout/Nonmap.loc ${outdir}/LRout/*sort* ${outdir}/LRout/Nonmap.loc
23 | 
-------------------------------------------------------------------------------- /other/outlier.py: --------------------------------------------------------------------------------
1 | import sys
2 | import pandas as pd
3 | 
4 | def remove_outliers_std(data_series, std_multiplier=3):
5 |     """
6 |     Remove outliers using the standard-deviation rule.
7 |     :param data_series: pandas.Series of numeric values.
8 |     :param std_multiplier: multiple of the standard deviation used as the outlier threshold, default 3.
9 |     :return: Series with outliers removed.
10 |     """
11 |     mean = data_series.mean()
12 |     std_dev = data_series.std()
13 |     lower_bound = mean - std_multiplier * std_dev
14 |     upper_bound = mean + std_multiplier * std_dev
15 |     return data_series[(data_series >= lower_bound) & (data_series <= upper_bound)]
16 | 
17 | def main(input_file, output_file):
18 |     # read the data file
19 |     data = pd.read_csv(input_file, sep='\t')
20 | 
21 |     # column 1 holds sample names, column 2 the values
22 |     sample_names = data.iloc[:, 0]
23 |     values = data.iloc[:, 1]
24 | 
25 |     # drop outliers
26 |     cleaned_values = remove_outliers_std(values)
27 | 
28 |     # collect the results in a new DataFrame
29 |     cleaned_data = pd.DataFrame({'Sample': sample_names, 'Values': cleaned_values})
30 | 
31 |     # write the results to a new file
32 |     cleaned_data.to_csv(output_file, sep='\t', index=False)
33 |     print(f"Outliers removed; results saved to {output_file}")
34 | 
35 | if __name__ == "__main__":
36 |     if len(sys.argv) != 3:
37 |         print("Usage: python script.py <input_file> <output_file>")
38 |         sys.exit(1)
39 | 
40 |     input_file_path = sys.argv[1]
41 |     output_file_path = sys.argv[2]
42 | 
43 |     main(input_file_path, output_file_path)
44 | 
-------------------------------------------------------------------------------- /genome/Hic/haphic/haphic.sh: --------------------------------------------------------------------------------
1 | ### https://github.com/zengxiaofei/HapHiC
2 | 
3 | micromamba activate haphic
4 | 
5 | export PATH="/01_soft/samblaster/:$PATH"
6 | export PATH="/01_soft/HapHiC/:$PATH"
7 | export PATH="/01_soft/HapHiC/utils/:$PATH"
8 | 
9 | genome=$PWD/YZ.keep.fa
10 | hic1=$PWD/YZ_clean.R1.fq.gz
11 | hic2=$PWD/YZ_clean.R2.fq.gz
12 | cpu=20
13 | chromosome_num=23
14 | 
15 | #bwa index ${genome}
16 | bwa mem -5SP -t ${cpu} ${genome} ${hic1} ${hic2} | samblaster | samtools view - -@ ${cpu} -S -h -b -F 3340 -o HiC.bam
17 | filter_bam HiC.bam 1 --nm 3 --threads ${cpu} | samtools view - -b -@ ${cpu} -o HiC.filtered.bam
18 | 
19 | # pipeline (one-shot)
20 | # haphic pipeline ${genome} HiC.filtered.bam ${chromosome_num} --max_inflation 10 --threads ${cpu}
21 | 
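## Note: the commented one-shot "haphic pipeline" call above runs the same stages as steps 1-4 below (cluster, reassign, sort, build).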
--------------------------------------------------------------------------------
/genome/Hic/haphic/haphic.sh:
--------------------------------------------------------------------------------
### https://github.com/zengxiaofei/HapHiC

micromamba activate haphic

export PATH="/01_soft/samblaster/:$PATH"
export PATH="/01_soft/HapHiC/:$PATH"
export PATH="/01_soft/HapHiC/utils/:$PATH"

genome=$PWD/YZ.keep.fa
hic1=$PWD/YZ_clean.R1.fq.gz
hic2=$PWD/YZ_clean.R2.fq.gz
cpu=20
chromosome_num=23

#bwa index ${genome}
bwa mem -5SP -t ${cpu} ${genome} ${hic1} ${hic2} | samblaster | samtools view - -@ ${cpu} -S -h -b -F 3340 -o HiC.bam
filter_bam HiC.bam 1 --nm 3 --threads ${cpu} | samtools view - -b -@ ${cpu} -o HiC.filtered.bam

# pipeline (one-shot alternative to steps 1-4 below)
# haphic pipeline ${genome} HiC.filtered.bam ${chromosome_num} --max_inflation 10 --threads ${cpu}

# step1
haphic cluster --threads ${cpu} --max_inflation 10 ${genome} HiC.filtered.bam ${chromosome_num}

# step2
x=`grep "recommend_inflation" HapHiC_cluster.log | awk -F "inflation from" '{print $2}' | awk -F " " '{print $1}'`
haphic reassign --nclusters ${chromosome_num} --threads ${cpu} ${genome} full_links.pkl inflation_$x/mcl_inflation_$x.clusters.txt paired_links.clm

# step3
haphic sort ${genome} HT_links.pkl split_clms final_groups/group*.txt --processes ${cpu}

# step4
haphic build ${genome} ${genome} HiC.filtered.bam final_tours/group*.tour

# plot
haphic plot scaffolds.raw.agp HiC.filtered.bam
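## Note (sketch): step2 depends on the inflation value parsed from the step1
## log; if the grep matches nothing, $x is empty and `haphic reassign` fails
## with a confusing path error. A cheap guard right after the x= line:
##   [ -n "$x" ] || { echo "no recommend_inflation in HapHiC_cluster.log" >&2; exit 1; }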
--------------------------------------------------------------------------------
/genome/puerge_halp/kmerdup/step3_filter.sh:
--------------------------------------------------------------------------------
export PATH="/01_soft/kmerDedup/:$PATH"

prefix=YZ
mer_len=19
work_dir=$PWD
bam_dir=${work_dir}/mapping
cpu=30

## collect bowtie2 alignment rates from the per-split job stderr logs (grep "" prefixes each line with its file name)
grep "" shell/*sh.e* | sed 's/:/\t/g;s/\//\t/g;s/.fa.sh./\t/g' | cut -f 2,4 > alignment_rate.txt
ls ${bam_dir}/*.bam > bam.list
samtools merge -@ ${cpu} -n -f -b bam.list ${prefix}.keymer_map.bam
BamDeal statistics Coverage -i ${prefix}.keymer_map.bam -r ${prefix}.format.fa -q 0 -o ${prefix}.cov

## -mpr max duplication percentage [0.3]
## -mcv min k-mer coverage(%) [30]
## -mode <1/2> 1:ratio only; 2:ratio * cov [2]

perl /01_soft/kmerDedup/kmerDedup/kmerDedup.pl -k ${prefix} -mpr 0.3 -mcv 30 -kmer ${mer_len} -o kmerdedup_mpr3 -f ${prefix}.format.fa -bam ${prefix}.keymer_map.bam -cov ${prefix}.cov.stat -s samtools -t ${cpu} -mode 2
cp kmerdedup_mpr3/${prefix}.dump.hash ./

## plot
sed '1d' kmerdedup_mpr3/${prefix}.all.xls | cut -f 1,2,7,8 | sort -k 3nr,3 | awk '{print NR","$3","$1","$2","$4}' | csvtk add-header -n num,cov,name,length,state > ${prefix}.all.deal.csv
csvtk plot line --height 6 --width 20 -x 1 -y 2 ${prefix}.all.deal.csv > ${prefix}.all.deal.csv.png
csvtk pretty ${prefix}.all.deal.csv > ${prefix}.all.deal.pretty.csv

### remove tmp
#rm -rf ${prefix}.count.jf ${prefix}.dump.all ${prefix}.filt.fa split mapping shell *bt2 ${prefix}.keymer_map.bam
--------------------------------------------------------------------------------
/other/check_pid_info.sh:
--------------------------------------------------------------------------------
#!/bin/bash
####################################################

read -p "Enter the process name or PID to query: " NAME_PID

# make sure something was entered
if [ -z "${NAME_PID}" ]; then
    echo "Please enter a valid process name or PID!!"
    exit 1
fi

# if the input is numeric treat it as a PID, otherwise as a process name
if [[ "${NAME_PID}" =~ ^[0-9]+$ ]]; then
    PIDS=(${NAME_PID})
else
    PIDS=($(pgrep -x "${NAME_PID}"))
    if [ -z "$PIDS" ]; then
        echo "No such process name!!"
        exit 1
    fi
fi

for PID in "${PIDS[@]}"; do
    # skip PIDs that no longer exist
    if ! ps -p $PID &> /dev/null ; then
        echo "This PID does not exist!!"
        continue
    fi

    # fetch and display basic information about the process
    echo "------------------------------------------------"
    printf "%-20s %s\n"      "PID:"                  "$PID"
    printf "%-20s %s\n"      "Command:"              "$(ps -p $PID -o cmd=)"
    printf "%-20s %s%%\n"    "CPU usage:"            "$(ps -p $PID -o %cpu=)"
    printf "%-20s %s%%\n"    "Memory usage:"         "$(ps -p $PID -o %mem=)"
    printf "%-20s %s\n"      "Owner:"                "$(ps -p $PID -o user=)"
    printf "%-20s %s\n"      "State:"                "$(ps -p $PID -o stat=)"
    printf "%-20s %.2f MB\n" "Virtual memory:"       "$(echo "$(ps -p $PID -o vsz=) / 1024" | bc -l)"
    printf "%-20s %.2f MB\n" "Resident memory (RSS):" "$(echo "$(ps -p $PID -o rss=) / 1024" | bc -l)"
    printf "%-20s %s\n"      "Elapsed run time:"     "$(ps -p $PID -o etime=)"
    printf "%-20s %s\n"      "Start time:"           "$(ps -p $PID -o lstart=)"
    echo "------------------------------------------------"
done
--------------------------------------------------------------------------------
/genome/Anno_EGAPx/fix_phase.py:
--------------------------------------------------------------------------------
#!/dellfsqd2/ST_OCEAN/USER/lishuo11/01_soft/mambaforge/bin/python3

import sys
gff = sys.argv[1]

with open(gff,'r') as infile:
    with open('mrna_'+gff,'w') as outfile:
        ls_gene = []
        for line in infile:
            chrom = line.split('\t')[0]
            sam = line.split('\t')[1]
            id = line.split('\t')[5]
            info = line.split('\t')[8]
            phase = line.split('\t')[7]
            region = line.split('\t')[2]
            pos_ne = line.split('\t')[6]
            start = line.split('\t')[3]
            end = line.split('\t')[4]
            if region == 'mRNA':
                min = 'NA'
                max = 'NA'
                for ls in ls_gene:
                    if ls[2] == 'CDS':
                        if min == 'NA' or int(ls[3]) < int(min):
                            min = ls[3]
                        if max == 'NA' or int(ls[4]) > int(max):
                            max = ls[4]
                for ls in ls_gene:
                    if ls[2] == 'mRNA':
                        ls[3] = min
                        ls[4] = max
                    outfile.write('\t'.join(ls))
                ls_gene = []
            ls = [chrom, sam, region, start, end, id, pos_ne, phase, info]
            ls_gene.append(ls)

        # flush the last mRNA block
        min = 'NA'
        max = 'NA'
        for ls in ls_gene:
            if ls[2] == 'CDS':
                if min == 'NA' or int(ls[3]) < int(min):
                    min = ls[3]
                if max == 'NA' or int(ls[4]) > int(max):
                    max = ls[4]
        for ls in ls_gene:
            if ls[2] == 'mRNA':
                ls[3] = min
                ls[4] = max
            outfile.write('\t'.join(ls))
outfile.close()
infile.close()
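# What this script does: records are read in blocks of one mRNA line followed
# by its CDS lines, and each mRNA's start/end is reset to the min/max
# coordinates of its own CDS features. It expects the 9-column layout produced
# by the companion deal script (feature ID sitting in column 6), not plain GFF3.
# Usage sketch:  python fix_phase.py in.gff   # writes mrna_in.gff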
--------------------------------------------------------------------------------
/deal_gff/pick_longest_gene/fix_phase.py:
--------------------------------------------------------------------------------
#!/dellfsqd2/ST_OCEAN/USER/lishuo11/01_soft/mambaforge/bin/python3

import sys
gff = sys.argv[1]

with open(gff,'r') as infile:
    with open('mrna_'+gff,'w') as outfile:
        ls_gene = []
        for line in infile:
            chrom = line.split('\t')[0]
            sam = line.split('\t')[1]
            id = line.split('\t')[5]
            info = line.split('\t')[8]
            phase = line.split('\t')[7]
            region = line.split('\t')[2]
            pos_ne = line.split('\t')[6]
            start = line.split('\t')[3]
            end = line.split('\t')[4]
            if region == 'mRNA':
                min = 'NA'
                max = 'NA'
                for ls in ls_gene:
                    if ls[2] == 'CDS':
                        if min == 'NA' or int(ls[3]) < int(min):
                            min = ls[3]
                        if max == 'NA' or int(ls[4]) > int(max):
                            max = ls[4]
                for ls in ls_gene:
                    if ls[2] == 'mRNA':
                        ls[3] = min
                        ls[4] = max
                    outfile.write('\t'.join(ls))
                ls_gene = []
            ls = [chrom, sam, region, start, end, id, pos_ne, phase, info]
            ls_gene.append(ls)

        # flush the last mRNA block
        min = 'NA'
        max = 'NA'
        for ls in ls_gene:
            if ls[2] == 'CDS':
                if min == 'NA' or int(ls[3]) < int(min):
                    min = ls[3]
                if max == 'NA' or int(ls[4]) > int(max):
                    max = ls[4]
        for ls in ls_gene:
            if ls[2] == 'mRNA':
                ls[3] = min
                ls[4] = max
            outfile.write('\t'.join(ls))
outfile.close()
infile.close()
--------------------------------------------------------------------------------
/genome/Hic/haphic/re_draw.sh:
--------------------------------------------------------------------------------
### https://github.com/zengxiaofei/HapHiC

micromamba activate haphic

export PATH="/01_soft/HapHiC/:$PATH"
export PATH="/01_soft/HapHiC/utils/:$PATH"

agp=chr.fa.agp
matrix=contact_matrix.pkl
cpu=30
bin_size=1000        ## bin size for generating contact matrix, default: 500 (kbp)
cmap=viridis         ## define the colormap for the heatmap, default: white,red. It can be any built-in sequential colormap from Matplotlib (refer to:
                     ## https://matplotlib.org/stable/users/explain/colors/colormaps.html). You can create a custom colormap by listing colors separated by commas
normalization=KR     ## method for matrix normalization, default: KR {KR,log10,none}
ncols=5              ## number of scaffolds per row in `separate_plots.pdf`, default: 5
origin=top_left      ## set the origin of each heatmap, default: bottom_left {bottom_left,top_left,bottom_right,top_right}
border_style=outline ## border style for scaffolds, default: grid {grid,outline}
figure_width=15      ## figure width, default: 15 (cm)
figure_height=12     ## figure height, default: 12 (cm)
# specified_scaffolds=
# min_len=
# vmax_coef=
# manual_vmax=
# separate_plots     ## generate `separate_plots.pdf`, depicting the heatmap for each scaffold individually, default: False

haphic plot ${agp} ${matrix} --bin_size ${bin_size} --cmap ${cmap} --normalization ${normalization} --ncols ${ncols} --origin ${origin} --border_style ${border_style} --figure_width ${figure_width} --figure_height ${figure_height} --threads ${cpu}
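## Note (sketch): contact_matrix.pkl is written by a previous `haphic plot`
## run that was fed the filtered BAM (see draw.sh below); reusing the pickle
## here lets you tweak colormap/bin size without re-reading the BAM.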
--------------------------------------------------------------------------------
/genome/Hic/haphic/draw.sh:
--------------------------------------------------------------------------------
### https://github.com/zengxiaofei/HapHiC

micromamba activate haphic

export PATH="/01_soft/HapHiC/:$PATH"
export PATH="/01_soft/HapHiC/utils/:$PATH"

agp=chr.fa.agp
bam=HiC.filtered.bam
cpu=30
bin_size=1000        ## bin size for generating contact matrix, default: 500 (kbp)
cmap=viridis         ## define the colormap for the heatmap, default: white,red. It can be any built-in sequential colormap from Matplotlib (refer to:
                     ## https://matplotlib.org/stable/users/explain/colors/colormaps.html). You can create a custom colormap by listing colors separated by commas
normalization=KR     ## method for matrix normalization, default: KR {KR,log10,none}
ncols=5              ## number of scaffolds per row in `separate_plots.pdf`, default: 5
origin=top_right     ## set the origin of each heatmap, default: bottom_left {bottom_left,top_left,bottom_right,top_right}
border_style=outline ## border style for scaffolds, default: grid {grid,outline}
figure_width=15      ## figure width, default: 15 (cm)
figure_height=12     ## figure height, default: 12 (cm)
# specified_scaffolds=
# min_len=
# vmax_coef=
# manual_vmax=
# separate_plots     ## generate `separate_plots.pdf`, depicting the heatmap for each scaffold individually, default: False

haphic plot ${agp} ${bam} --bin_size ${bin_size} --cmap ${cmap} --normalization ${normalization} --ncols ${ncols} --origin ${origin} --border_style ${border_style} --figure_width ${figure_width} --figure_height ${figure_height} --threads ${cpu}

## rm *bam
--------------------------------------------------------------------------------
/Comparative_genomics/gene_family/06_rna_seq.sh:
--------------------------------------------------------------------------------
seqkit shuffle ./00_data/cds.fa -o ./00_data/cds.fa.shuffle
OLDIFS="$IFS"
IFS=$'\n'
for line in $(cat ./00_data/rna_data_list); do
    IFS="$OLDIFS"
    read -r name rna_seq_data1 rna_seq_data2 <<< "$line"
    fastp -i ./00_data/"$rna_seq_data1" -o ./3_rna_seq/01_fastp/"$rna_seq_data1" -I ./00_data/"$rna_seq_data2" -O ./3_rna_seq/01_fastp/"$rna_seq_data2"
    salmon index -t ./00_data/cds.fa.shuffle -i ./3_rna_seq/02_salmon/cds.fa.index -p 20 -k 31
    salmon quant -i ./3_rna_seq/02_salmon/cds.fa.index --validateMappings -l A -p 8 -1 ./3_rna_seq/01_fastp/"$rna_seq_data1" -2 ./3_rna_seq/01_fastp/"$rna_seq_data2" -o ./3_rna_seq/02_salmon/"$rna_seq_data1".quant
    grep -f ./1_identify_gene_family/04_final_gene_family/final_gene_id ./3_rna_seq/02_salmon/"$rna_seq_data1".quant/quant.sf | sort > ./3_rna_seq/03_gene_family.quant/"$rna_seq_data1".quant
    echo "$name" > ./3_rna_seq/04_visualization/"$name".sf.TPM ; awk '{print $4}' ./3_rna_seq/03_gene_family.quant/"$rna_seq_data1".quant >> ./3_rna_seq/04_visualization/"$name".sf.TPM
    IFS=$'\n'
done
IFS="$OLDIFS"
echo 'gene name' > ./3_rna_seq/04_visualization/gene_name ; awk '{print $1}' ./1_identify_gene_family/04_final_gene_family/final_gene_id >> ./3_rna_seq/04_visualization/gene_name
find ./3_rna_seq/04_visualization/ -name '*.sf.TPM' | paste -sd ' ' > ./3_rna_seq/04_visualization/TPM_file_path
paste ./3_rna_seq/04_visualization/gene_name $(cat ./3_rna_seq/04_visualization/TPM_file_path) > ./3_rna_seq/04_visualization/final_matrix
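# Note: the salmon index is rebuilt from the same cds.fa.shuffle on every
# loop iteration; hoisting the `salmon index` line above the for loop gives
# the same index once and saves time on multi-sample runs.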
--------------------------------------------------------------------------------
/genome/evaluate_genome_size/evaluate_genome_size.sh:
--------------------------------------------------------------------------------
#!/bin/sh
set +o posix

prefix=yu
NGS_fq1=ABCD_1.clean.fq.gz
NGS_fq2=ABCD_2.clean.fq.gz
mer_len=19
cpu=30
ploidy=2
read_len=150

## count k-mers
jellyfish count -m ${mer_len} -s 1000000000 -t ${cpu} -C -o ${prefix}.count.jf <(pigz -p 5 -d -c ${NGS_fq1}) <(pigz -p 5 -d -c ${NGS_fq2})
#jellyfish count -m ${mer_len} -s 1000000000 -t ${cpu} -C -o ${prefix}.count *.fq

## if jellyfish gives you more than one count file, you need to merge them first
# jellyfish merge -v -o ${prefix}.count.jf ${prefix}.count_*

jellyfish stats -o ${prefix}.stats ${prefix}.count.jf
jellyfish histo -t ${cpu} ${prefix}.count.jf > ${prefix}.histo

<<another_soft2
## alternative estimators (kept as a commented-out heredoc); convert the
## histogram to TSV first, e.g.: awk '{print $1"\t"$2}' ${prefix}.histo > ${prefix}.histo.tsv
Genomeye -k ${mer_len} ${prefix}.histo.tsv > Genomeye.result
## check kmernum: cat ${prefix}.stats
## kmernum: 118233500882 expected_depth_for_unique_kmer: 76
gce -g 118233500882 -f ${prefix}.histo.tsv -m 1 -D 1 -c 76 >O.gce.table 2>O.gce.log
tail O.gce.log
another_soft2
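## Quick arithmetic from the numbers quoted above (a sketch, not part of the
## original pipeline): genome size ~= total_kmer_num / peak_kmer_depth
echo "118233500882 76" | awk '{printf "genome size ~ %.2f Gb\n", $1/$2/1e9}'   # ~1.56 Gb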
--------------------------------------------------------------------------------
/deal_fasta/fa2phy/fa2phy.py:
--------------------------------------------------------------------------------
#!/usr/bin/env python2
import sys
# usage
USAGE = "\nusage: python2 fa2phy.py [input fasta file] [output phy file]\n"
def parseFasta(filename):
    fas = {}
    id = None
    with open(filename, 'r') as fh:
        for line in fh:
            if line[0] == '>':
                header = line[1:].rstrip()
                id = header.split()[0]
                fas[id] = []
            else:
                fas[id].append(line.rstrip())
    for id, seq in fas.iteritems():
        fas[id] = ''.join(seq)
    return fas
if len(sys.argv) != 3:
    print USAGE
    sys.exit()
fas = parseFasta(sys.argv[1])
outfile = sys.argv[2]
sequence_list = []  # to keep the input order of sequences
sequence_dict = {}
for rec in fas:
    sequence_list.append(rec)
    sequence_dict[rec] = fas[rec]
# check the length of the alignment:
alignment_length = 0
for gene in sequence_dict:
    if (alignment_length != 0) and (len(sequence_dict[gene]) != alignment_length):
        print "Error in alignment length, exit on error !!!"
        sys.exit()
    else:
        alignment_length = len(sequence_dict[gene])
number_of_seq = len(sequence_dict)
longest_id = sorted(sequence_dict.keys(), key = lambda k: len(k))[-1]
# write the alignment in Phylip format
phyfile = open(outfile, "w")
phyfile.write(str(number_of_seq)+" "+str(alignment_length)+"\n")
for gene in sequence_list:
    phyfile.write(gene.ljust(len(longest_id), ' ') + " " + sequence_dict[gene] + "\n")
phyfile.close()
--------------------------------------------------------------------------------
/genome/assess/GCI_pb.sh:
--------------------------------------------------------------------------------
##https://github.com/yeeus/GCI

genome=curated.fasta
threads=70
pb_ccs_fq=ccs.fq.gz
kmer=17
## NOTE: mapquik_density and mapquik_lmer are referenced by the mapquik step
## below but are never set in this script; define them (mapquik -d / -l)
## before using that step.

## Map HiFi and/or ONT reads to the assembly with minimap2
minimap2 -t $threads -ax map-hifi $genome $pb_ccs_fq > align.sam ## map ONT reads with -ax map-ont
samtools view -@ $threads -Sb align.sam | samtools sort -@ $threads -o align.bam
samtools index align.bam
rm align.sam

## winnowmap
/01_soft/meryl-1.4.1/bin/meryl count k=${kmer} output merylDB ${genome}
/01_soft/meryl-1.4.1/bin/meryl print greater-than distinct=0.9998 merylDB > mat_repetitive_k${kmer}.txt
/01_soft/Winnowmap/bin/winnowmap -k ${kmer} -W mat_repetitive_k${kmer}.txt -ax map-pb $genome $pb_ccs_fq > align2.sam
samtools view -@ $threads -Sb align2.sam | samtools sort -@ $threads -o align2.bam
samtools index align2.bam
rm align2.sam

## mapquik
/usr/bin/singularity exec --bind $PWD:$PWD mapquik.sif mapquik --low-memory --parallelfastx --threads ${threads} -p mapquik -k ${kmer} -d ${mapquik_density} -l ${mapquik_lmer} ${pb_ccs_fq} --reference ${genome}

## veritymap
python /01_soft/VerityMap/veritymap/main.py --reads ${pb_ccs_fq} -d hifi-diploid -o veritymap ${genome} -t ${threads}

# We recommend inputting one bam and one paf file produced by two different tools (for example, one bam file from winnowmap and one paf file from minimap2)
# PDF is recommended because a PNG file may lose some detail, although GCI outputs png files by default

## !!!! select by yourself !!!!
/01_soft/GCI/GCI.py -r ${genome} --hifi align.bam align2.bam -t $threads -p -it pdf
--------------------------------------------------------------------------------
/genome/TE/HiTEv3.2.sh:
--------------------------------------------------------------------------------
## https://github.com/CSU-KangHu/HiTE#cmd

genome=$PWD/Anneissia_japonica.fa ### absolute path
out_dir=$PWD/Hite_out             ### absolute path
cpu=10
curated_lib=/06_database/fish_te/animal_fish.rmdup.lib ## Provide a fully trusted curated library, which will be used to pre-mask highly homologous sequences in the genome.
isplant=1         ## Is it a plant genome, 1: true, 0: false.
isremove_nested=1 ## Whether to remove nested TEs, 1: true, 0: false.
isrecover=1       ## Whether to enable recovery mode to avoid starting from the beginning, 1: true, 0: false.
isdomain=1        ## Whether to obtain TE domains; HiTE uses RepeatPeps.lib from RepeatMasker to obtain TE domains, 1: true, 0: false.
isannotate=1      ## Whether to annotate the genome using the TE library generated, 1: true, 0: false.
isintact_anno=1   ## Whether to generate annotation of full-length TEs, 1: true, 0: false.
isBM_RM2=1        ## Whether to conduct benchmarking of RepeatModeler2, 1: true, 0: false.
isBM_HiTE=1       ## Whether to conduct benchmarking of HiTE, 1: true, 0: false.

## for help
# /usr/bin/singularity exec /01_soft/singularity_all/HiTE.sif python /01_soft/HiTEv3.2/main.py -h

/usr/bin/singularity exec /01_soft/singularity_all/HiTE.sif \
    python /01_soft/HiTEv3.2/main.py \
    --genome ${genome} \
    --thread ${cpu} \
    --outdir ${out_dir} \
    --chunk_size 200 \
    --plant ${isplant} \
    --remove_nested ${isremove_nested} \
    --domain ${isdomain} \
    --recover ${isrecover} \
    --annotate ${isannotate} \
    --intact_anno ${isintact_anno} \
    --BM_RM2 ${isBM_RM2} \
    --BM_HiTE ${isBM_HiTE} \
    --curated_lib ${curated_lib}
--------------------------------------------------------------------------------
/genome/puerge_halp/kmerdup/step1_prepare.sh:
--------------------------------------------------------------------------------
export PATH="/01_soft/kmerDedup/:$PATH"

prefix=YZ
genome=curated.fasta
mer_len=19
cpu=70
counter_len=8
NGS_fq1=YZ_clean.R1.fq.gz
NGS_fq2=YZ_clean.R2.fq.gz
ccs_fa=yz.fasta.gz

## step1 count k-mers
jellyfish count -m ${mer_len} -s 100M -t ${cpu} -c ${counter_len} -C -o ${prefix}.count <(pigz -p 5 -d -c ${NGS_fq1}) <(pigz -p 5 -d -c ${NGS_fq2}) <(zcat ${ccs_fa})

## if jellyfish gives you more than one count file, you need to merge them first
# jellyfish merge -v -o ${prefix}.count.jf ${prefix}.count_*

mv ${prefix}.count ${prefix}.count.jf

## step2 stat and histo (can be skipped)
jellyfish stats -o ${prefix}.stats ${prefix}.count.jf
jellyfish histo -t ${cpu} ${prefix}.count.jf | perl -lane 'my ($dpt, $cnt) = split(/\s+/, $_); my $nn = $dpt * $cnt;print "$dpt\t$cnt\t$nn"' > ${prefix}.histo

## plot; currently not working, known problem
# Rscript /01_software/genomescope/genomescope2.0/genomescope.R -i ${prefix}.histo -o ${prefix} -k ${mer_len} -n ${prefix}.model_2

## step3 dump k-mers
jellyfish dump -c -t -o ${prefix}.dump.all ${prefix}.count.jf
perl /01_soft/kmerDedup/kmerDedup/kmerFilter.pl -d ${prefix}.dump.all -o ${prefix}.filt.fa -l 3 -u 100000
perl /01_soft/kmerDedup/kmerDedup/splitFasta.pl -f ${prefix}.filt.fa -o split -k ${prefix}.kmer

## step4 map k-mers
perl /01_soft/kmerDedup/kmerDedup/fa2fa.pl -f ${genome} -o ${prefix}.format.fa -c F -n F -l 1000

bowtie2-build --threads ${cpu} ${prefix}.format.fa ${prefix}.format
ls split/ > split.id
[ -d shell ] || mkdir shell

for i in $(cat split.id); do sed "s/ABCD/${i}/g" bowtie2_demo.sh > shell/bowtie2_${i}.sh; done
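# bowtie2_demo.sh is a template job script: the literal placeholder ABCD is
# substituted with each split k-mer file name, producing one mapping shell per
# split under shell/ (their stderr logs feed step3's alignment_rate.txt).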
--------------------------------------------------------------------------------
/Comparative_genomics/gene_cluster/Galeon.sh:
--------------------------------------------------------------------------------
#!/bin/bash
## https://github.com/molevol-ub/galeon
# micromamba activate Galeon
source activate /micromamba/envs/Galeon

export PATH=/01_soft/galeon/GALEON_masterScripts:$PATH
export PATH=/mambaforge/envs/R-4.2/bin/:$PATH

genome=fcs.fa
EVM=fcs.EVM.bgi.filter.gff
ID=dom.select.id
famliy_name=FRED
GVALUE=100 ## Galeon estimates the expected number of genes per stretch of bases and, from the -g input value, the probability of finding 2 or more genes within a window of GVALUE size (i.e. GVALUE kb); genes co-occurring in such windows are treated as one cluster in the following analyses.
cpu=48

fishInWinter.pl ${ID} ${EVM} > ${ID}.gff
gffread ${ID}.gff -g ${genome} -x ${ID}.gff.cds -y ${ID}.gff.pep

[ -d GFFs ] || mkdir GFFs
[ -d Proteins ] || mkdir Proteins

awk '$3=="mRNA"' ${ID}.gff | cut -f 1,4,5,9 | sed 's/ID=//g;s/;//g' > GFFs/${famliy_name}_fam.bed2
sed "s/U$//g" ${ID}.gff.pep > Proteins/${famliy_name}_fam.fasta

mafft --auto --thread ${cpu} ${ID}.gff.pep > Proteins/${famliy_name}_fam.aln

GALEON_ControlScript.py clusterfinder -a GFFs/ -p Proteins/ -e enabled -pm True -F WithinFamilies -g ${GVALUE} -emx_pos Lower -c two -t FastTree -f orange -outdir cluster_${famliy_name} -log Log_dir

GALEON_GetEvoStats.py -clust cluster_${famliy_name} -prot Proteins/ -coords GFFs

perl /dellfsqd2/ST_OCEAN/USER/lishuo11/01_soft/galeon/GALEON_masterScripts/Scripts/Get_scaffold_length.pl ${genome}

## -sfilter ALL|NUM|FILE The summary plots will represent the results for a "NUM" number of largest scaffolds; a list of scaffolds of interest provided as a single column in an input "FILE"; or "ALL" scaffolds (often too many, so the resulting summary plots may not be informative).

GALEON_SummaryFiles.py -fam ${famliy_name} -clust cluster_${famliy_name} -coords GFFs -ssize ChrSizes.txt -sfilter 10
--------------------------------------------------------------------------------
/genome/Anno_denovo/helixer.sh:
--------------------------------------------------------------------------------
## https://github.com/weberlab-hhu/Helixer

species=hcs
genome=$PWD/HCS_chr.fa
out_gff=$PWD/helixer.predict.gff
lineage=invertebrate # {vertebrate,land_plant,fungi,invertebrate}
model=/Helixer_models/models/invertebrate/invertebrate_v0.3_m_0100.h5
# fungi:        /Helixer_models/models/fungi/fungi_v0.3_a_0100.h5
# invertebrate: /Helixer_models/models/invertebrate/invertebrate_v0.3_m_0100.h5
# land_plant:   /Helixer_models/models/land_plant/land_plant_v0.3_a_0080.h5
# vertebrate:   /Helixer_models/models/vertebrate/vertebrate_v0.3_m_0080.h5
batch_size=8 # a larger batch_size needs more GPU memory; {5,6} should work on a single GPU card.
TMP=$PWD/TMP

### get help
# /usr/bin/singularity exec --bind $PWD/:$PWD/ helixer.sif Helixer.py -h

### download the best models to !!!!!! your homedir .local/share/Helixer !!!!!
#/usr/bin/singularity exec --bind $PWD/:$PWD/ helixer.sif fetch_helixer_models.py

### main script; "--nv" enables GPU support, CPU-only also works.
[ -d TMP ] || mkdir TMP
/usr/bin/singularity run --bind $PWD/:$PWD/ --nv sif/helixer.sif Helixer.py \
    --subsequence-length 213840 \
    --overlap-offset 106920 \
    --overlap-core-length 160380 \
    --batch-size ${batch_size} \
    --lineage ${lineage} \
    --temporary-dir ${TMP} \
    --species ${species} \
    --model-filepath ${model} \
    --fasta-path ${genome} \
    --gff-output-path ${out_gff}

grep -v "#" ${out_gff} | awk '$3=="mRNA" || $3=="CDS"' | awk -F "[;\t]" '{if ($3~/mRNA/) print $0"\t"$9";"; else print $0"\t"$10";" }' | cut -f 1-8,10 > tmp.gff
fix_mRNA_coordinate.pl tmp.gff helixer.bgi.gff
gffread helixer.bgi.gff -g ${genome} -x helixer.bgi.gff.cds -y helixer.bgi.gff.pep
rm -rf tmp.gff ${TMP}
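## Note on the window parameters used above (keep these ratios if you rescale):
##   overlap-offset      = subsequence-length / 2     (213840 / 2    = 106920)
##   overlap-core-length = 0.75 * subsequence-length  (213840 * 0.75 = 160380)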
--------------------------------------------------------------------------------
/picture/synteny_circos/circos_sys.sh:
--------------------------------------------------------------------------------
#!/bin/bash
set +o posix
set -eo pipefail

export PATH="/01_software/lastal/bin/:$PATH"
export PATH="/01_software/latex/bin/x86_64-linux/:$PATH"
python=/01_soft/mamba/env/jcvi/bin/python

## uses ${sp1}.bed ${sp2}.bed ${sp1}.pep ${sp2}.pep ${sp1}.genome.fa ${sp2}.genome.fa
sp1=O_curv
sp2=O_mela

### dotplot
${python} -m jcvi.compara.catalog ortholog --dbtype prot --cpus=1 --no_strip_names ${sp1} ${sp2}

### make .simple
${python} -m jcvi.compara.synteny screen --simple ${sp1}.${sp2}.anchors ${sp1}.${sp2}.anchors.new
simpletolink.py ${sp1}.${sp2}.anchors.simple

### uses ${sp1}.genome.fa ${sp2}.genome.fa
iTools Fatools stat -InPut ${sp1}.genome.fa -OutPut ${sp1}.genome.fa.chrlist
iTools Fatools stat -InPut ${sp2}.genome.fa -OutPut ${sp2}.genome.fa.chrlist

grep -v "#" ${sp1}.genome.fa.chrlist | cut -f 1,2 | fishInWinter.pl -bf table -ff table <( cut -f 1 ${sp1}.bed | awk '!a[$0]++' ) - | awk '{print "chr - " "'"$sp1"'" "_" $1 " " "'"$sp1"'" "_" $1 " 0 " $2 " " "chr"NR}' > karyotype_sp1.txt
grep -v "#" ${sp2}.genome.fa.chrlist | cut -f 1,2 | fishInWinter.pl -bf table -ff table <( cut -f 1 ${sp2}.bed | awk '!a[$0]++' ) - | awk '{print "chr - " "'"$sp2"'" "_" $1 " " "'"$sp2"'" "_" $1 " 0 " $2 " " "chr"NR}' > karyotype_sp2.txt

awk '{print "'"$sp1"'" "_" $1 "\t" $2 "\t" $3 "\t" "'"$sp2"'" "_" $4 "\t" $5 "\t" $6}' ${sp1}.${sp2}.anchors.simple_link.txt > anchors.simple_link.rename.txt

generate_circos_configs.py anchors.simple_link.rename.txt
cp circos_config/* .
/usr/bin/singularity exec --bind $PWD:$PWD /01_soft/singularity_all/circos.sif circos -conf circos_config_output/circos.conf

rm *.ssp *.tis *.sds *.des *.prj *.suf *.bck *.chrlist
--------------------------------------------------------------------------------
/genome/assess/compleasm.sh:
--------------------------------------------------------------------------------
#!/bin/bash
## https://github.com/huangnengCSU/compleasm

### Important parameters
input_genome=genome.fa
cpu=48
database_prefix=actinopterygii
min_bestScore=0.95 # output if score at least FLOAT*bestScore [0.95]
database_path=
compleasm=~/01_software/compleasm/compleasm.py

### Threshold parameters # Tip: thresholds that are too low may give falsely high results

min_diff=0.2           # The threshold for the best matching and second best matching. default=0.2
min_identity=0.6       # The identity threshold for valid mapping results. default=0.4
min_length_percent=0.6 # The fraction of protein for valid mapping results. default=0.6
min_complete=0.9       # The length threshold for complete genes. default=0.9

### other parameters
output_dir=00_assessment
mode=busco # lite or busco
# lite:  do not use hmmsearch to filter the protein alignments.
# busco: run hmmsearch on all candidate predicted proteins to purify the miniprot alignment and improve accuracy.

### CMD
/usr/bin/singularity exec compleasm.sif python ${compleasm} run -a ${input_genome} -o ${output_dir} -l ${database_prefix} -t ${cpu} -m ${mode} --outs ${min_bestScore} -L ${database_path} --min_diff ${min_diff} --min_identity ${min_identity} --min_length_percent ${min_length_percent} --min_complete ${min_complete}

rm -rf ${output_dir}/*_odb10/*.done ${output_dir}/*_odb10/hmmer_output
pigz --best ${output_dir}/*_odb10/miniprot_output.gff
pigz --best ${output_dir}/*_odb10/translated_protein.fasta
pigz --best ${output_dir}/*_odb10/gene_marker.fasta

### for more help
## /usr/bin/singularity exec compleasm.sif python ${compleasm} -h
--------------------------------------------------------------------------------
/deal_gff/pick_longest_gene/deal.sh:
--------------------------------------------------------------------------------
#!/bin/bash
set +o posix
set -eo pipefail

species=Branchiostoma_floridae
genome=GCF_000003815.2_Bfl_VNyyK_genomic.fna
pep=GCF_000003815.2_Bfl_VNyyK_protein.faa
gff=GCF_000003815.2_Bfl_VNyyK_genomic.gff

## keep only the longest transcript per gene (writes clean.gff)
pick_longest_ncbi.pl ${gff}

## build a gene/mRNA/protein annotation table from the cleaned GFF
awk -F "Dbxref" '{print $1}' clean.gff | sed 's/rna-//g;s/cds-//g;s/gene-//g' | awk '$3=="CDS"' | cut -f 9 | awk '!a[$0]++' | sed 's/ID=//g;s/;Parent=/\t/g;s/;//g' | awk -F "\t" 'NR==FNR{a[$1]=$2}NR!=FNR{print $0"\t"a[$1]}' <(grep '>' ${pep} | sed 's/>//g' | awk -F "[" '{print $1}' | sed 's/ $//g' | sed 's/ /\t/1') - | awk '{print $2"\t"$0}' | awk -F "\t" 'NR==FNR{a[$1]=$2}NR!=FNR{print $0"\t"a[$1]}' <(awk -F "Dbxref" '{print $1}' clean.gff | sed 's/rna-//g;s/cds-//g;s/gene-//g' | awk '$3=="mRNA"' | cut -f 9 | sed 's/ID=//g;s/;Parent=/\t/g;s/;//g') - | cut -f 2- | awk -F "\t" '{print $1"\t"$2"\t"$4"\t"$3}' > ${species}.ncbi.anno

## rewrite the attribute column into BGI-style ID=/Parent= tags
awk -F "Dbxref" '{print $1}' clean.gff | sed 's/rna-//g;s/cds-//g;s/gene-//g' | awk -F ";" '{print $1";"}' | awk '{print $9"\t"$0}' | sed 's/ID=//1;s/;//1' | awk -F "\t" 'NR==FNR{a[$1]=$2}NR!=FNR{print $0"\t"a[$1]}' <(awk -F "\t" '{print $2"\t"$1}' ${species}.ncbi.anno | sed 's/\t/\tID=/g' ) - | cut -f 2- | sed 's/ID=/Parent=/1;s/\t$//g' | awk -F "\t" '{if ($3~/mRNA/) print $0";"; else print $0"\t"$9}' | cut -f 1-8,10 > clean.deal.gff

fix_mRNA_coordinate.pl clean.deal.gff clean.deal.fix.gff

gff3_sort -g clean.deal.fix.gff | grep -v "#" > ${species}.bgi.gff

## extract CDS, translate, and pull the matching NCBI proteins for comparison
gffread ${species}.bgi.gff -g ${genome} -x ${species}.bgi.gff.cds
seqkit translate --trim --clean ${species}.bgi.gff.cds > ${species}.bgi.gff.pep
grep ">" ${species}.bgi.gff.cds | awk '{print $1}' | sed "s/>//g" | seqtk subseq ${pep} - | awk '{print $1}' | seqkit seq -w 0 > ${species}.bgi.gff.pep2

rm clean*gff
--------------------------------------------------------------------------------
/genome/assess/LAI.sh:
--------------------------------------------------------------------------------
#!/bin/bash

#https://github.com/oushujun/LTR_retriever
#https://github.com/oushujun/LTR_FINDER_parallel/tree/v1.1
#https://github.com/oushujun/LTR_HARVEST_parallel
#https://www.jianshu.com/p/ed289822c825
#https://github.com/wangziwei08/LTR-insertion-time-estimation
#https://www.jianshu.com/p/f962d5c40fdf ### LTR_retriever

genome=hcs.chr.genome.fa
threads=20
substitution_mutations_rate=1e-8

### software paths
gt_software=/01_software/gt-1.6.2-Linux_x86_64-64bit-complete/bin/gt
finder=/01_software/LTR_FINDER_parallel/bin/LTR_FINDER.x86_64-1.0.7/ltr_finder
LTR_FINDER_parallel=/01_software/LTR_FINDER_parallel/LTR_FINDER_parallel
LTR_HARVEST_parallel=/01_software/LTR_HARVEST_parallel/LTR_HARVEST_parallel

## step1 LTR_HARVEST
perl ${LTR_HARVEST_parallel} -seq ${genome} -gt ${gt_software} -threads ${threads}

## step2 LTR_FINDER
perl ${LTR_FINDER_parallel} -seq ${genome} -harvest_out -finder ${finder} -t ${threads}

## step3 LTR_retriever
source activate /01_soft/mamba/envs/LTR_retriever
LTR_retriever -genome ${genome} -inharvest ${genome}.harvest.combine.scn -infinder ${genome}.finder.combine.scn -threads ${threads} -u ${substitution_mutations_rate}

## step4 prepare the plotting input
sed '1d' ${genome}.pass.list | awk -F "[:\t]" '{print $1","$(NF-2)","$NF}' | sed 's/-0/0/g' > LTR_time.csv
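## Reminder of the dating relation LTR_retriever applies (hedged note):
##   T = K / (2 * u),  K = divergence between the paired LTRs,
##   u = per-site substitution rate passed via -u above (1e-8 here)
## e.g. K = 0.02  ->  T = 0.02 / (2 * 1e-8) = 1,000,000 years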
--------------------------------------------------------------------------------
/deal_gff/pick_longest_gene/pick_longest_ncbi.pl:
--------------------------------------------------------------------------------
#!/usr/bin/perl -w
use strict;

my $gff=shift or die "usage: perl $0 <in.gff>";
my $out="clean.gff";

my %length;
my %gene_cds;
open I,"< $gff";
while (<I>) {
    next if(/^#/);
    my @a=split(/\s+/);
    my ($chr,$source,$type,$start,$end,$score,$strand,$phase,$info)=@a;
    if($type eq "CDS"){
        $info=~/Parent=([^;]+)/;
        my $cds_id=$1;
        $length{$cds_id}+=$end-$start+1;
    }
    elsif($type eq "mRNA"){
        $info=~/ID=([^;]+).*Parent=([^;]+)/;
        my ($cds_id,$gene_id)=($1,$2);
        $gene_cds{$gene_id}{$cds_id}=1;
    }
}
close I;

foreach my $gene_id(keys %gene_cds){
    foreach my $cds_id(keys %{$gene_cds{$gene_id}}){
        if(exists $length{$cds_id}){
            $gene_cds{$gene_id}{$cds_id}=$length{$cds_id};
        }
        else {
            delete $gene_cds{$gene_id}{$cds_id};
        }
    }
}

my %keep;
foreach my $gene_id(sort keys %gene_cds){
    my @protein_id=sort {$gene_cds{$gene_id}{$b} <=> $gene_cds{$gene_id}{$a}} keys %{$gene_cds{$gene_id}};
    my $selected=$protein_id[0];
#   $selected=~s/\.\d$//;
    $keep{$selected}=1;
}

open O,"> $out";
open I,"< $gff";
while (<I>) {
    chomp;
    next if(/^#/);
    my @a=split(/\s+/);
    my ($chr,$source,$type,$start,$end,$score,$strand,$phase,$info)=@a;
    if($type eq "CDS"){
        $info=~/Parent=([^;]+)/;
        my $cds_id=$1;
        next unless($keep{$cds_id});
        s/transcript://g;
        print O "$_\n";
    }
    elsif($type eq "mRNA"){
        $info=~/ID=([^;]+).*Parent=([^;]+)/;
        my ($cds_id,$gene_id)=($1,$2);
        next unless($keep{$cds_id});
        s/transcript://g;
        print O "$_\n";
    }
}
close I;
close O;
--------------------------------------------------------------------------------
/genome/Genome_error_correction/Pilon&racon.sh:
--------------------------------------------------------------------------------
For a draft genome assembled from PacBio long reads, the sequencing error rate means the assembly may contain erroneous regions; the long-read and short-read data that were generated can then be used to polish the draft.
Hybrid assembly with both short- and long-read data is also an option at assembly time (to be added after further testing; not written up yet).
racon polishes with long reads, while pilon uses short reads. Incidentally, NextPolish accepts either; its short-read polishing seems weaker than pilon's, though opinions differ.
1. Software installation
--------------------------------------------------
# mamba is the recommended way to install both tools
mamba install pilon
mamba install racon
# minimap2 and bwa are also needed; install them yourself
--------------------------------------------------
2. Genome polishing (three rounds each, long reads first, then short reads)
--------------------------------------------------
# long-read polishing, round 1
minimap2 -ax map-pb -t 24 assembly.fa pacbio.read.fasta | gzip -c - > minimap1.sam.gz
racon -t 24 -u pacbio.read.fasta minimap1.sam.gz assembly.fa > racon1.fasta
# long-read polishing, round 2
minimap2 -ax map-pb -t 24 racon1.fasta pacbio.read.fasta | gzip -c - > minimap2.sam.gz
racon -t 24 -u pacbio.read.fasta minimap2.sam.gz racon1.fasta > racon2.fasta
# long-read polishing, round 3
minimap2 -ax map-pb -t 24 racon2.fasta pacbio.read.fasta | gzip -c - > minimap3.sam.gz
racon -t 24 -u pacbio.read.fasta minimap3.sam.gz racon2.fasta > racon3.fasta
# short-read polishing, round 1 (pilon requires a sorted, indexed BAM)
bwa index racon3.fasta
bwa mem -t 24 racon3.fasta illmunia_R1.fq.gz illmunia_R2.fq.gz | samtools sort -@ 24 - -o bwamem1.bam
samtools index bwamem1.bam
pilon --genome racon3.fasta --frags bwamem1.bam --changes --diploid --outdir ./pilon.out --output pilon1
# short-read polishing, round 2
bwa index ./pilon.out/pilon1.fasta
bwa mem -t 24 ./pilon.out/pilon1.fasta illmunia_R1.fq.gz illmunia_R2.fq.gz | samtools sort -@ 24 - -o bwamem2.bam
samtools index bwamem2.bam
pilon --genome ./pilon.out/pilon1.fasta --frags bwamem2.bam --changes --diploid --outdir ./pilon.out --output pilon2
# short-read polishing, round 3
bwa index ./pilon.out/pilon2.fasta
bwa mem -t 24 ./pilon.out/pilon2.fasta illmunia_R1.fq.gz illmunia_R2.fq.gz | samtools sort -@ 24 - -o bwamem3.bam
samtools index bwamem3.bam
pilon --genome ./pilon.out/pilon2.fasta --frags bwamem3.bam --changes --diploid --outdir ./pilon.out --output pilon3
--------------------------------------------------------
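# Sketch: the three racon rounds above differ only in file names, so a loop
# version (same tools and flags) avoids copy/paste drift:
asm=assembly.fa
for i in 1 2 3; do
    minimap2 -ax map-pb -t 24 ${asm} pacbio.read.fasta | gzip -c - > minimap${i}.sam.gz
    racon -t 24 -u pacbio.read.fasta minimap${i}.sam.gz ${asm} > racon${i}.fasta
    asm=racon${i}.fasta
done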
--------------------------------------------------------------------------------
/deal_fasta/six_frame_translate/translate_seq.py:
--------------------------------------------------------------------------------
import sys
from collections import OrderedDict
from Bio import SeqIO
from Bio.Data import CodonTable

def six_frame_translate(inFa, fout=sys.stdout, seqfmt='fasta', transl_table=1):
    d_length = OrderedDict()
    for rc in SeqIO.parse(open(inFa), seqfmt):
        for seq, suffix0 in zip([rc.seq, rc.seq.reverse_complement()], ['aa', 'rev_aa']):
            for frame in range(0,3):
                nucl_seq = seq[frame:]
                try: aa_seq = translate_seq(nucl_seq, table=transl_table)
                except CodonTable.TranslationError: continue # Codon 'XGA' is invalid
                suffix = '|{}{}'.format(suffix0, frame+1)
                print('>{}{}\n{}'.format(rc.id, suffix, aa_seq), file=fout)
        d_length[rc.id] = len(rc.seq)
    return d_length
def _six_frame_translate(rc, transl_table=1):
    for seq, suffix0 in zip([rc.seq, rc.seq.reverse_complement()], ['aa', 'rev_aa']):
        for frame in range(0,3):
            nucl_seq = seq[frame:]
            try: aa_seq = translate_seq(nucl_seq, table=transl_table)
            except CodonTable.TranslationError: continue # Codon 'XGA' is invalid
            suffix = '|{}{}'.format(suffix0, frame+1)
            yield rc.id, suffix, aa_seq

def translate_seq(inSeq, **kargs):
    aa = inSeq.translate(**kargs)
    return aa
def translate_cds(inSeq, transl_table=1, **kargs):
    for key in list(kargs.keys()):
        if not key in {'to_stop', 'stop_symbol', 'gap'}:
            del kargs[key]
    try:
        aa = translate_seq(inSeq, cds=True, table=transl_table, **kargs)
    except CodonTable.TranslationError as e:
        aa = translate_seq(inSeq, table=transl_table, **kargs)
    return aa

def main(inFa, outSeq=sys.stdout):
    for rc in SeqIO.parse(open(inFa), 'fasta'):
        print('>{}\n{}'.format(rc.id, translate_seq(rc.seq)), file=outSeq)

if __name__ == '__main__':
    import sys
    inFa = sys.argv[1]
    if inFa == 'six_frame_translate':
        inFa = sys.argv[2]
        six_frame_translate(inFa)
    else:
        main(inFa)
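# Usage sketch:
#   python translate_seq.py in.fa                       # frame-1 translation of each record
#   python translate_seq.py six_frame_translate in.fa   # all six reading frames, ids suffixed |aa1..|rev_aa3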
--------------------------------------------------------------------------------
/deal_gff/gff.simple/gff.simple.pl:
--------------------------------------------------------------------------------
#!/usr/bin/perl -w
## gff.simple.pl -gff in.gff > out.gff
use strict;
use Getopt::Long;
my ($infile,$tag,$mrna);
GetOptions(
    "gff:s"=>\$infile,
    "tag:s"=>\$tag,
    "mrna:s"=>\$mrna
);
$tag ||= 'ID';
$mrna ||='CDS';
my %CDS;
my %gid;
open IN, "$infile" || die "$!\n";
while(<IN>){
    chomp;
    next if (/^#/);
    next if (/^\s*$/);
    my @info=split(/\s+/, $_);
    if ($info[2]=~/$mrna/){
        if ($info[8]=~/$tag=([^;]+)/){
#       if ($info[9]=~/\"(\S+?)\";/){
            #if (exists $gid{$1}){
            #   next;
            #}else{
            #   $gid{$1}++;
            #}
            my $key=$1;
            ($info[3],$info[4])=($info[4],$info[3]) if ($info[3]>$info[4]);
#           push @{$CDS{$key}}, [@info];
            push @{$CDS{$key}}, [$info[0],$info[1],$info[2],$info[3],$info[4],$info[5],$info[6],$info[7],$info[9]];
        }
    }
}
close IN;

foreach my $id (sort keys %CDS){
#   @{$CDS{$id}}=sort {$a->[3] <=> $b->[3]} @{$CDS{$id}};
#   my $ms=$CDS{$id}[0][3];
#   my $me=$CDS{$id}[-1][4];
#   my $strand =$CDS{$id}[0][6];
#   print "$CDS{$id}[0][0]\t$CDS{$id}[0][1]\tmRNA\t$ms\t$me\t\.\t$strand\t\.\tID=$id;\n";
    my $newc='';
    if ($CDS{$id}[0][6] eq '+'){
        @{$CDS{$id}}=sort {$a->[3] <=> $b->[3]} @{$CDS{$id}};
        #$CDS{$id}[-1][4] +=3;
        print "$CDS{$id}[0][0]\t$CDS{$id}[0][1]\tmRNA\t$CDS{$id}[0][3]\t$CDS{$id}[-1][4]\t\.\t$CDS{$id}[0][6]\t\.\tID=$id;\n";
        for (my $i=0; $i<@{$CDS{$id}}; $i++){
            $CDS{$id}[$i][8]="Parent=$id;";
            $newc=join "\t", @{$CDS{$id}[$i]};
            print "$newc\n";
        }
    }
    if ($CDS{$id}[0][6] eq '-'){
        @{$CDS{$id}}=reverse (sort {$a->[3] <=> $b->[3]} @{$CDS{$id}});
        #$CDS{$id}[-1][3] -=3;
        print "$CDS{$id}[0][0]\t$CDS{$id}[0][1]\tmRNA\t$CDS{$id}[-1][3]\t$CDS{$id}[0][4]\t\.\t$CDS{$id}[0][6]\t\.\tID=$id;\n";
        for (my $i=0; $i<@{$CDS{$id}}; $i++){
            $CDS{$id}[$i][8]="Parent=$id;";
            $newc=join "\t", @{$CDS{$id}[$i]};
            print "$newc\n";
        }
    }
}
--------------------------------------------------------------------------------
/genome/relernn/All.prediction.sh:
--------------------------------------------------------------------------------
source activate /micromamba/envs/ReLERNN
SIMULATE="ReLERNN_SIMULATE"
TRAIN="ReLERNN_TRAIN"
PREDICT="ReLERNN_PREDICT"
BSCORRECT="ReLERNN_BSCORRECT"
SEED="42"
MU="6e-9"
GENTIME="1"
URTR="1"
DIR="./ABCD_output/"
VCF="./ABCD.vcf"
GENOME="./S_maxi.fa.fai.bed"
CPU="10"
Maxwinsize="500"
Minsites="10"
batchsize="10"
#MASK="./accessibility_mask.bed"

## prepare
#awk '$2>5000000' S_maxi.fa.fai | awk '{print $1"\t0\t"$2}' > S_maxi.fa.fai.bed
bcftools view -S ABCD -m 2 -M 2 S_maxi.SNP.filter.vcf.gz | vcftools --vcf - --maf 0.05 --max-maf 0.95 --max-missing 0.1 --stdout --recode | bcftools annotate --remove QUAL,FILTER,INFO,^FORMAT/GT | grep -v contig > ABCD.vcf

# Simulate data
${SIMULATE} \
    --vcf ${VCF} \
    --genome ${GENOME} \
    --projectDir ${DIR} \
    --assumedMu ${MU} \
    --upperRhoThetaRatio ${URTR} \
    --nTrain 13000 \
    --nVali 2000 \
    --nTest 100 \
    --forceDiploid \
    --maxSites ${Maxwinsize} \
    -t ${CPU} \
    --seed ${SEED}

# Train network
${TRAIN} \
    --projectDir ${DIR} \
    --nEpochs 2 \
    --nValSteps 2 \
    -t ${CPU} \
    --seed ${SEED}

# Predict
${PREDICT} \
    --vcf ${VCF} \
    --projectDir ${DIR} \
    --seed ${SEED} \
    --minSites ${Minsites} \
    --batchSizeOverride ${batchsize} \
    --phased

# Parametric Bootstrapping
${BSCORRECT} \
    --projectDir ${DIR} \
    --nSlice 2 \
    --nReps 2 \
    --seed ${SEED} \
    -t ${CPU}

## remove tmp (rsync an empty dir over the huge output dirs to delete them quickly; assumes /rsync_tmp/ is empty)
rm ${VCF}
rsync --delete-before -a /rsync_tmp/ ${DIR}/train/
rsync --delete-before -a /rsync_tmp/ ${DIR}/splitVCFs/
rsync --delete-before -a /rsync_tmp/ ${DIR}/vali/
rsync --delete-before -a /rsync_tmp/ ${DIR}/test/
rsync --delete-before -a /rsync_tmp/ ${DIR}/networks/
rm -rf ${DIR}/train/ ${DIR}/splitVCFs/ ${DIR}/vali/ ${DIR}/test/ ${DIR}/networks/
--------------------------------------------------------------------------------
/Comparative_genomics/gene_family_cluster/broccoli.sh:
--------------------------------------------------------------------------------
indir=data
houzhui=.fa   ## file-name suffix of the input proteomes
cpu=10
path_fasttree=/00_tools/FastTree
path_diamond=/00_tools/diamond

broccoli.py -dir ${indir} -ext ${houzhui} -threads ${cpu} -path_diamond ${path_diamond} -path_fasttree ${path_fasttree}
--------------------------------------------------------------------------------
/picture/genome_Circos/circos.conf:
--------------------------------------------------------------------------------
karyotype = karyotype.txt
chromosomes_units = 1000000

<ideogram>
## spacing between ideograms

<spacing>
# gap between chromosomes; here each gap is 0.5% of the circumference
default = 0.005r
# a specific gap between two given chromosomes can also be set, e.g.
<pairwise chr1;chr2>
spacing = 10u
</pairwise>
</spacing>

# ideogram position: place the ideograms at 90% of the radius
radius = 0.90r
# ideogram thickness, in r (relative) or p (pixel) units
thickness = 20p
# whether to fill the ideograms; the fill colour is taken from the last column of the karyotype file
fill = yes
# outline colour and thickness of the ideograms; with no parameter or thickness 0 there is no outline
stroke_color = dgrey
stroke_thickness = 1p
## label display
# whether to show labels (column 4 of the karyotype file); if yes, label_radius must be set, otherwise circos errors out and produces nothing
show_label = yes
# label font
label_font = default
# label position
label_radius = 1r+90p
# label font size
label_size = 30
# label orientation; yes is the easy-to-read direction
label_parallel = yes

</ideogram>

<<include ticks.conf>>

<plots>

<plot>
type = histogram
file = DNA_TE_density.txt
fill_color = 219,105,104
r1 = 0.98r
r0 = 0.88r
</plot>

<plot>
type = histogram
file = LINE_TE_density.txt
fill_color = 77,151,205
r1 = 0.88r
r0 = 0.78r
</plot>

<plot>
type = histogram
file = SINE_TE_density.txt
fill_color = 211,161,196
r1 = 0.78r
r0 = 0.68r
</plot>

<plot>
type = histogram
file = LTR_TE_density.txt
fill_color = 147,204,130
r1 = 0.68r
r0 = 0.58r
</plot>

<plot>
type = heatmap
file = gene_density.txt
color = oranges-8-seq
r1 = 0.56r
r0 = 0.46r
</plot>

<plot>
show = yes
type = line
max = 0.5
min = 0.1
glyph = rectangle
glyph_size = 10
file = GC_content.txt
r1 = 0.46r
r0 = 0.26r
color = red
stroke_color = dred
stroke_thickness = 2
</plot>

</plots>

<image>
<<include etc/image.conf>>
</image>

<<include etc/colors_fonts_patterns.conf>>
<<include etc/housekeeping.conf>>
--------------------------------------------------------------------------------
/Comparative_genomics/kaks/genelist_kaks.sh:
--------------------------------------------------------------------------------
#!/bin/bash

if [[ $# == '0' ]]; then
    echo " This shell calculates Ka/Ks for a given gene list"
    echo "usage: sh genelist_kaks.sh genelist cpu outdir_tmp all.pep all.cds"
    echo " "
    echo "example: sh genelist_kaks.sh tlr9_gene 6 test_dir all.pep all.cds "
    echo " "
    echo "attention: 1. The outdir_tmp is automatically generated! "
    echo "           2. Change to a different outdir_tmp name before using it !!! "
    exit 1
fi

### rename parameters
GO_term=$1
cpu=$2
outdir_tmp=$3
all_pep=$4
all_cds=$5

### software
seqtk=/bin/seqtk
ParaAT=/bin/ParaAT.pl
blast_shell=blast.sh

### step1: get gene list
cp ${GO_term} ${GO_term}.all.gene.list

### step2: get cds & pep
${seqtk} subseq ${all_cds} ${GO_term}.all.gene.list > ${GO_term}.all.gene.list.cds
${seqtk} subseq ${all_pep} ${GO_term}.all.gene.list > ${GO_term}.all.gene.list.pep

### step3: blastn and get gene pairs (keep hits covering >60% of either sequence)
sh ${blast_shell} ${GO_term}.all.gene.list.cds ${GO_term}.all.gene.list.cds nucl blastn result.txt ${cpu}
grep -v "#" result.txt | awk '$9/$8>0.6||$9/$7>0.6' | cut -f 1,4 | awk '$1!=$2' | awk '!a[$1,$2] && !a[$2,$1]++' > ${GO_term}.gene.pair

### step4: make cpu file
echo ${cpu} > procpu

### step5: calculate Ka/Ks
${ParaAT} -h ${GO_term}.gene.pair -n ${GO_term}.all.gene.list.cds -a ${GO_term}.all.gene.list.pep -m clustalw2 -p procpu -f axt -g -k -o ${outdir_tmp}

### step6: deal with the output
cat ./${outdir_tmp}/*kaks | awk 'NR==1;NR>=1 { print $0| "grep -v Sequence"}' > ${GO_term}.all.kaks.result.xls
#less all.kaks.result.xls |cut -f 5|grep -v 'NA' > kaks.list

### step7: remove tmpfiles
rm -rf *all.gene.list* blastdb procpu ${outdir_tmp} result.txt

### ParaAT.pl help
# ParaAT.pl -h test.homologs -n test.cds -a test.pep -p proc -o output -f axt
#--------------------------------
#-h, homologous gene-pair list file
#-n, nucleotide (CDS) sequence file
#-a, protein sequence file
#-p, thread file               ## the file holds the thread count, default 6
#-m, alignment tool            ## muscle
#-g, remove codons containing alignment gaps
#-k, run KaKs_Calculator       ## compute Ka/Ks values
#-o, output directory
#-f, output alignment format
--------------------------------------------------------------------------------
/deal_fasta/agp2fa/agp2fa.pl:
--------------------------------------------------------------------------------
#!/usr/bin/perl
use strict;

# parse input options
if (not $ARGV[0] or $ARGV[0] eq "-h") {
    die "
Description: read AGP produced by cluster2agp.pl and output the chromosome-level scaffold sequences in multi-line FASTA format.

Author: Sen Wang, wangsen1993\@163.com, 2021/7/26.

Usage: perl agp2fa.pl gfa.cluster.agp contigs.fasta > gfa.cluster.agp.fasta
\n";
}

# read cluster.agp
my (%scaffold, %contig);
open IN, "<$ARGV[0]" or die "Cannot open $ARGV[0]!\n";
while (<IN>) {
    chomp;
    my @f = split(/\t/, $_);
    if ($f[0] ne $f[5]) {
        push @{$scaffold{$f[0]}}, "$f[5]$f[8]";
    } else {
        $contig{$f[0]} = $f[5];
    }
}
close IN;

# read contigs.fasta
my %seqs;
my $header;
open IN, "<$ARGV[1]" or die "Cannot open $ARGV[1]!\n";
while (<IN>) {
    chomp;
    if (/^>(\w+)/) {
        $header = $1;
    } else {
        $seqs{$header} .= $_;
    }
}
close IN;

# output chromosome-level scaffold sequences
foreach my $s (sort keys %scaffold) {
    print ">$s\n";
    my $seq = "";
    foreach my $c (@{$scaffold{$s}}) {
        my $ctg = substr($c, 0, length($c) - 1);
        my $strand = substr($c, -1, 1);
        if ($strand eq '+') {
            die "Cannot get the sequence of $ctg! check $ARGV[1]!\n" if not $seqs{$ctg};
            $seq .= $seqs{$ctg};
        } elsif ($strand eq '-') {
            die "Cannot get the sequence of $ctg! check $ARGV[1]!\n" if not $seqs{$ctg};
            my $tem = $seqs{$ctg};
            $tem = reverse($tem);
            $tem =~ tr/ATCG/TAGC/;
            $seq .= $tem;
        } else {
            $ctg =~ /(\d+)/;
            $seq .= "N" x $1;
        }
    }
    for (my $i = 0; $i < length($seq); $i += 60) {
        my $sub = substr($seq, $i, 60);
        print "$sub\n";
    }
}
foreach my $c (sort keys %contig) {
    print ">$c\n";
    my $seq = $seqs{$c};
    die "Cannot get the sequence of $c! check $ARGV[1]!\n" if not $seqs{$c};
    for (my $i = 0; $i < length($seq); $i += 60) {
        my $sub = substr($seq, $i, 60);
        print "$sub\n";
    }
}
--------------------------------------------------------------------------------
/Comparative_genomics/kaks/go_kaks.sh:
--------------------------------------------------------------------------------
#!/bin/bash

if [[ $# == '0' ]]; then
    echo " This shell selects the genes in a GO term and calculates Ka/Ks"
    echo "usage: sh go_kaks.sh GO_term cpu outdir_tmp all_go_term_file all.pep all.cds"
    echo " "
    echo "example: sh go_kaks.sh GO_1990452 6 test_dir all.difgoall all.pep all.cds "
    echo " "
    echo "attention: 1. The outdir_tmp is automatically generated! "
    echo "           2. Change to a different outdir_tmp name before using it !!! "
    exit 1
fi

### rename parameters
GO_term=$1
cpu=$2
outdir_tmp=$3
all_go_term_file=$4
all_pep=$5
all_cds=$6

### software
seqtk=/bin/seqtk
ParaAT=/bin/ParaAT.pl
blast_shell=blast.sh

### step1: get gene list
grep -w ${GO_term} ${all_go_term_file} | sort | awk '{print $NF}' | sed 's/,/\n/g' > ${GO_term}.all.gene.list

### step2: get cds & pep
${seqtk} subseq ${all_cds} ${GO_term}.all.gene.list > ${GO_term}.all.gene.list.cds
${seqtk} subseq ${all_pep} ${GO_term}.all.gene.list > ${GO_term}.all.gene.list.pep

### step3: blastn and get gene pairs (keep hits covering >60% of either sequence)
sh ${blast_shell} ${GO_term}.all.gene.list.cds ${GO_term}.all.gene.list.cds nucl blastn result.txt ${cpu}
grep -v "#" result.txt | awk '$9/$8>0.6||$9/$7>0.6' | cut -f 1,4 | awk '$1!=$2' | awk '!a[$1,$2] && !a[$2,$1]++' > ${GO_term}.gene.pair

### step4: make cpu file
echo ${cpu} > procpu

### step5: calculate Ka/Ks
${ParaAT} -h ${GO_term}.gene.pair -n ${GO_term}.all.gene.list.cds -a ${GO_term}.all.gene.list.pep -m clustalw2 -p procpu -f axt -g -k -o ${outdir_tmp}

### step6: deal with the output
cat ./${outdir_tmp}/*kaks | awk 'NR==1;NR>=1 { print $0| "grep -v Sequence"}' > ${GO_term}.all.kaks.result.xls
#less all.kaks.result.xls |cut -f 5|grep -v 'NA' > kaks.list

### step7: remove tmpfiles
rm -rf *all.gene.list* blastdb procpu ${outdir_tmp} result.txt

### ParaAT.pl help
# ParaAT.pl -h test.homologs -n test.cds -a test.pep -p proc -o output -f axt
#--------------------------------
#-h, homologous gene-pair list file
#-n, nucleotide (CDS) sequence file
#-a, protein sequence file
#-p, thread file               ## the file holds the thread count, default 6
#-m, alignment tool            ## muscle
#-g, remove codons containing alignment gaps
#-k, run KaKs_Calculator       ## compute Ka/Ks values
#-o, output directory
#-f, output alignment format
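## Note: go_kaks.sh and genelist_kaks.sh share steps 2-7 verbatim; they differ
## only in step1 (grep the genes annotated with one GO term here, versus
## taking a ready-made gene list there).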
--------------------------------------------------------------------------------
/picture/genome_Circos/fast_Circos.sh:
--------------------------------------------------------------------------------
#!/bin/bash
set +o posix

genome=Stichopus_variegatus.fa
bgi_gff=Stichopus_variegatus.bgi.gff
chr_id=chr.id
repeatmask_out=HiTE.update.out

## get chromosome information
seqtk subseq ${genome} ${chr_id} | seqkit seq -w 100 > tmp_chr.fa
fishInWinter.pl -bf table -ff table ${chr_id} ${bgi_gff} > tmp_chr.gff

## stat
iTools Fatools stat -InPut ${genome} -OutPut ${genome}.chrlist
grep -v "#" ${genome}.chrlist | grep chr | awk '{print "chr - "$1" "$1" 0 "$2" "$1}' > karyotype.txt
awk '$3=="mRNA"' tmp_chr.gff | awk '{print $1"\t"$4"\t"$5}' | sort -k 1V,1 -k 2n,2 > gene.bed
cut -f 3,6 -d " " karyotype.txt | awk '{print $1"\t"$2}' > chr.length

## make windows (keep the total below 2500 windows)
bedtools makewindows -g chr.length -n 70 > chr.window

## stat
bedtools coverage -a chr.window -b gene.bed | cut -f 1-4 | sort -k 1V,1 -k 2n,2 > gene_density.txt
bedtools nuc -fi tmp_chr.fa -bed chr.window | cut -f 1,2,3,5 | sed '1d' > GC_content.txt

## deal with TEs
grep "DNA/" ${repeatmask_out} | awk '{print $5"\t"$6"\t"$7}' | fishInWinter.pl -bf table -ff table ${chr_id} - | sort -k 1V,1 -k 2n,2 | bedtools merge -i - > DNA.TE.bed
grep "LINE/" ${repeatmask_out} | awk '{print $5"\t"$6"\t"$7}' | fishInWinter.pl -bf table -ff table ${chr_id} - | sort -k 1V,1 -k 2n,2 | bedtools merge -i - > LINE.TE.bed
grep "SINE/" ${repeatmask_out} | awk '{print $5"\t"$6"\t"$7}' | fishInWinter.pl -bf table -ff table ${chr_id} - | sort -k 1V,1 -k 2n,2 | bedtools merge -i - > SINE.TE.bed
grep "LTR/" ${repeatmask_out} | awk '{print $5"\t"$6"\t"$7}' | fishInWinter.pl -bf table -ff table ${chr_id} - | sort -k 1V,1 -k 2n,2 | bedtools merge -i - > LTR.TE.bed
bedtools coverage -a chr.window -b DNA.TE.bed | cut -f 1-4 | sort -k 1V,1 -k 2n,2 > DNA_TE_density.txt
bedtools coverage -a chr.window -b LINE.TE.bed | cut -f 1-4 | sort -k 1V,1 -k 2n,2 > LINE_TE_density.txt
bedtools coverage -a chr.window -b SINE.TE.bed | cut -f 1-4 | sort -k 1V,1 -k 2n,2 > SINE_TE_density.txt
bedtools coverage -a chr.window -b LTR.TE.bed | cut -f 1-4 | sort -k 1V,1 -k 2n,2 > LTR_TE_density.txt

rm tmp_chr.fa tmp_chr.gff DNA.TE.bed LINE.TE.bed SINE.TE.bed LTR.TE.bed *.fai

## circos
cp /02_pilple/circos/circos.conf .
cp /02_pilple/circos/ticks.conf .

/usr/bin/singularity exec --bind $PWD:$PWD /01_soft/singularity_all/circos.sif circos -conf circos.conf
--------------------------------------------------------------------------------
/python/zscore/zscore2.py:
--------------------------------------------------------------------------------
# Usage: python zscore_large_file_parallel.py input_file output_file num_threads chunksize

import sys
import pandas as pd
import numpy as np
from concurrent.futures import ThreadPoolExecutor

def calculate_zscore(values, mean, std):
    """Compute Z-scores."""
    return (values - mean) / std

def process_chunk(chunk, mean, std, last_column_name):
    """Process one chunk: compute Z-scores and add them as a new column."""
    chunk[f"{last_column_name}_zscore"] = calculate_zscore(chunk[last_column_name].to_numpy(dtype=float), mean, std)
    return chunk

def calculate_zscore_in_chunks_parallel(input_file, output_file, num_threads, chunksize):
    # first compute the global mean and std of the last column (two passes over the file)
    total_sum, total_sq_sum, total_count = 0, 0, 0
    last_column_name = None

    # pass 1: global mean and standard deviation
    for chunk in pd.read_csv(input_file, sep='\t', chunksize=chunksize):
        if last_column_name is None:
            last_column_name = chunk.columns[-1]
        last_column_values = chunk[last_column_name].to_numpy(dtype=float)  # last column as a NumPy array
        total_sum += np.sum(last_column_values)
        total_sq_sum += np.sum(last_column_values**2)
        total_count += len(last_column_values)

    # global mean and std (std via E[x^2] - E[x]^2)
    mean = total_sum / total_count
    std = np.sqrt(total_sq_sum / total_count - mean**2)

    # pass 2: compute Z-scores per chunk in the thread pool and write out
    with open(output_file, 'w') as f_out:
        header_written = False  # controls whether the header still needs writing
        with ThreadPoolExecutor(max_workers=num_threads) as executor:
            for chunk in pd.read_csv(input_file, sep='\t', chunksize=chunksize):
                # submit the chunk to the thread pool
                future = executor.submit(process_chunk, chunk, mean, std, last_column_name)
                processed_chunk = future.result()

                # write out; the header is written only the first time
                processed_chunk.to_csv(f_out, sep='\t', index=False, header=not header_written)
                header_written = True

if __name__ == "__main__":
    # parse command-line arguments
    if len(sys.argv) < 5:
        print("Usage: python zscore_large_file_parallel.py <input_file> <output_file> <num_threads> <chunksize>")
        sys.exit(1)

    input_file = sys.argv[1]
    output_file = sys.argv[2]
    num_threads = int(sys.argv[3])  # user-defined thread count
    chunksize = int(sys.argv[4])    # user-defined chunk size

    # run
    calculate_zscore_in_chunks_parallel(input_file, output_file, num_threads, chunksize)
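# Usage sketch: 4 worker threads, 100000 rows per chunk
#   python zscore2.py input.tsv output.tsv 4 100000
# Note: each submitted future is awaited immediately, so chunks are in effect
# processed one at a time; real parallelism would require submitting a window
# of chunks before collecting their results (while preserving output order).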
-------------------------------------------------------------------------------- /picture/syri_plotsv/syri_plotsv.sh: --------------------------------------------------------------------------------
1 | ## https://github.com/nschan/nf-plotsv
2 |
3 | ## Use relative paths
4 | ref_genome=final-father-chr-mela.fa
5 | ref_genome_chr_num=24
6 | query_genome=final-mother-chr-mela.fa
7 | query_genome_chr_num=24
8 | cpu_for_minimap=20
9 | mem_for_minimap=100
10 | plot_all_chr_num=24
11 | plotsr_Space_for_homologous_chromosome=0.7
12 | plotsr_height=10
13 | plotsr_width=7
14 | plotsr_font_size=8
15 | plotsr_minimum_size_of_SR_to_be_plotted=5000
16 |
17 | export PATH="/00_tools/:$PATH"
18 |
19 | ## rename chromosomes
20 | iTools Fatools stat -InPut ${ref_genome} -OutPut ${ref_genome}.chrlen
21 | iTools Fatools stat -InPut ${query_genome} -OutPut ${query_genome}.chrlen
22 | grep -v "#" ${ref_genome}.chrlen | sort -k 2nr,2 | head -n ${ref_genome_chr_num} | awk '{print $1}' | seqtk subseq ${ref_genome} - | seqtk rename - Chr | seqkit seq -w 100 - > ref.rename.fa
23 | grep -v "#" ${query_genome}.chrlen | sort -k 2nr,2 | head -n ${query_genome_chr_num} | awk '{print $1}' | seqtk subseq ${query_genome} - | seqtk rename - Chr | seqkit seq -w 100 - > query.rename.fa
24 |
25 | ## make samplesheet
26 | echo "name,fasta" >> samplesheet.csv
27 | ## these names will also be used in the final picture.
28 | echo "ref,$PWD/ref.rename.fa" >> samplesheet.csv
29 | echo "query,$PWD/query.rename.fa" >> samplesheet.csv
30 |
31 | ## run
32 | cp /01_soft/nf-plotsv/configs/base.demo.config base.config
33 | sed -i "s/ABCD/${cpu_for_minimap}/g;s/EFGH/${mem_for_minimap}/g" base.config
34 | nextflow -config $PWD/base.config run /01_soft/nf-plotsv --samplesheet samplesheet.csv -profile local --reference ref_genome --ref_genome $PWD/ref.rename.fa --subset_pattern Chr[1-9] --reorient true
35 |
36 | ## re-draw
37 | cp work/*/*/plotsr_infile.tsv plotsv/syri_pairwise/plotsr_infile.tsv
38 | cp work/*/*/files.txt plotsv/syri_pairwise/files.txt
39 | cp /nf-plotsv/assets/plotsr_config.conf plotsv/syri_pairwise/plotsr_config.conf
40 | for i in `seq 1 ${plot_all_chr_num}` ; do echo Chr${i} >> plotsv/syri_pairwise/chr.order ; done
41 |
42 | cd plotsv/syri_pairwise/
43 | cp ../align_pairwise/*fa ./
44 | sr=$(cat files.txt)
45 | singularity run /01_soft/singularity_all/fixchr-syri-plotsr.sif plotsr --genomes plotsr_infile.tsv ${sr} --cfg plotsr_config.conf -o replot.pdf -S ${plotsr_Space_for_homologous_chromosome} -W ${plotsr_width} -H ${plotsr_height} -f ${plotsr_font_size} -s ${plotsr_minimum_size_of_SR_to_be_plotted} --chrord chr.order
46 | pigz --best -p ${cpu_for_minimap} *syri.out
47 | pigz --best -p ${cpu_for_minimap} *syri.vcf
48 | pigz --best -p ${cpu_for_minimap} *.fa
49 | cd ../../
50 |
51 | ## rm tmp
52 | rm -rf .nextflow* work ref.rename.fa query.rename.fa *chrlen *.chrlist samplesheet.csv base.config plotsv/prepare_genomes plotsv/align_pairwise
53 |
-------------------------------------------------------------------------------- /Comparative_genomics/blast/diamond_rbh.R: --------------------------------------------------------------------------------
1 |
2 | library(optparse)
3 | option_list <- list(
4 | make_option(c("-v", "--info"), type = "character", default=F, metavar="info",
5 | help="The first version to find Reciprocal best hits using diamond.\n\t\t!!! Example: diamond_rbh.R -a a.pep -b b.pep -c 10 -e 1E-3 -m 1 -M ultra-sensitive !!!"),
6 | make_option(c("-a", "--speA"), type = "character", default=NULL, metavar="pepA.fa",
7 | help="pep.fasta of Species A"),
8 | make_option(c("-b", "--speB"), type="character", default=NULL, metavar="pepB.fa",
9 | help="pep.fasta of Species B"),
10 | make_option(c("-o", "--output"), type="character", default='output.csv', metavar="output",
11 | help="RBH result"),
12 | make_option(c("-c", "--cpu"), type="integer", default=1, metavar="Number",
13 | help="cpu number. default: 1"),
14 | make_option(c("-e", "--evalue"), default=1E-3, metavar="Number",
15 | help="Expectation value. default: 1E-3"),
16 | make_option(c("-m", "--max"), type="integer", default=1, metavar="Number",
17 | help="maximum number of aligned sequences that shall be retained. default: 1"),
18 | make_option(c("-M", "--model"), default='ultra-sensitive', metavar="aligned model",
19 | help="sensitivity_mode: default: ultra-sensitive
20 | fast : fastest alignment mode, but least sensitive (default). Designed for finding hits of >70% identity.
21 | mid-sensitive : fast alignments between the fast mode and the sensitive mode in sensitivity.
22 | sensitive : fast alignments, but full sensitivity for hits of >40% identity.
23 | more-sensitive : more sensitive than the sensitive mode.
24 | very-sensitive : sensitive alignment mode.
25 | ultra-sensitive : most sensitive alignment mode (sensitivity as high as BLASTP).")
26 | )
27 |
28 | opt_parser = OptionParser(option_list=option_list);
29 | opt = parse_args(opt_parser);
30 |
31 | if (is.null(opt$speA)){
32 | print_help(opt_parser)}
33 |
34 | speA = opt$speA
35 | speB = opt$speB
36 | cpu = opt$cpu
37 | evalue = opt$evalue
38 | max = opt$max
39 | model = opt$model
40 | output = opt$output
41 |
42 | library("homologr")
43 |
44 | rec_best_hits <- diamond_reciprocal_best_hits(
45 | query = speA,
46 | subject = speB,
47 | is_subject_db = FALSE,
48 | format = "fasta",
49 | sensitivity_mode = model,
50 | out_format = "csv",
51 | evalue = evalue,
52 | max_target_seqs = max,
53 | cores = cpu,
54 | hard_mask = TRUE,
55 | diamond_exec_path = "",
56 | add_makedb_options = NULL,
57 | add_diamond_options = NULL,
58 | output_path = getwd()
59 | )
60 |
61 | write.csv(rec_best_hits,file=output,quote=F,row.names = F)
62 |
-------------------------------------------------------------------------------- /deal_gff/gff.simple/EVMtoBGI.py: --------------------------------------------------------------------------------
1 | import sys
2 | from collections import OrderedDict
3 | from types import SimpleNamespace
4 |
5 | class Record(object):
6 | def __init__(self, line):
7 | super(Record, self).__init__()
8 | self.record = line.strip('\n')
9 | lst = self.record.split('\t')
10 | self.nonA = lst[:-1]
11 | self.length = abs(int(lst[3]) - int(lst[4])) + 1
12 | self.feature = lst[2]
13 | if self.feature in ['transcript', 'primary_transcript']:
14 | self.feature = 'mRNA'
15 | attribute = {k:v for k, v in [x.split('=') for x in lst[8].split(';') if x]}
16 | attribute.update({'record':lst[8]})
17 | self.attribute = SimpleNamespace(**attribute)
18 | def __str__(self):
19 | if self.feature == 'mRNA':
20 | attribute = {'record':'='.join(['ID', self.attribute.ID]),
21 | 'ID':self.attribute.ID}
22 | elif self.feature == 'CDS':
23 | attribute = {'record':'='.join(['Parent', self.attribute.Parent]),
24 | 'Parent':self.attribute.Parent}
25 | self.attribute = SimpleNamespace(**attribute)
26 | formal = self.nonA + [self.attribute.record + ';\n']
27 | return 
'\t'.join(formal) 28 | 29 | def makedict(d, k, v): 30 | if d.get(k): 31 | d[k].append(v) 32 | else: 33 | d[k] = [v] 34 | return d 35 | 36 | def getbest(infile): 37 | gene = OrderedDict() 38 | cds = OrderedDict() 39 | with open(infile) as r: 40 | for line in r: 41 | if not line.startswith("#") and not line.startswith("\n"): 42 | r = Record(line) 43 | if r.feature == 'mRNA': 44 | gene = makedict(gene, r.attribute.Parent, r) 45 | elif r.feature == 'CDS': 46 | cds = makedict(cds, r.attribute.Parent, r) 47 | 48 | best = OrderedDict() 49 | for geneID, mrnas in gene.items(): 50 | cdslen = [(x, sum([c.length for c in cds[x.attribute.ID]])) for x in mrnas if cds.get(x.attribute.ID)] 51 | mrnas = sorted(cdslen, key=lambda x:x[1], reverse=True) 52 | #mrnas = sorted(mrnas, key=lambda x:x.length, reverse=True) 53 | if not mrnas: 54 | continue 55 | best[geneID] = mrnas[0][0] 56 | return best, cds 57 | 58 | if __name__ == "__main__": 59 | if len(sys.argv[1:]) != 1: 60 | sys.stderr.write('usage: python {} ingff > outgff\n'.format(__file__)) 61 | sys.exit() 62 | else: 63 | infile = sys.argv[1] 64 | best, cds = getbest(infile) 65 | #print(len(best)) 66 | for geneID, mrna in best.items(): 67 | sys.stdout.write(str(mrna)) 68 | if cds.get(mrna.attribute.ID): 69 | children = cds[mrna.attribute.ID] 70 | for child in children: 71 | sys.stdout.write(str(child)) 72 | -------------------------------------------------------------------------------- /other/count_directory_num_size.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # Define the output file 4 | output_file="count_size_dir.txt" 5 | 6 | # Define the maximum depth (modifiable variable) 7 | max_depth=6 8 | 9 | # Clear or create the output file 10 | > "$output_file" 11 | 12 | echo "Counting files and their sizes in KB (including hidden files and links, ensuring recursive accumulation beyond max depth) up to depth $max_depth..." 13 | 14 | # Function to recursively count files and aggregate counts and sizes up to the parent 15 | function count_files_and_sizes { 16 | local dir="$1" 17 | local current_depth="$2" 18 | 19 | # Initialize counters for this directory 20 | local file_count=0 21 | local total_size=0 22 | 23 | # Count files and their sizes directly in this directory 24 | local direct_file_count=$(find "$dir" -maxdepth 1 \( -type f -o -type l \) | wc -l) 25 | local direct_size=$(find "$dir" -maxdepth 1 \( -type f -o -type l \) -exec du -b {} + | awk '{sum += $1} END {print sum}') 26 | direct_size=${direct_size:-0} 27 | 28 | # Add direct counts and sizes to the totals 29 | file_count=$((file_count + direct_file_count)) 30 | total_size=$((total_size + direct_size)) 31 | 32 | # If current depth is less than or equal to max depth, process subdirectories 33 | if [ "$current_depth" -lt "$max_depth" ]; then 34 | for subdir in "$dir"/* "$dir"/.*; do 35 | if [ -d "$subdir" ] && [ "$subdir" != "$dir/." ] && [ "$subdir" != "$dir/.." ] && [ ! 
-L "$subdir" ]; then 36 | # Recursively get file counts and sizes from subdirectories 37 | local sub_count_and_size=$(count_files_and_sizes "$subdir" $((current_depth + 1))) 38 | local sub_count=$(echo "$sub_count_and_size" | awk '{print $1}') 39 | local sub_size=$(echo "$sub_count_and_size" | awk '{print $2}') 40 | file_count=$((file_count + sub_count)) 41 | total_size=$((total_size + sub_size)) 42 | fi 43 | done 44 | elif [ "$current_depth" -eq "$max_depth" ]; then 45 | # If at max depth, include all files recursively from this point 46 | local deeper_count=$(find "$dir" -type f -o -type l | wc -l) 47 | local deeper_size=$(find "$dir" -type f -o -type l -exec du -b {} + | awk '{sum += $1} END {print sum}') 48 | deeper_size=${deeper_size:-0} 49 | file_count=$((file_count + deeper_count)) 50 | total_size=$((total_size + deeper_size)) 51 | fi 52 | 53 | # Convert size to KB 54 | local total_size_mb=$(echo "scale=2; $total_size / 1024" | bc) 55 | 56 | # Output this directory 57 | local abs_path=$(realpath "$dir") 58 | echo -e "$file_count\t$total_size_mb\t$abs_path" >> "$output_file" 59 | 60 | # Return the file count and total size for this directory 61 | echo "$file_count $total_size" 62 | } 63 | 64 | # Start counting from the current directory with an initial depth of 1 65 | count_files_and_sizes "." 1 66 | 67 | echo "Counting completed. Results are saved in $output_file." 68 | -------------------------------------------------------------------------------- /genome/Anno_EGAPx/toBGI.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | 3 | import sys 4 | from collections import OrderedDict 5 | from types import SimpleNamespace 6 | 7 | class Record(object): 8 | def __init__(self, line): 9 | super(Record, self).__init__() 10 | self.record = line.strip('\n') 11 | lst = self.record.split('\t') 12 | self.nonA = lst[:-1] 13 | self.length = abs(int(lst[3]) - int(lst[4])) + 1 14 | self.feature = lst[2] 15 | if self.feature in ['transcript', 'primary_transcript']: 16 | self.feature = 'mRNA' 17 | attribute = {k:v for k, v in [x.split('=') for x in lst[8].split(';') if x]} 18 | attribute.update({'record':lst[8]}) 19 | self.attribute = SimpleNamespace(**attribute) 20 | def __str__(self): 21 | if self.feature == 'mRNA': 22 | attribute = {'record':'='.join(['ID', self.attribute.ID]), 23 | 'ID':self.attribute.ID} 24 | elif self.feature == 'CDS': 25 | attribute = {'record':'='.join(['Parent', self.attribute.Parent]), 26 | 'Parent':self.attribute.Parent} 27 | self.attribute = SimpleNamespace(**attribute) 28 | formal = self.nonA + [self.attribute.record + ';\n'] 29 | return '\t'.join(formal) 30 | 31 | def makedict(d, k, v): 32 | if d.get(k): 33 | d[k].append(v) 34 | else: 35 | d[k] = [v] 36 | return d 37 | 38 | def getbest(infile): 39 | gene = OrderedDict() 40 | cds = OrderedDict() 41 | with open(infile) as r: 42 | for line in r: 43 | if not line.startswith("#") and not line.startswith("\n"): 44 | r = Record(line) 45 | if r.feature == 'mRNA': 46 | gene = makedict(gene, r.attribute.Parent, r) 47 | elif r.feature == 'CDS': 48 | cds = makedict(cds, r.attribute.Parent, r) 49 | 50 | best = OrderedDict() 51 | for geneID, mrnas in gene.items(): 52 | cdslen = [(x, sum([c.length for c in cds[x.attribute.ID]])) for x in mrnas if cds.get(x.attribute.ID)] 53 | mrnas = sorted(cdslen, key=lambda x:x[1], reverse=True) 54 | #mrnas = sorted(mrnas, key=lambda x:x.length, reverse=True) 55 | if not mrnas: 56 | continue 57 | best[geneID] = mrnas[0][0] 58 | return best, 
cds 59 | 60 | if __name__ == "__main__": 61 | if len(sys.argv[1:]) != 1: 62 | sys.stderr.write('usage: python {} ingff > outgff\n'.format(__file__)) 63 | sys.exit() 64 | else: 65 | infile = sys.argv[1] 66 | best, cds = getbest(infile) 67 | #print(len(best)) 68 | for geneID, mrna in best.items(): 69 | sys.stdout.write(str(mrna)) 70 | if cds.get(mrna.attribute.ID): 71 | children = cds[mrna.attribute.ID] 72 | for child in children: 73 | sys.stdout.write(str(child)) 74 | 75 | -------------------------------------------------------------------------------- /deal_gff/pick_longest_gene/toBGI.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | 3 | import sys 4 | from collections import OrderedDict 5 | from types import SimpleNamespace 6 | 7 | class Record(object): 8 | def __init__(self, line): 9 | super(Record, self).__init__() 10 | self.record = line.strip('\n') 11 | lst = self.record.split('\t') 12 | self.nonA = lst[:-1] 13 | self.length = abs(int(lst[3]) - int(lst[4])) + 1 14 | self.feature = lst[2] 15 | if self.feature in ['transcript', 'primary_transcript']: 16 | self.feature = 'mRNA' 17 | attribute = {k:v for k, v in [x.split('=') for x in lst[8].split(';') if x]} 18 | attribute.update({'record':lst[8]}) 19 | self.attribute = SimpleNamespace(**attribute) 20 | def __str__(self): 21 | if self.feature == 'mRNA': 22 | attribute = {'record':'='.join(['ID', self.attribute.ID]), 23 | 'ID':self.attribute.ID} 24 | elif self.feature == 'CDS': 25 | attribute = {'record':'='.join(['Parent', self.attribute.Parent]), 26 | 'Parent':self.attribute.Parent} 27 | self.attribute = SimpleNamespace(**attribute) 28 | formal = self.nonA + [self.attribute.record + ';\n'] 29 | return '\t'.join(formal) 30 | 31 | def makedict(d, k, v): 32 | if d.get(k): 33 | d[k].append(v) 34 | else: 35 | d[k] = [v] 36 | return d 37 | 38 | def getbest(infile): 39 | gene = OrderedDict() 40 | cds = OrderedDict() 41 | with open(infile) as r: 42 | for line in r: 43 | if not line.startswith("#") and not line.startswith("\n"): 44 | r = Record(line) 45 | if r.feature == 'mRNA': 46 | gene = makedict(gene, r.attribute.Parent, r) 47 | elif r.feature == 'CDS': 48 | cds = makedict(cds, r.attribute.Parent, r) 49 | 50 | best = OrderedDict() 51 | for geneID, mrnas in gene.items(): 52 | cdslen = [(x, sum([c.length for c in cds[x.attribute.ID]])) for x in mrnas if cds.get(x.attribute.ID)] 53 | mrnas = sorted(cdslen, key=lambda x:x[1], reverse=True) 54 | #mrnas = sorted(mrnas, key=lambda x:x.length, reverse=True) 55 | if not mrnas: 56 | continue 57 | best[geneID] = mrnas[0][0] 58 | return best, cds 59 | 60 | if __name__ == "__main__": 61 | if len(sys.argv[1:]) != 1: 62 | sys.stderr.write('usage: python {} ingff > outgff\n'.format(__file__)) 63 | sys.exit() 64 | else: 65 | infile = sys.argv[1] 66 | best, cds = getbest(infile) 67 | #print(len(best)) 68 | for geneID, mrna in best.items(): 69 | sys.stdout.write(str(mrna)) 70 | if cds.get(mrna.attribute.ID): 71 | children = cds[mrna.attribute.ID] 72 | for child in children: 73 | sys.stdout.write(str(child)) 74 | 75 | -------------------------------------------------------------------------------- /deal_fasta/fa2phy/fasta2phylip.pl: -------------------------------------------------------------------------------- 1 | #! 
/usr/bin/perl -w
2 |
3 | ######################################################################################
4 | # This script takes alignment sequence fasta file and converts it to phylip file
5 | # Author: Wenjie Deng
6 | # Date: 2007-01-29
7 | # Usage: perl fasta2phylip.pl <infile> <outfile>
8 | ######################################################################################
9 | use strict;
10 |
11 | my $usage = "Usage: perl fasta2phylip.pl <infile> <outfile>\n";
12 | my $infile = shift or die($usage); # input fasta file
13 | my $outFile = shift or die($usage); # output phylip file
14 | my $unixFile = $infile.".unix";
15 |
16 | ConvertToUnix ($infile, $unixFile);
17 | ChangetoPhylip($unixFile, $outFile);
18 | unlink ($unixFile);
19 | print "All done!\n";
20 |
21 | exit 0;
22 |
23 |
24 | ######################################################################################
25 | sub ConvertToUnix {
26 | my ($infile, $unixFile) = @_;
27 | open (IN, $infile) or die "Couldn't open $infile: $!\n";
28 | open (OUT, ">$unixFile") or die "Couldn't open $unixFile: $!\n";
29 | my @buffer = <IN>;
30 | close IN;
31 | my $line = "";
32 | foreach my $element (@buffer) {
33 | $line .= $element;
34 | }
35 | if ($line =~ /\r\n/) {
36 | $line =~ s/\r//g;
37 | }elsif ($line =~ /\r/) {
38 | $line =~ s/\r/\n/g;
39 | }
40 | print OUT $line;
41 | close OUT;
42 | }
43 |
44 |
45 | ######################################################################################
46 | sub ChangetoPhylip {
47 | my ($unixFile, $phylipFile) = @_;
48 | my $seqCount = 0;
49 | my $seq = my $seqName = "";
50 | open IN, $unixFile or die "Couldn't open $unixFile\n";
51 | while (my $line = <IN>) {
52 | chomp $line;
53 | next if $line =~ /^\s*$/;
54 | if ($line =~ /^>/) {
55 | $seqCount++;
56 | }elsif ($seqCount == 1) {
57 | $seq .= $line;
58 | }
59 | }
60 | close IN;
61 | my $seqLen = length $seq;
62 |
63 | open(IN, $unixFile) || die "Can't open $unixFile\n";
64 | open(OUT, ">$phylipFile") || die "Cant open $phylipFile\n";
65 | print OUT $seqCount," ",$seqLen,"\n";
66 | $seqCount = 0;
67 | $seq = "";
68 | while(my $line = <IN>) {
69 | chomp $line;
70 | next if($line =~ /^\s*$/);
71 |
72 | if($line =~ /^>(\S+)/) {
73 | if ($seqCount) {
74 | my $len = length $seq;
75 | if ($len == $seqLen) {
76 | print OUT "$seqName\t$seq\n";
77 | $seq = $seqName = "";
78 | }else {
79 | unlink $unixFile;
80 | unlink $phylipFile;
81 | die "Error: the sequence length of $seqName is not same as others.\n";
82 | }
83 | }
84 | $seqName = $1;
85 | $seqCount++;
86 | }else {
87 | $seq .= $line;
88 | }
89 | }
90 | close IN;
91 | # check the length of last sequence
92 | my $len = length $seq;
93 | if ($len == $seqLen) {
94 | print OUT "$seqName\t$seq\n";
95 | }else {
96 | unlink $unixFile;
97 | unlink $phylipFile;
98 | die "Error: the sequence length of $seqName is not same as others.\n";
99 | }
100 | close IN;
101 | close OUT;
102 | }
103 |
-------------------------------------------------------------------------------- /genome/Anno_integrate/pasa.sh: --------------------------------------------------------------------------------
1 | #!/bin/bash
2 | set +o posix
3 |
4 | ### you need to copy the *config files to your work path and edit them !!!!!!!!!!! 
5 |
6 | ### export env
7 | PASAPIPELINE=/01_soft/PASApipeline.v2.5.3
8 | export PATH=${PASAPIPELINE}:${PASAPIPELINE}/scripts/:${PASAPIPELINE}/misc_utilities/:$PATH
9 | unset PERL5LIB; export PATH=/softs/perl-5.30.2/bin:$PATH
10 | export PATH=/blast-2.2.26/bin/:$PATH ### blast
11 | export PATH=/gmap-2021-08-25/bin:$PATH ### gmap
12 | export PATH=${PASAPIPELINE}/bin:$PATH ### minimap2 pblat blat and other dependency softwares
13 | export PATH=/01_software/TransDecoder-TransDecoder-v5.5.0/:$PATH ### TransDecoder
14 | export PATH=/01_soft/PASApipeline.v2.5.3/bin:$PATH
15 |
16 | ### change parameter
17 | genome=HCS_chr.fa
18 | gff=HCS.EVM.bgi.filter.gff
19 | gff3=${gff}.gff3
20 | trans=trans.cdhit.rename.fa
21 | trans_clean=${trans}.clean
22 | cpu=73
23 | max_intron_length=2000000
24 | config1=alignAssembly.config
25 | config2=annotCompare.config
26 | align_software=minimap2 ### gmap blat minimap2 pblat
27 | stringent_alignment_overlap=30 ### overlapping transcripts must have this min % overlap to be clustered.
28 | gene_overlap=50 ### transcripts overlapping existing gene annotations are clustered. Intergenic alignments are clustered by default mechanism.
29 |
30 | ### step1: clean trans
31 | mkdir step1_clean
32 | cd step1_clean
33 | ln -s ../${trans} ./
34 | ${PASAPIPELINE}/bin/seqclean $trans -c 15 -v /00_tools/Clean-fasta/UniVec
35 | cd ../
36 | ln -s step1_clean/${trans}* ./
37 |
38 | ### step2: align
39 | perl change_gff_format.pl ${gff} ${gff3}
40 | ${PASAPIPELINE}/bin/samtools faidx ./${trans}
41 | ${PASAPIPELINE}/Launch_PASA_pipeline.pl --config $config1 --annot_compare --ALT_SPLICE --create --replace --run --genome ./$genome --transcripts ./$trans_clean --ALIGNERS $align_software -T -u ./$trans --CPU $cpu --MAX_INTRON_LENGTH ${max_intron_length} --TRANSDECODER --stringent_alignment_overlap ${stringent_alignment_overlap} --annots ${gff3}
42 |
43 | ## step3: compare
44 | ${PASAPIPELINE}/scripts/build_comprehensive_transcriptome.dbi -c $config1 -t $trans_clean --min_per_ID 95 --min_per_aligned 30
45 | ## annot_compare_R1
46 | ${PASAPIPELINE}/Launch_PASA_pipeline.pl -c $config2 -g ${genome} -t $trans_clean -A -L --annots $gff3 --CPU $cpu
47 | ## annot_compare_R2 ### Attention here
48 | recent_update_file=$(ls -t *gene_structures_post_PASA_updates.*.gff3 | head -n 1)
49 | ${PASAPIPELINE}/Launch_PASA_pipeline.pl -c $config2 -g ${genome} -t $trans_clean -A -L --annots $recent_update_file --CPU $cpu
50 |
51 | ## step4: alt_splice_analysis
52 | ${PASAPIPELINE}/Launch_PASA_pipeline.pl -c $config2 -g ${genome} -t $trans_clean --CPU $cpu --ALT_SPLICE
53 |
54 | ## step5: find_orfs_in_pasa_assemblies ### Attention here
55 | DBname_assemblies_fasta=$(ls *assemblies.fasta)
56 | DBname_pasa_assemblies_gff3=$(ls *pasa_assemblies.gff3)
57 | ${PASAPIPELINE}/scripts/pasa_asmbls_to_training_set.dbi --pasa_transcripts_fasta $DBname_assemblies_fasta --pasa_transcripts_gff3 $DBname_pasa_assemblies_gff3
58 |
-------------------------------------------------------------------------------- /transcriptome/Enrich/AnnotationForge_20250117.R: --------------------------------------------------------------------------------
1 | #!/01_software/miniconda3/envs/R-4.0/bin/Rscript
2 |
3 | ### https://www.jianshu.com/p/45f1e8c9b79c
4 |
5 | ### help doc
6 | library(optparse)
7 | option_list <- list(
8 | make_option(c("-v", "--info"), type = "character", default=F, metavar="info",
9 | help="Build OrgDb packages for non-model organisms.\n\t\t!!! 
Example: ./AnnotationForge.R -i eggNog.anno.txt -a LS -m shiyeyishang@outlook.com -g Cynoglossus -s se -d 244447 !!!"), 10 | make_option(c("-i", "--input"), type = "character", default=NULL, 11 | help="annotation from eggNOG"), 12 | make_option(c("-a", "--author"), type="character", default=NULL, 13 | help="author"), 14 | make_option(c("-m", "--mail"), type="character", default=NULL, 15 | help="e-mail"), 16 | make_option(c("-g", "--genus"), type="character", default=NULL, 17 | help="genus"), 18 | make_option(c("-s", "--species"), type="character", default=NULL, 19 | help="species"), 20 | make_option(c("-d", "--taxid"), type="character", default=NULL, 21 | help="Taxonomy ID from NCBI") 22 | ) 23 | 24 | opt_parser = OptionParser(option_list=option_list); 25 | opt = parse_args(opt_parser); 26 | 27 | if (is.null(opt$input)){ 28 | print_help(opt_parser)} 29 | 30 | infile = opt$input 31 | author_name = opt$author 32 | e_mail = opt$mail 33 | Genus = opt$genus 34 | Species = opt$species 35 | Taxid = opt$taxid 36 | 37 | library(tidyverse) 38 | library(AnnotationForge) 39 | emapper <- read.delim(infile) %>% 40 | # mutate(Description = if_else(Description != "-", Description, PFAMs)) %% 41 | dplyr::select(GID = query, Gene_Symbol = Preferred_name, 42 | GO = GOs, KO = KEGG_ko, Pathway = KEGG_Pathway, 43 | OG = eggNOG_OGs, Gene_Name = Description, pfam = PFAMs) 44 | 45 | gene_info <- dplyr::select(emapper,GID,Gene_Name) %>% 46 | dplyr::filter(!is.na(Gene_Name)) 47 | 48 | gene2go <- dplyr::select(emapper,GID,GO) %>% 49 | separate_rows(GO, sep = ",", convert = F) %>% 50 | filter(GO!="NA",!is.na(GO)) %>% 51 | mutate(EVIDENCE = 'A') 52 | 53 | gene2ko<- dplyr::select(emapper,GID,KO) %>% 54 | separate_rows(KO, sep = ",", convert = F) %>% 55 | dplyr::filter(KO!="NA",!is.na(KO)) 56 | 57 | gene2pathway<- dplyr::select(emapper,GID,Pathway) %>% 58 | separate_rows(Pathway, sep = ",", convert = F) %>% 59 | dplyr::filter(!is.na(Pathway)) 60 | 61 | gene2symbol<- dplyr::select(emapper,GID,Gene_Symbol) %>% 62 | dplyr::filter(!is.na(Gene_Symbol)) 63 | 64 | AnnotationForge::makeOrgPackage(gene_info=gene_info, 65 | go=gene2go, 66 | ko=gene2ko, 67 | pathway=gene2pathway, 68 | symbol=gene2symbol, 69 | maintainer=e_mail, 70 | author=author_name, 71 | version="0.1", 72 | outputDir=".", 73 | tax_id=Taxid, 74 | genus=Genus, 75 | species=Species, 76 | goTable = "go") 77 | -------------------------------------------------------------------------------- /deal_gff/agp2gff/agp2gff.py: -------------------------------------------------------------------------------- 1 | import sys 2 | from collections import defaultdict 3 | 4 | def agp(agp_file): 5 | agp_dict = defaultdict(list) 6 | with open(agp_file) as f: 7 | for line_raw in f: 8 | line = line_raw.strip().split() 9 | if line_raw.startswith("#") or line[4] == "N": 10 | continue 11 | hic_scaffold,hic_start,hic_end,*ignore,scaffold,sca_start,sca_end,strand=line 12 | agp_dict[scaffold].append([[int(sca_start),int(sca_end)],hic_scaffold,int(hic_start),strand]) 13 | return agp_dict 14 | 15 | def agp2gff(gff_file,agp_dict): 16 | out_new_gff = open("new_gff.txt","w") 17 | break_gene = open("break_gene.txt","w") 18 | not_in_cprops = open("not_in_cprops.txt","w") 19 | print('##gff-version 3', file=out_new_gff) 20 | with open(gff_file) as f: 21 | flag_gene = 1 22 | for line_raw in f: 23 | if line_raw.startswith('#'): 24 | continue 25 | line = line_raw.strip().split('\t') 26 | scaffold = line[0] 27 | gene_start,gene_end = (int(line[3]),int(line[4])) 28 | gene_strand = line[6] 29 | gene_type = 
line[2].lower()
30 | if scaffold in agp_dict:
31 | flag = 0
32 | for l in agp_dict[scaffold]:
33 | ScaInAgp_start = l[0][0]
34 | ScaInAgp_end = l[0][1]
35 | if gene_type == "mrna":
36 | flag_gene = 1
37 | if gene_start >= ScaInAgp_start and gene_end <= ScaInAgp_end and flag_gene == 1:
38 | _,hic_scaffold,hic_start,hic_strand = l
39 | line[0] = hic_scaffold
40 | line[6] = "+" if gene_strand == hic_strand else "-"
41 | # if the strand in the agp matches the strand in the gff, the new gff record is "+"; otherwise "-"
42 | if hic_strand == "-":
43 | gene_true_start = hic_start + ScaInAgp_end - gene_end
44 | gene_true_end = gene_true_start + gene_end - gene_start
45 | else:
46 | gene_true_start = hic_start + gene_start - ScaInAgp_start
47 | gene_true_end = gene_true_start + gene_end - gene_start
48 | # if the agp strand is "-", the gene's start/end on the HiC scaffold are reverse-complemented: hic start plus (scaffold end minus gene end)
49 | line[3] = str(gene_true_start)
50 | line[4] = str(gene_true_end)
51 | print("\t".join(line),file=out_new_gff)
52 | flag = 1
53 | if flag == 0:
54 | if gene_type == "mrna":
55 | flag_gene = 0
56 | print(scaffold,gene_start,gene_end,"is not in ",[l[0] for l in agp_dict[scaffold]],file=break_gene)
57 | # print("Error:","gene is not in",scaffold,str(gene_start),str(gene_end))
58 | else:
59 | print(scaffold,"is not in cprops",file=not_in_cprops)
60 | # print("Error:",scaffold,"is not in agp file")
61 |
62 | if not sys.argv[1:]:
63 | sys.stderr.write('Usage: {} agp gff\n'.format(__file__))
64 | sys.exit()
65 | agp_dict=agp(sys.argv[1])
66 | agp2gff(sys.argv[2],agp_dict)
67 |
-------------------------------------------------------------------------------- /deal_fasta/rename/rename.fa.py: --------------------------------------------------------------------------------
1 | # Usage
2 | import argparse
3 | from argparse import RawTextHelpFormatter
4 | import csv
5 | from Bio import SeqIO
6 | from Bio.SeqRecord import SeqRecord
7 | from io import StringIO
8 | import os
9 | import sys
10 | import re
11 |
12 | # Functions
13 | # Log a message to stderr
14 | def msg(*args, **kwargs):
15 | print(*args, file=sys.stderr, **kwargs)
16 |
17 | # Log an error to stderr and quit with non-zero error code
18 | def err(*args, **kwargs):
19 | msg(*args, **kwargs)
20 | sys.exit(1)
21 |
22 | # Check file exists
23 | def check_file(f):
24 | return os.path.isfile(f)
25 |
26 | # Check if file is in FASTA format
27 | def check_fasta(f):
28 | if not os.path.isfile(f) or os.path.getsize(f) < 1:
29 | return False
30 | with open(f, 'r') as fasta:
31 | if fasta.readline()[0] != '>': # Check if header starts with ">"
32 | return False
33 | for line in fasta:
34 | line = line.strip()
35 | if not line or line[0] == '>':
36 | continue
37 | # if bool(re.search('[^ACTGactgNn?\-]', line)): # Check if there are non-nucleotide characters in sequence
38 | # return False
39 | return True
40 |
41 | def tab2dict(tab, sep):
42 | dict = {}
43 | with open(tab, mode='r') as file:
44 | table = csv.reader(file, delimiter=sep)
45 | for row in table:
46 | dict[row[0]] = row[1]
47 | return dict
48 |
49 | parser = argparse.ArgumentParser(
50 | formatter_class=RawTextHelpFormatter,
51 | description='Rename headers/sequence IDs in multi-FASTA file\n',
52 | usage='\n %(prog)s --ids new_names.txt FASTA > new.fasta')
53 | parser.add_argument('fasta', metavar='FASTA', nargs=1, help='original FASTA file')
54 | parser.add_argument('--ids', metavar='FILE', required=True, nargs=1, help='specify tab-separated file with [oldnames] [newnames]')
55 | parser.add_argument('--out', metavar='FILE', nargs=1, help='specify output file (default = stdout)')
56 | parser.add_argument('--version', action='version', version='%(prog)s v0.1')
57 | args = parser.parse_args()
58 |
59 | # Check input/output files
60 | if not check_file(args.fasta[0]):
61 | err('ERROR: Cannot find "{}". Check file exists in the specified directory.'.format(args.fasta[0]))
62 | if not check_fasta(args.fasta[0]):
63 | err('ERROR: Check "{}" is in FASTA format.'.format(args.fasta[0]))
64 | if not check_file(args.ids[0]):
65 | err('ERROR: Cannot find "{}". Check file exists in the specified directory.'.format(args.ids[0]))
66 | if args.out:
67 | if check_file(args.out[0]):
68 | err('ERROR: "{}" already exists.'.format(args.out[0]))
69 |
70 | # Rename sequence IDs
71 | newseqs = []
72 | new_names = tab2dict(args.ids[0], '\t')
73 | for record in SeqIO.parse(args.fasta[0], 'fasta'):
74 | newid = new_names[record.id]
75 | newseqs.append(SeqRecord(record.seq, id=newid, description=''))
76 |
77 | # Write renamed sequences to file or print to stdout
78 | if args.out:
79 | msg('Renamed sequences saved to "{}" ... '.format(args.out[0]))
80 | SeqIO.write(newseqs, args.out[0], 'fasta')
81 | else:
82 | seqFILE = StringIO()
83 | SeqIO.write(newseqs, seqFILE, 'fasta')
84 | output = seqFILE.getvalue().rstrip()
85 | print(output)
86 |
87 | sys.exit(0)
88 |
-------------------------------------------------------------------------------- /deal_fasta/agp2fa/ragtag_agp2fa.py: --------------------------------------------------------------------------------
1 | #!/01_software/conda/envs/ragtag/bin/python3.6
2 |
3 | """
4 | MIT License
5 |
6 | Copyright (c) 2021 Michael Alonge
7 |
8 | Permission is hereby granted, free of charge, to any person obtaining a copy
9 | of this software and associated documentation files (the "Software"), to deal
10 | in the Software without restriction, including without limitation the rights
11 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
12 | copies of the Software, and to permit persons to whom the Software is
13 | furnished to do so, subject to the following conditions:
14 |
15 | The above copyright notice and this permission notice shall be included in all
16 | copies or substantial portions of the Software.
17 |
18 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
19 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
20 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
21 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
22 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
23 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
24 | SOFTWARE. 
25 | """ 26 | 27 | import sys 28 | import argparse 29 | 30 | import pysam 31 | 32 | from ragtag_utilities.utilities import reverse_complement 33 | from ragtag_utilities.AGPFile import AGPFile 34 | 35 | 36 | def main(): 37 | parser = argparse.ArgumentParser(description="Build sequences in FASTA format from an AGP v2.1 file.", usage="ragtag.py agp2fa ") 38 | parser.add_argument("agp", metavar="", nargs='?', default="", type=str, help="AGP v2.1 file") 39 | parser.add_argument("components", metavar="", nargs='?', default="", type=str, help="component FASTA file (can be uncompressed or bgzipped)") 40 | 41 | args = parser.parse_args() 42 | if not args.agp or not args.components: 43 | parser.print_help() 44 | sys.exit() 45 | 46 | agp_file = args.agp 47 | components_file = args.components 48 | 49 | fai = pysam.FastaFile(components_file) 50 | agp = AGPFile(agp_file, mode="r") 51 | 52 | # Iterate over the lines of the AGP file 53 | prev_obj = None 54 | is_first = True 55 | for agp_line in agp.iterate_lines(): 56 | if agp_line.obj != prev_obj: 57 | if is_first: 58 | print(">" + agp_line.obj) 59 | is_first = False 60 | else: 61 | print("\n>" + agp_line.obj) 62 | 63 | prev_obj = agp_line.obj 64 | 65 | if agp_line.is_gap: 66 | sys.stdout.write("N"*agp_line.gap_len) 67 | else: 68 | if agp_line.orientation != "-": 69 | sys.stdout.write(str(fai.fetch(agp_line.comp, agp_line.comp_beg-1, agp_line.comp_end))) 70 | # sys.stdout.write(reverse_complement(fai.fetch(agp_line.comp, agp_line.comp_beg-1, agp_line.comp_end))) 71 | else: 72 | sys.stdout.write(reverse_complement(str(fai.fetch(agp_line.comp, agp_line.comp_beg-1, agp_line.comp_end)))) 73 | # sys.stdout.write(fai.fetch(agp_line.comp, agp_line.comp_beg-1, agp_line.comp_end)) 74 | 75 | # End the FASTA file with a newline 76 | sys.stdout.write("\n") 77 | fai.close() 78 | 79 | 80 | if __name__ == "__main__": 81 | main() 82 | -------------------------------------------------------------------------------- /transcriptome/Enrich/eggnog-2.1.9.sh: -------------------------------------------------------------------------------- 1 | ## GO and KEGG annotation using diamond by eggnog-mapper 2 | ### 1. make XXX.wego to GO-enrich 3 | ### 2. make XXX.KO to KEGG-enrich 4 | ### 3. make gene symbol annotation file 5 | ### Attention!!! 
5 | ### Attention!!! qsub vf>8G
6 |
7 | # source /dellfsqd2/ST_OCEAN/USER/lishuo1/11_env/bashrc-17-1.txt
8 | export PATH="/01_software/eggnog-mapper-2.1.9/eggnogmapper/bin:$PATH"
9 |
10 | query=$PWD/H_moli.bgi.gff.pep
11 | cpu=10
12 | out_name=eggnog-result
13 | output_dir=$PWD
14 | temp_dir=$PWD
15 | database=/01_software/eggnog-mapper-2.1.4-main_spec/data/
16 | main_script=/01_software/eggnog-mapper-2.1.9/emapper.py
17 | software=diamond ## diamond,mmseqs,hmmer
18 | evalue=0.001
19 | sensmode=ultra-sensitive ## for diamond: fast,mid-sensitive,sensitive,more-sensitive,very-sensitive,ultra-sensitive
20 |
21 | /01_software/miniconda3/bin/python3 $main_script --cpu $cpu --data_dir $database -o $out_name --output_dir $output_dir --temp_dir $temp_dir --override -m $software -i $query --tax_scope auto --target_orthologs all --go_evidence all --pfam_realign none --report_orthologs --decorate_gff yes --evalue ${evalue} --scratch_dir $PWD --sensmode ${sensmode}
22 |
23 | grep -v -e "#" -e "query" *.emapper.annotations | awk -F "\t" '{print $1"\t"$10}' | sed 's/"//g;s/-//g;s/,/\t/g' > anno.wego
24 | grep -v -e "#" *.emapper.annotations | awk -F "\t" '{print $1"\t"$9"\t"$NF"\t"$8}' | sed 's/"//g' > simple.anno
25 | grep -v -e "#" -e "query" *.emapper.annotations | awk -F "\t" '{print $1"\t"$10}' | awk '$2!~/-/' | sed 's/"//g;s/,/;/g;s/\t/,/g' | /lishuo1/00_tools/csvtk unfold -H -f 2 -s ";" | sed 's/,/\t/g' > anno.unfold.wego
26 | grep -v -e "#" -e "query" *.emapper.annotations | awk -F "\t" '{print $1"\t"$12}' | awk '$2!~/-/' | sed 's/ko://1;s/,ko:/;/g;s/\t/,/g' | /lishuo1/00_tools/csvtk unfold -H -f 2 -s ";" | sed 's/,/\t/g' > anno.unfold.KO
27 | grep -v -e "#" -e "query" *.emapper.annotations | awk -F "\t" '{print $1"\t"$12}' | awk '$2!~/-/' | sed 's/ko://1;s/,ko:/,/g' > anno.fold.KO
28 | /01_software/miniconda3/bin/python /01_soft/kofam_scan-1.3.0/kofamscan_plus.py -K /01_soft/kofam_scan-1.3.0/ko00001.keg -i anno.unfold.KO -o kegg.all.xls
29 | awk -F "\t" '{print $1"\t"$4"\t"$5}' kegg.all.xls > gene2pathway.txt
30 | cut -f 1,3 kegg.all.xls | sed '1d;s/; /;/g' | awk -F "\\\[EC" '{print $1}' | awk '!a[$0]++' | /lishuo1/00_tools/csvtk -t fold -H -f 1 -v 2 -s " ||| " | awk -F "\t" 'NR==FNR{a[$1]=$2}NR!=FNR{print $0"\t"a[$1]}' - simple.anno | /lishuo1/00_tools/csvtk -t add-header -n ID,Symbol,Domain,Description,Symbol_KEGG > simple.anno.new
31 | rm simple.anno
32 |
33 | ### Output notes: eggnog-mapper produces three files
34 | ### [project_name].emapper.hmm_hits: records every significant eggNOG Orthologous Group (OG) hit for each query sequence; a "-" means no candidate OG was found for that sequence
35 | ### [project_name].emapper.seed_orthologs: records the best OG for each query, i.e. the top-scoring hit in [project_name].emapper.hmm_hits; finer-grained orthology relationships are then extracted from eggNOG
36 | ### [project_name].emapper.annotations: the final annotation result. Most of what you need can be extracted from it with a small script; it has 13 columns, which record the following:
37 | ### query_name: the query gene name or other ID
38 | ### seed_eggNOG_ortholog: best protein match in eggNOG
39 | ### seed_ortholog_evalue: e-value of the best match
40 | ### seed_ortholog_score: bit-score of the best match
41 | ### predicted_gene_name: predicted gene name, i.e. a meaningful symbol such as AP2 rather than an accession such as AT2G17950
42 | ### GO_term: inferred GO terms, not necessarily up to date
43 | ### KEGG_KO: inferred KEGG KO terms, not necessarily up to date
44 | ### BiGG_Reactions: predicted BiGG metabolic reactions
45 | ### Annotation_tax_scope: taxonomic scope applied when annotating this sequence
46 | ### Matching_OGs: matching eggNOG Orthologous Groups
47 | ### best_OG|evalue|score: best matching OG (HMM mode only)
48 | ### COG functional categories: COG functional categories inferred from the best matching OG
49 | ### eggNOG_HMM_model_annotation: eggNOG functional description inferred from the best matching OG
50 |
-------------------------------------------------------------------------------- /transcriptome/Enrich/enrich.r: --------------------------------------------------------------------------------
1 | library(clusterProfiler)
2 | library(org.Trubripes.eg.db)
3 | library(ggplot2)
4 | library(enrichplot)
5 | library(stringr)
6 | args <- commandArgs(T)
7 | dir.create(args[1])
8 | data <- read.table(args[2],header=F)
9 | setwd(args[1])
10 | genes <- as.character(data$V1)
11 | ego <- enrichGO(gene = genes, # list of entrez gene id
12 | OrgDb = org.Trubripes.eg.db, # background: the org package of the species being analysed
13 | keyType = 'GID',
14 | ont = "ALL", # "BP", "MF", "CC" or "ALL": the three GO sub-ontologies.
15 | pAdjustMethod = "BH", # multiple-testing correction: "holm", "hochberg", "hommel", "bonferroni", "BY", "fdr"
16 | pvalueCutoff = 0.05, # enrichment p-value cutoff; default 0.05, use 0.01 to be stricter
17 | qvalueCutoff = 0.2) # enrichment q-value cutoff; default 0.2, use 0.05 to be stricter
18 | go.res <- data.frame(ego)
19 |
20 | goBP <- subset(go.res,subset = (ONTOLOGY == "BP"))[1:15,]
21 | goCC <- subset(go.res,subset = (ONTOLOGY == "CC"))[1:10,]
22 | goMF <- subset(go.res,subset = (ONTOLOGY == "MF"))[1:10,]
23 | go.df <- rbind(goBP,goCC,goMF)
24 |
25 | # keep the plotted GO terms in the same order as the input
26 | go.df$Description <- factor(go.df$Description,levels = rev(go.df$Description))
27 | go_bar <- ggplot(data = go.df, aes(x = Description, y = -log10(pvalue),fill = ONTOLOGY)) +
28 | geom_bar(stat = "identity",width = 0.9) + coord_flip() + theme_bw() +
29 | scale_x_discrete(labels = function(x) str_wrap(x,width = 50)) +
30 | labs(x = "GO terms",y = "-log10(pvalue)",title = "Barplot of Enriched GO Terms") +
31 | theme(axis.title = element_text(size = 13),axis.text = element_text(size = 11),plot.title = element_text(size = 14,hjust = 0.5,face = "bold"),legend.title = element_text(size = 13),legend.text = element_text(size = 11),plot.margin = unit(c(0.5,0.5,0.5,0.5),"cm"))
32 |
33 | go_bar2 <- ggplot(data = go.df, aes(x = Description, y = Count,fill = ONTOLOGY)) +
34 | geom_bar(stat = "identity",width = 0.9)+
35 | coord_flip()+theme_bw()+ # flip the axes and drop the grey background
36 | scale_x_discrete(labels = function(x) str_wrap(x,width = 50))+ # wrap over-long term names
37 | labs(x = "GO terms",y = "Gene number",title = "Barplot of Enriched GO Terms")+ # axis titles and plot title
38 | theme(axis.title = element_text(size = 13), # axis title size
39 | axis.text = element_text(size = 11), # axis label size
40 | plot.title = element_text(size = 14,hjust = 0.5,face = "bold"), # title settings
41 | legend.title = element_text(size = 13), # legend title size
42 | legend.text = element_text(size = 11), # legend label size
43 | plot.margin = unit(c(0.5,0.5,0.5,0.5),"cm")) # plot margins
44 |
45 | pdf("GO_Barplot.pdf",width = 10,height = 10)
46 | go_bar
47 | go_bar2
48 | dev.off()
49 | write.table(go.df,"allGO_gene.xls",sep="\t",quote=F)
50 |
51 | pdf("GO_treeplot.pdf",width=15,height=10)
52 | edox2 <- pairwise_termsim(ego) ###top30BP
53 | treeplot(edox2)
54 | dev.off()
55 | write.table(as.data.frame(edox2)[1:30,],"top30BP_gene.xls",sep="\t",quote=F)
56 |
57 | kegg_anno <- read.table(args[3], sep="\t", header=T)
58 | kegg2gene <- kegg_anno[, c(2, 1)]
59 | kegg2name <- kegg_anno[, c(2, 3)]
60 | kegg <- enricher(genes, TERM2GENE = kegg2gene, TERM2NAME = kegg2name, pAdjustMethod = "BH", pvalueCutoff = 0.05, qvalueCutoff = 0.2)
61 |
62 | pdf("KEGG_Barplot.pdf")
63 | barplot(kegg, showCategory=20, title="Enrichment_KEGG")
64 | dev.off()
65 | write.table(kegg[1:30,],"top30KEGG_gene.xls",sep="\t",quote=F)
66 | pdf("KEGG_treeplot.pdf",width=15,height=10)
67 | edox2 <- pairwise_termsim(kegg) ###top30KEGG
68 | treeplot(edox2)
69 | dev.off()
70 |
-------------------------------------------------------------------------------- /python/msa_4d/readme.md: --------------------------------------------------------------------------------
1 | ## Main features
2 | - Supports three mainstream multiple sequence alignment tools: PRANK, MUSCLE and MAFFT
3 | - Protein-guided, codon-aware DNA alignment
4 | - Automatic extraction of 4D sites (fourfold degenerate sites)
5 | - Multiple strategies for duplicate species: alignment quality, longest sequence, first sequence, etc.
6 | - Multiple strategies for missing data: gap filling, species exclusion, gene exclusion
7 | - Automatic translation of CDS sequences into protein sequences
8 | - Optional trimming of alignments with TrimAl
9 | - Parallel processing for efficiency
10 | - Detailed logging and statistics output
11 |
12 | ## Full parameter list
13 | - Required parameters:
14 | --input_dir DIR directory containing the CDS files (default: ".")
15 |
16 | --output_dir DIR output directory (default: "./output")
17 |
18 | --aligner {prank,muscle,mafft} alignment tool to use (default: "prank")
19 |
20 | --prank_path PATH absolute path to the PRANK executable (required when prank is selected)
21 |
22 | --muscle_path PATH absolute path to the MUSCLE executable (required when muscle is selected)
23 |
24 | --mafft_path PATH absolute path to the MAFFT executable (required when mafft is selected)
25 |
26 | - Common options:
27 | --supergene_output FILE supergene output file name (default: "supergene_4d.fasta")
28 |
29 | --threads N number of parallel threads (default: 4)
30 |
31 | --no_codon_aware disable codon-aware alignment (enabled by default)
32 |
33 | --duplicate_strategy {longest,first,rename,alignment_quality} strategy for handling duplicate species (default: alignment_quality)
34 |
35 | --skip_existing skip processing if the alignment file already exists
36 |
37 | --min_coverage_pct N minimum percentage of genes a species must be present in (default: 50.0%)
38 |
39 | --log_level {DEBUG,INFO,WARNING,ERROR}
40 | set the logging level (default: INFO)
41 |
42 | - TrimAl options:
43 | --use_trimal trim the protein alignments with TrimAl
44 |
45 | --trimal_path PATH absolute path to the TrimAl executable
46 |
47 | --trimal_automated use TrimAl's automated trimming method (default: True)
48 |
49 | --gap_threshold N TrimAl minimum gap threshold
50 |
51 | --consistency_threshold N TrimAl consistency threshold
52 |
53 | --conservation_threshold N TrimAl conservation threshold
54 |
55 | --trim_supergene apply TrimAl to the final protein supergene
56 |
57 | - Advanced options:
58 | --f N PRANK insertion opening probability (default: 0.2)
59 |
60 | --gaprate N PRANK gap opening rate
61 |
62 | --gapext N PRANK gap extension probability
63 |
64 | --use_logs use logarithmic computation in PRANK (for large datasets)
65 |
66 | --penalize_terminal_gaps
67 | penalize terminal gaps normally in PRANK
68 |
69 | --clean_temp clean up temporary files after processing (default: True)
70 |
71 | --create_protein_msa create multiple sequence alignments of the protein sequences
72 |
73 | ## Main result files
74 | 4d_sites/supergene_4d_*.fasta: 4D-site supergene, for phylogenetic analysis
75 |
76 | full_cds/supergene_full_*.fasta: supergene of the full CDS sequences
77 |
78 | proteins/supergene_protein_*.fasta: supergene of the translated protein sequences
79 |
80 | stats/species_coverage_matrix_*.tsv: species/gene coverage matrix showing each species' presence in every gene
81 |
82 | ## Output
83 | output_dir/
84 | ├── 4d_sites/ # 4D-site sequences
85 | │ ├── supergene_4d_gaps.fasta # supergene built with the gap-filling strategy
86 | │ ├── supergene_4d_exclude_species.fasta # supergene built with the species-exclusion strategy
87 | │ └── supergene_4d_exclude_genes.fasta # supergene built with the gene-exclusion strategy
88 | ├── alignments/ # per-gene alignment results
89 | │ ├── gene1.best.fas
90 | │ ├── gene2.best.fas
91 | │ └── ... 
92 | ├── full_cds/ # supergenes of the full CDS sequences
93 | │ ├── supergene_full_gaps.fasta
94 | │ ├── supergene_full_exclude_species.fasta
95 | │ └── supergene_full_exclude_genes.fasta
96 | ├── proteins/ # translated protein sequences
97 | │ ├── supergene_protein_gaps.fasta
98 | │ ├── supergene_protein_exclude_species.fasta
99 | │ └── supergene_protein_exclude_genes.fasta
100 | ├── protein_msa/ # protein MSA results (with --create_protein_msa; this option is deprecated, the results are already in proteins/)
101 | │ ├── gene1_protein_msa.fasta
102 | │ ├── gene2_protein_msa.fasta
103 | │ ├── supergene_protein_msa_gaps.fasta
104 | │ └── ...
105 | ├── stats/ # statistics
106 | │ ├── species_coverage_matrix_gaps.tsv
107 | │ ├── species_coverage_matrix_exclude_species.tsv
108 | │ └── species_coverage_matrix_exclude_genes.tsv
109 | └── temp/ # temporary files (may be deleted after processing)
110 |
-------------------------------------------------------------------------------- /picture/GC_depth/depth_gc.r: --------------------------------------------------------------------------------
1 | library(ggplot2)
2 | library(grid)
3 |
4 | # read the input file
5 | depth_gc <- read.delim('Depth_GC.txt')
6 | #seq start end Depth GC
7 | #chr1 0 2000 4.6190000 0.491000
8 | #chr1 2000 4000 15.8795004 0.502000
9 | #chr1 4000 6000 3.4749999 0.501500
10 | #chr1 6000 8000 3.5050001 0.501500
11 |
12 | # median GC content (as a percentage)
13 | depth_gc$GC <- 100 * depth_gc$GC
14 | GC_median <- round(median(depth_gc$GC), 2)
15 |
16 | # median sequencing depth
17 | depth_median <- round(median(depth_gc$Depth), 2)
18 |
19 | # to avoid extreme depth values caused by NGS duplication, drop windows deeper than 3x the median depth
20 | depth_gc <- subset(depth_gc, Depth <= 3 * depth_median)
21 |
22 | # depth vs GC scatter-density plot
23 | depth_GC <- ggplot(depth_gc, aes(GC, Depth)) +
24 | geom_point(color = 'gray', alpha = 0.6, pch = 19, size = 0.5) +
25 | # geom_vline(xintercept = GC_median, color = 'red', lty = 2, lwd = 0.5) +
26 | # geom_hline(yintercept = depth_median, color = 'red', lty = 2, lwd = 0.5) +
27 | stat_density_2d(aes(fill = ..density.., alpha = ..density..), geom = 'tile', contour = FALSE, n = 500) +
28 | scale_fill_gradientn(colors = c('transparent', 'gray', 'blue', 'red')) +
29 | theme(panel.grid.major = element_line(color = 'gray', linetype = 2, size = 0.25), panel.background = element_rect(color = 'black', fill = 'transparent')) +
30 | labs(x = paste('GC % (Median :', GC_median, '%)'), y = paste('Depth (Median :', depth_median, 'X)')) +
31 | theme(axis.text = element_text(size = 8), axis.title = element_text(size = 12)) +
32 | theme(legend.position = 'none')
33 |
34 | # depth frequency histogram
35 | depth_hist <- ggplot(depth_gc, aes(Depth)) +
36 | geom_histogram(binwidth = (max(depth_gc$Depth) - min(depth_gc$Depth))/100, fill = 'gray', color = 'gray40', size = 0.1) +
37 | geom_rug(color = 'gray', alpha = 0.6) +
38 | theme(panel.grid.major = element_line(color = 'gray', linetype = 2, size = 0.25), panel.background = element_rect(color = 'black', fill = 'transparent')) +
39 | theme(axis.line = element_line(color = 'black', size = 0.3), axis.text = element_text(size = 8), axis.title = element_text(size = 12)) +
40 | labs(x = '', y = 'Numbers') +
41 | coord_flip()
42 | # geom_vline(xintercept = depth_median, color = 'red', lty = 2, lwd = 0.5)
43 |
44 | # GC content frequency histogram
45 | GC_hist <- ggplot(depth_gc, aes(GC)) +
46 | geom_histogram(binwidth = (max(depth_gc$GC) - min(depth_gc$GC))/100, fill = 'gray', color = 'gray40', size = 0.1) +
47 | geom_rug(color = 'gray', alpha = 0.6) +
48 | theme(panel.grid.major = element_line(color = 'gray', linetype = 2, size = 0.25), panel.background = element_rect(color = 'black', fill = 'transparent')) +
49 | theme(axis.line = element_line(color = 'black', size = 0.3), axis.text = element_text(size = 10), axis.title = element_text(size = 12)) +
50 | labs(x = '', y = 'Numbers')
51 | # geom_vline(xintercept = GC_median, color = 'red', lty = 2, lwd = 0.5)
52 |
53 | # combine the panels and write output
54 | #pdf('GC_Depth.pdf', width = 8, height = 8)
55 | # grid.newpage()
56 | # pushViewport(viewport(layout = grid.layout(3, 3)))
57 | # print(depth_GC, vp = viewport(layout.pos.row = 2:3, layout.pos.col = 1:2))
58 | # print(GC_hist, vp = viewport(layout.pos.row = 1, layout.pos.col = 1:2))
59 | # print(depth_hist, vp = viewport(layout.pos.row = 2:3, layout.pos.col = 3))
60 | #dev.off()
61 |
62 | png('GC_Depth.png', width = 4000, height = 4000, res = 600, units = 'px')
63 | grid.newpage()
64 | pushViewport(viewport(layout = grid.layout(3, 3)))
65 | print(depth_GC, vp = viewport(layout.pos.row = 2:3, layout.pos.col = 1:2))
66 | print(GC_hist, vp = viewport(layout.pos.row = 1, layout.pos.col = 1:2))
67 | print(depth_hist, vp = viewport(layout.pos.row = 2:3, layout.pos.col = 3))
68 | dev.off()
69 |
-------------------------------------------------------------------------------- /genome/Anno_EGAPx/deal_egapx.sh: --------------------------------------------------------------------------------
1 | #!/bin/bash
2 | set +o posix
3 |
4 | species=C_nasu
5 | egapx_gff=complete.genomic.gff
6 | pep=complete.proteins.faa
7 | cds=complete.cds.fna
8 |
9 | ## deal feature
10 | [ -d stat_feature ] || mkdir stat_feature
11 | grep "pseudo=true" ${egapx_gff} | awk '$3=="pseudogene"' | cut -f 1,4,5,7,9 | sed 's/;/\t/g' | sed 's/ID=gene-//1;s/Name=//1;s/description=//1' | cut -f 1-5,7 > stat_feature/pseudogene.txt
12 | grep "gene_biotype=lncRNA" ${egapx_gff} | cut -f 1,4,5,7,9 | sed 's/;/\t/g' | cut -f 1-5 | sed 's/ID=gene-//g' > stat_feature/lncRNA.txt
13 | grep "gene_biotype=V_segment" ${egapx_gff} | cut -f 1,4,5,7,9 | sed 's/;/\t/g' | sed 's/ID=gene-//1;s/Name=//1;s/description=//1' | cut -f 1-5,7 > stat_feature/V_segment.txt
14 | grep "gene_biotype=C_region" ${egapx_gff} | cut -f 1,4,5,7,9 | sed 's/;/\t/g' | sed 's/ID=gene-//1;s/Name=//1;s/description=//1' | cut -f 1-5,7 > stat_feature/C_region.txt
15 | grep misc_RNA ${egapx_gff} | awk '$3=="transcript"' | cut -f 1,4,5,7,9 | sed 's/;/\t/g' | sed 's/ID=//1' > stat_feature/misc.txt
16 | cat stat_feature/*txt | cut -f 5 | awk '!a[$0]++' | sed '/^\s*$/d' | grep -v -f - ${egapx_gff} | awk '$3=="gene"' | cut -f 1,4,5,7,9 | sed 's/;/\t/g' | awk -F "\t" '{if ($7~/description/) print $1"\t"$2"\t"$3"\t"$4"\t"$5"\t"$6"\t"$7; else if ($8~/description/) print $1"\t"$2"\t"$3"\t"$4"\t"$5"\t"$7"\t"$8; else print $1"\t"$2"\t"$3"\t"$4"\t"$5"\t"$5"\tuncharacterized protein" }' | sed 's/ID=gene-//g;s/Name=//g;s/description=//g' > stat_feature/protein.txt
17 |
18 | ## get longest-isoform gff file (UTRs retained)
19 | cut -f 5 stat_feature/protein.txt | grep -f - ${egapx_gff} | grep -v -e "pseudo=true" -e "gene_biotype=lncRNA" -e "gene_biotype=V_segment" -e "gene_biotype=C_region" -e "gbkey=ncRNA" -e "misc_RNA" > pep.gff
20 | /usr/bin/singularity run --bind $PWD:$PWD /dellfsqd2/ST_OCEAN/USER/lishuo11/01_soft/singularity_all/Agata.sif agat_sp_keep_longest_isoform.pl -gff pep.gff -o ${species}.UTR.gff
21 | rm pep.gff pep.agat.log
22 |
23 | ## get simple gff cds pep
24 | toBGI.py ${species}.UTR.gff > bgi.gff
25 | sed 's/|/\t/g' bgi.gff | awk '{print $NF"\t"$0}' | sed 's/-R/-P/1' | awk '{if ($4=="mRNA") print $2"\t"$3"\t"$4"\t"$5"\t"$6"\t"$7"\t"$8"\t"$9"\tID="$1; else print $2"\t"$3"\t"$4"\t"$5"\t"$6"\t"$7"\t"$8"\t"$9"\tParent="$1}' > bgi.rename.gff 
26 | fix_mRNA_coordinate.pl bgi.rename.gff bgi.fix.gff
27 | fix_phase.py bgi.fix.gff
28 | nr_gff.pl --direction F mrna_bgi.fix.gff
29 | mv mrna_bgi.fix.gff.nr.gff ${species}.bgi.gff
30 | awk '{print $1}' ${pep} | sed 's/|/\t/g' | awk '{if ($1~/>/) print ">"$NF; else print $0}' > ${pep}.deal
31 | seqkit seq -w 0 ${cds} | sed 's/protein_id=/\t/g' | awk -F "\t" '{if ($1~/>/) print ">"$2; else print $0}' | sed 's/\]/\t/g' | awk '{print $1}' > ${cds}.deal
32 | awk '$3=="mRNA"' ${species}.bgi.gff | cut -f 9 | sed 's/ID=//g;s/;//g' | seqtk subseq ${pep}.deal - | awk '{print $1}' > ${species}.bgi.gff.pep
33 | awk '$3=="mRNA"' ${species}.bgi.gff | cut -f 9 | sed 's/ID=//g;s/;//g' | seqtk subseq ${cds}.deal - | awk '{print $1}' > ${species}.bgi.gff.cds
34 | rm bgi.gff bgi.fix.gff mrna_bgi.fix.gff mrna_bgi.fix.gff.cluster mrna_bgi.fix.gff.uncluster ${cds}.deal ${pep}.deal bgi.rename.gff
35 |
36 | ## get anno from egapx
37 | grep ">" ${pep} | sed 's/|/\t/g' | cut -f 3- | sed 's/ /\t/1' > stat_feature/egapx.anno.txt
38 | diamond makedb --in ${pep} -d complete.proteins
39 | diamond blastp --db complete.proteins --query ${species}.bgi.gff.pep --out result.txt --outfmt 6 qseqid qstart qend sseqid sstart send qlen slen length pident evalue --more-sensitive --max-target-seqs 5 --evalue 1e-3 --id 50 --tmpdir ./ --threads 10
40 | awk '$10=="100"' result.txt | awk '!a[$1]++{print $0}' | awk '{print $4"\t"$1}' | sed 's/|/\t/g' | awk '{print $(NF-1)"\t"$NF}' | awk 'NR==FNR{a[$1]=$0}NR!=FNR{print $0"\t"a[$1]}' stat_feature/egapx.anno.txt - | cut -f 2,4 > egapx.anno.tsv
41 | rm result.txt complete.proteins.dmnd
42 |
-------------------------------------------------------------------------------- /deal_fasta/fa2phy/fa2phy.v2.py: --------------------------------------------------------------------------------
1 | #!/usr/bin/env python3
2 |
3 | """
4 | FASTA to PHYLIP format conversion tool - performance-optimised version
5 | Handles large sequence alignment files, Python 3 version
6 | """
7 |
8 | import sys
9 | import os
10 | import argparse
11 | from datetime import datetime
12 |
13 |
14 | # brief usage message
15 | USAGE = """
16 | FASTA to PHYLIP format conversion tool
17 |
18 | Usage: fa2phy.py [input FASTA] [output PHYLIP]
19 |
20 | Example: fa2phy.py input.fasta output.phy
21 | """
22 |
23 | def msg(*args, **kwargs):
24 | """Print a message to stderr."""
25 | print(*args, file=sys.stderr, **kwargs)
26 |
27 | def err(*args, **kwargs):
28 | """Print an error to stderr and exit."""
29 | msg("ERROR:", *args, **kwargs)
30 | sys.exit(1)
31 |
32 | def parse_fasta_efficient(filename):
33 | """
34 | Parse a FASTA file efficiently,
35 | iterating over the file to avoid loading it all into memory
36 | """
37 | sequence_dict = {}
38 | sequence_list = [] # preserve sequence order
39 | current_id = None
40 |
41 | try:
42 | with open(filename, 'r') as fh:
43 | seq_parts = [] # collect sequence fragments
44 |
45 | for line in fh:
46 | line = line.rstrip()
47 | if not line: # skip blank lines
48 | continue
49 |
50 | if line[0] == '>':
51 | # finish the previous sequence
52 | if current_id:
53 | sequence_dict[current_id] = ''.join(seq_parts)
54 |
55 | # extract the new sequence ID
56 | header = line[1:].strip()
57 | current_id = header.split()[0]
58 | sequence_list.append(current_id)
59 | seq_parts = [] # reset the fragment list
60 | else:
61 | seq_parts.append(line)
62 |
63 | # handle the last sequence
64 | if current_id and seq_parts:
65 | sequence_dict[current_id] = ''.join(seq_parts)
66 |
67 | return sequence_dict, sequence_list
68 |
69 | except Exception as e:
70 | err(f"Error while parsing the FASTA file: {e}")
71 |
72 | def write_phylip(sequence_dict, sequence_list, outfile):
73 | """
74 | Write the PHYLIP file efficiently
75 | """
76 | # check the alignment length
77 | alignment_length = 0
78 | for gene in sequence_dict:
79 | if alignment_length == 0:
80 | alignment_length = len(sequence_dict[gene]) 
81 | elif len(sequence_dict[gene]) != alignment_length:
82 | err(f"Alignment length error: the length of {gene} ({len(sequence_dict[gene])}) differs from the other sequences ({alignment_length})")
83 |
84 | # find the longest sequence ID
85 | if sequence_list:
86 | longest_id_len = max(len(id) for id in sequence_list)
87 | else:
88 | err("No valid sequences found")
89 |
90 | # write the PHYLIP file
91 | try:
92 | with open(outfile, "w") as phyfile:
93 | # write the sequence count and alignment length
94 | phyfile.write(f"{len(sequence_dict)} {alignment_length}\n")
95 |
96 | # write the sequences
97 | for gene in sequence_list:
98 | phyfile.write(f"{gene.ljust(longest_id_len)} {sequence_dict[gene]}\n")
99 |
100 | msg(f"PHYLIP file written: {outfile}")
101 | msg(f" Number of sequences: {len(sequence_dict)}")
102 | msg(f" Alignment length: {alignment_length}")
103 |
104 | except Exception as e:
105 | err(f"Error while writing the PHYLIP file: {e}")
106 |
107 | def main():
108 | """Main entry point."""
109 | # handle command-line arguments
110 | if len(sys.argv) != 3:
111 | print(USAGE)
112 | sys.exit(0)
113 |
114 | fasta_file = sys.argv[1]
115 | phylip_file = sys.argv[2]
116 |
117 | # validate the input file
118 | if not os.path.isfile(fasta_file):
119 | err(f"Input file not found: {fasta_file}")
120 |
121 | if os.path.exists(phylip_file):
122 | msg(f"Warning: output file {phylip_file} already exists and will be overwritten")
123 |
124 | # parse the FASTA file
125 | sequence_dict, sequence_list = parse_fasta_efficient(fasta_file)
126 |
127 | # write the PHYLIP file
128 | write_phylip(sequence_dict, sequence_list, phylip_file)
129 |
130 | return 0
131 |
132 | if __name__ == "__main__":
133 | sys.exit(main())
134 |
-------------------------------------------------------------------------------- /Comparative_genomics/Domain_predict/rpsblast.sh: --------------------------------------------------------------------------------
1 | pep=all.deal.pep
2 | cpu=38
3 | evalue=0.01
4 | database_dir=/06_database/CDD_db
5 |
6 | ### for i in {CdD,Tigr,Prk,Pfam,Kog,Cog,Cdd_NCBI,ncbi.cdd}; do grep -e "pep=" -e "cpu=" -e "evalue=" -e "database_dir=" -e ${i} rpsblast.sh > rpsblast.${i}.sh; done
7 |
8 | rpsblast -query ${pep} -outfmt 6 -evalue ${evalue} -db ${database_dir}/Cdd -out CdD_${pep}.txt -num_threads ${cpu}
9 | rpsblast -query ${pep} -outfmt 6 -evalue ${evalue} -db ${database_dir}/Tigr -out Tigr_${pep}.txt -num_threads ${cpu}
10 | rpsblast -query ${pep} -outfmt 6 -evalue ${evalue} -db ${database_dir}/Prk -out Prk_${pep}.txt -num_threads ${cpu}
11 | rpsblast -query ${pep} -outfmt 6 -evalue ${evalue} -db ${database_dir}/Pfam -out Pfam_${pep}.txt -num_threads ${cpu}
12 | rpsblast -query ${pep} -outfmt 6 -evalue ${evalue} -db ${database_dir}/Kog -out Kog_${pep}.txt -num_threads ${cpu}
13 | rpsblast -query ${pep} -outfmt 6 -evalue ${evalue} -db ${database_dir}/Cog -out Cog_${pep}.txt -num_threads ${cpu}
14 | rpsblast -query ${pep} -outfmt 6 -evalue ${evalue} -db ${database_dir}/Cdd_NCBI -out Cdd_NCBI_${pep}.txt -num_threads ${cpu}
15 | rpsblast -query ${pep} -outfmt 6 -evalue ${evalue} -db ${database_dir}/ncbi.cdd -out ncbi.cdd_${pep}.txt -num_threads ${cpu}
16 |
17 | cat CdD_${pep}.txt | awk '{if ($7<$8) print $1"\t"$7"\t"$8"\t"$0; else print $1"\t"$8"\t"$7"\t"$0}' | sort -k 1,1 -k 2n,2 | bedtools cluster -i - | sort -k 16n,16 -k 15nr,15 | awk '!a[$16]++{print $0}' | csvtk cut -t -f 5,1,2,3 | sed 's/CDD://g' | awk 'NR==FNR{a[$1]=$0}NR!=FNR{print $0"\t"a[$1]}' ${database_dir}/cddid_all.tbl - | cut -f 2-4,6- > Cdd.dom
18 | cat Tigr*txt | awk '{if ($7<$8) print $1"\t"$7"\t"$8"\t"$0; else print $1"\t"$8"\t"$7"\t"$0}' | sort -k 1,1 -k 2n,2 | bedtools cluster -i - | sort -k 16n,16 -k 15nr,15 | awk '!a[$16]++{print $0}' | csvtk cut -t -f 5,1,2,3 | sed 's/CDD://g' | awk 'NR==FNR{a[$1]=$0}NR!=FNR{print $0"\t"a[$1]}' 
${database_dir}/cddid_all.tbl - | cut -f 2-4,6- > Tigr.dom 19 | cat Prk*txt | awk '{if ($7<$8) print $1"\t"$7"\t"$8"\t"$0; else print $1"\t"$8"\t"$7"\t"$0}' | sort -k 1,1 -k 2n,2 | bedtools cluster -i - | sort -k 16n,16 -k 15nr,15 | awk '!a[$16]++{print $0}' | csvtk cut -t -f 5,1,2,3 | sed 's/CDD://g' | awk 'NR==FNR{a[$1]=$0}NR!=FNR{print $0"\t"a[$1]}' ${database_dir}/cddid_all.tbl - | cut -f 2-4,6- > Prk.dom 20 | cat Pfam*txt | awk '{if ($7<$8) print $1"\t"$7"\t"$8"\t"$0; else print $1"\t"$8"\t"$7"\t"$0}' | sort -k 1,1 -k 2n,2 | bedtools cluster -i - | sort -k 16n,16 -k 15nr,15 | awk '!a[$16]++{print $0}' | csvtk cut -t -f 5,1,2,3 | sed 's/CDD://g' | awk 'NR==FNR{a[$1]=$0}NR!=FNR{print $0"\t"a[$1]}' ${database_dir}/cddid_all.tbl - | cut -f 2-4,6- > Pfam.dom 21 | cat Kog*txt | awk '{if ($7<$8) print $1"\t"$7"\t"$8"\t"$0; else print $1"\t"$8"\t"$7"\t"$0}' | sort -k 1,1 -k 2n,2 | bedtools cluster -i - | sort -k 16n,16 -k 15nr,15 | awk '!a[$16]++{print $0}' | csvtk cut -t -f 5,1,2,3 | sed 's/CDD://g' | awk 'NR==FNR{a[$1]=$0}NR!=FNR{print $0"\t"a[$1]}' ${database_dir}/cddid_all.tbl - | cut -f 2-4,6- > Kog.dom 22 | cat Cog*txt | awk '{if ($7<$8) print $1"\t"$7"\t"$8"\t"$0; else print $1"\t"$8"\t"$7"\t"$0}' | sort -k 1,1 -k 2n,2 | bedtools cluster -i - | sort -k 16n,16 -k 15nr,15 | awk '!a[$16]++{print $0}' | csvtk cut -t -f 5,1,2,3 | sed 's/CDD://g' | awk 'NR==FNR{a[$1]=$0}NR!=FNR{print $0"\t"a[$1]}' ${database_dir}/cddid_all.tbl - | cut -f 2-4,6- > Cog.dom 23 | cat Cdd_NCBI*txt | awk '{if ($7<$8) print $1"\t"$7"\t"$8"\t"$0; else print $1"\t"$8"\t"$7"\t"$0}' | sort -k 1,1 -k 2n,2 | bedtools cluster -i - | sort -k 16n,16 -k 15nr,15 | awk '!a[$16]++{print $0}' | csvtk cut -t -f 5,1,2,3 | sed 's/CDD://g' | awk 'NR==FNR{a[$1]=$0}NR!=FNR{print $0"\t"a[$1]}' ${database_dir}/cddid_all.tbl - | cut -f 2-4,6- > Cdd_NCBI.dom 24 | cat ncbi.cdd*txt | awk '{if ($7<$8) print $1"\t"$7"\t"$8"\t"$0; else print $1"\t"$8"\t"$7"\t"$0}' | sort -k 1,1 -k 2n,2 | bedtools cluster -i - | sort -k 16n,16 -k 15nr,15 | awk '!a[$16]++{print $0}' | csvtk cut -t -f 5,1,2,3 | sed 's/CDD://g' | awk 'NR==FNR{a[$1]=$0}NR!=FNR{print $0"\t"a[$1]}' ${database_dir}/cddid_all.tbl - | cut -f 2-4,6- > ncbi.cdd.dom 25 | --------------------------------------------------------------------------------
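The one-liners above all perform the same reduction per database: orient each hit's query coordinates (columns 7/8 of outfmt 6), let bedtools cluster group overlapping hits on the same protein (the cluster id becomes column 16 of the 3+12-column rows), keep only the top bit-score hit (column 15) per cluster, and join the CDD accession to its description in cddid_all.tbl. Below is a rough pure-Python sketch of the cluster-and-keep-best step; the input file name is hypothetical, and bedtools' overlap semantics are only approximated by merging sorted, overlapping intervals.

import csv
from itertools import groupby

def best_per_cluster(rows):
    """rows: rpsblast outfmt-6 hit rows (lists of strings) for one query."""
    spans = []
    for r in rows:
        qs, qe = sorted((int(r[6]), int(r[7])))   # orient qstart/qend (columns 7/8)
        spans.append((qs, qe, float(r[11]), r))   # column 12 is the bit score
    spans.sort(key=lambda t: t[:2])               # like sort -k 1,1 -k 2n,2
    clusters, cur_end = [], None
    for qs, qe, score, r in spans:
        if cur_end is None or qs > cur_end:       # no overlap -> start a new cluster
            clusters.append([])
            cur_end = qe
        else:
            cur_end = max(cur_end, qe)
        clusters[-1].append((score, r))
    # keep the highest-bitscore hit per cluster (the awk '!a[$16]++' after sorting)
    return [max(c, key=lambda t: t[0])[1] for c in clusters]

with open("CdD_all.deal.pep.txt") as fh:          # hypothetical rpsblast -outfmt 6 output
    hits = sorted(csv.reader(fh, delimiter="\t"), key=lambda r: r[0])
    for _, rows in groupby(hits, key=lambda r: r[0]):
        for best in best_per_cluster(list(rows)):
            print("\t".join(best))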