├── python
│   ├── msa_4d
│   │   ├── test_data
│   │   │   └── test
│   │   └── readme.md
│   └── zscore
│       ├── zscore.py
│       └── zscore2.py
├── deal_fasta
│   ├── fa2phy
│   │   ├── readme
│   │   ├── fa2phy.py
│   │   ├── fasta2phylip.pl
│   │   └── fa2phy.v2.py
│   ├── fasta_cut
│   │   └── readme
│   ├── agp2fa
│   │   ├── readme
│   │   ├── agp2fa.pl
│   │   └── ragtag_agp2fa.py
│   ├── split_fast_pilon
│   │   └── README
│   ├── filter_err_fasta
│   │   ├── find_err_dna.py
│   │   └── find_err_pep.py
│   ├── six_frame_translate
│   │   └── translate_seq.py
│   └── rename
│       └── rename.fa.py
├── deal_gff
│   ├── fish
│   │   └── readme
│   ├── rm_overlap
│   │   └── readme
│   ├── gff.simple
│   │   ├── readme
│   │   ├── gff.simple.pl
│   │   └── EVMtoBGI.py
│   ├── find_overlap
│   │   └── readme
│   ├── gene_rename
│   │   └── change.name.pl
│   ├── pick_longest_gene
│   │   ├── fix_mRNA_coordinate.pl
│   │   ├── fix_phase.py
│   │   ├── deal.sh
│   │   ├── pick_longest_ncbi.pl
│   │   └── toBGI.py
│   └── agp2gff
│       └── agp2gff.py
├── Comparative_genomics
│   ├── kaks
│   │   ├── blast.sh
│   │   ├── collinearity_kaks.sh
│   │   ├── genelist_kaks.sh
│   │   └── go_kaks.sh
│   ├── short_Peptide
│   │   ├── readme.md
│   │   └── short_Peptide_predict
│   ├── deal_tree_nwk
│   │   ├── step9.sh
│   │   ├── step7.reserve_species.txt
│   │   ├── step8.raw.tree.format_9.nwk
│   │   ├── step5.all.leaves.txt
│   │   ├── step8.raw.tree.format_6.nwk
│   │   ├── step8.raw.tree.format_8.nwk
│   │   ├── step1.sh
│   │   ├── step6.raw.tree.delete_Homo.nwk
│   │   ├── step8.raw.tree.format_4.nwk
│   │   ├── raw.tree.deal.nwk
│   │   ├── raw.tree.nwk
│   │   ├── step2.sh
│   │   ├── step5.sh
│   │   ├── step8.raw.tree.format_0.nwk
│   │   ├── step8.raw.tree.format_1.nwk
│   │   ├── step8.raw.tree.format_2.nwk
│   │   ├── step8.raw.tree.format_5.nwk
│   │   ├── step8.raw.tree.format_7.nwk
│   │   ├── raw.tree.reroot.Homo.nwk
│   │   ├── step6.old_new.change.txt
│   │   ├── step6.raw.tree.rename_Homo.nwk
│   │   ├── step8.sh
│   │   ├── step2.raw.tree.reroot.Homo.nwk
│   │   ├── step3.raw.tree.sort.decreasing.nwk
│   │   ├── step3.raw.tree.sort.increasing.nwk
│   │   ├── step4.raw.tree.cladogram_transform.nwk
│   │   ├── step8.raw.tree.format_3.nwk
│   │   ├── step4.sh
│   │   ├── step7.delete_Homo.txt
│   │   ├── raw.tree.nwk.pdf
│   │   ├── step1.txt
│   │   ├── step2.txt
│   │   ├── step3.decreasing.txt
│   │   ├── step3.increasing.txt
│   │   ├── step6.rename_Homo.txt
│   │   ├── step7.sh
│   │   ├── step6.sh
│   │   ├── readme.md
│   │   ├── step3.sh
│   │   └── draw.r
│   ├── get_gene_infomation
│   │   ├── get.sh
│   │   └── deal.sh
│   ├── gene_family
│   │   ├── pipline.png
│   │   ├── 05_phylogenetic.sh
│   │   ├── 01_blastp.sh
│   │   ├── 02_hmm.sh
│   │   ├── 04_final_gene_family.sh
│   │   ├── 03_miniprot.sh
│   │   └── 06_rna_seq.sh
│   ├── Dotplot_two_genomes
│   │   ├── hcs_fcs.paf.png
│   │   └── dotplot.sh
│   ├── Signal_peptide
│   │   ├── Razor.sh
│   │   └── DeepSig.sh
│   ├── gene_family_cluster
│   │   ├── sonicparanoid.sh
│   │   ├── orthofinder.sh
│   │   ├── sonicparanoid2.sh
│   │   └── broccoli.sh
│   ├── blast
│   │   ├── blast-costom-outformat.sh
│   │   ├── reciprocal_best_hits.sh
│   │   └── diamond_rbh.R
│   ├── gene_cluster
│   │   └── Galeon.sh
│   └── Domain_predict
│       └── rpsblast.sh
├── genome
│   ├── pseudogenes
│   │   ├── readme.md
│   │   ├── step2.sh
│   │   └── step1.sh
│   ├── Anno_RNA
│   │   ├── minimap2
│   │   │   ├── readme.md
│   │   │   ├── minimap2.sh
│   │   │   ├── step3.sh
│   │   │   └── step2.sh
│   │   └── GMAP
│   │       ├── index.sh
│   │       └── map.sh
│   ├── Hic
│   │   ├── rfy_hic2 Pipeline.pdf
│   │   ├── haphic
│   │   │   ├── split_haphic_step3.sh
│   │   │   ├── haphic_juicer_post.sh
│   │   │   ├── split_haphic_step4.sh
│   │   │   ├── split_haphic_step1.sh
│   │   │   ├── split_haphic_step2.sh
│   │   │   ├── bwa.sh
│   │   │   ├── split_haphic_step0.sh
│   │   │   ├── haphic.sh
│   │   │   ├── re_draw.sh
│   │   │   └── draw.sh
│   │   ├── yahs
│   │   │   ├── step2_juicer_post.sh
│   │   │   └── step1_ass.sh
│   │   └── all_hic.sh
│   ├── assess
│   │   ├── TeloExplorer.sh
│   │   ├── omark.sh
│   │   ├── busco-5.5.sh
│   │   ├── CRAQ.sh
│   │   ├── GCI_pb.sh
│   │   ├── compleasm.sh
│   │   └── LAI.sh
│   ├── Anno_EGAPx
│   │   ├── local2.yaml
│   │   ├── egapx.03.1.sh
│   │   ├── fix_mRNA_coordinate.pl
│   │   ├── fix_phase.py
│   │   ├── toBGI.py
│   │   └── deal_egapx.sh
│   ├── puerge_halp
│   │   ├── kmerdup
│   │   │   ├── readme.md
│   │   │   ├── bowtie2_demo.sh
│   │   │   ├── step4_refilter.sh
│   │   │   ├── miniprot.sh
│   │   │   ├── step3_filter.sh
│   │   │   └── step1_prepare.sh
│   │   ├── purge_haplotigs
│   │   │   ├── step2_custom_set.sh
│   │   │   └── step1_map.sh
│   │   └── purge_dups.sh
│   ├── TE
│   │   ├── TEsorter.sh
│   │   ├── NeuralTE.sh
│   │   ├── DeepTE.sh
│   │   └── HiTEv3.2.sh
│   ├── Anno_homology
│   │   ├── GeMoMa
│   │   │   └── GeMoMa.sh
│   │   ├── Spaln
│   │   │   └── Spaln.sh
│   │   ├── gth
│   │   │   └── gth.sh
│   │   └── miniprot
│   │       └── miniprot.sh
│   ├── quick_merge_genome
│   │   └── quickmerge.sh
│   ├── Mit_genome
│   │   ├── mitoz.sh
│   │   ├── fa2gb.py
│   │   └── check_species_by_mit_pep.sh
│   ├── Anno_integrate
│   │   ├── alignAssembly.config
│   │   ├── evm_auto.sh
│   │   ├── annotCompare.config
│   │   └── pasa.sh
│   ├── Anno_EviAnn
│   │   └── EviAnn.sh
│   ├── Segmental_duplication
│   │   ├── biser.sh
│   │   └── biser_split.sh
│   ├── Telomere
│   │   └── Telomere_tidk.sh
│   ├── Anno_denovo
│   │   ├── galba.sh
│   │   └── helixer.sh
│   ├── evaluate_orf_cds
│   │   └── evaluate_orf_cds.sh
│   ├── noncoding
│   │   └── noncoding_predict.sh
│   ├── ragtag
│   │   ├── ragtag.sh
│   │   └── filter.pl
│   ├── evaluate_genome_size
│   │   └── evaluate_genome_size.sh
│   ├── Genome_error_correction
│   │   └── Pilon&racon.sh
│   └── relernn
│       └── All.prediction.sh
├── transcriptome
│   ├── coding_potential_calculator
│   │   └── readme.md
│   ├── full_length_transcriptome
│   │   └── flair_analyze_NCBI_SRA_full_length_transcriptome.sh
│   └── Enrich
│       ├── make_Orgdb.sh
│       ├── AnnotationForge_20250117.R
│       ├── eggnog-2.1.9.sh
│       └── enrich.r
├── picture
│   ├── heatscatter
│   │   ├── example.tsv
│   │   └── heatscatter.r
│   ├── heatscatter2
│   │   ├── example.tsv
│   │   └── heatscatter.r
│   ├── DensityHeatmap
│   │   ├── huoli.r
│   │   └── huoli2.r
│   ├── QQplot
│   │   └── QQplot.R
│   ├── line
│   │   └── line.R
│   ├── Manhattan
│   │   ├── qqman_qq_mhd.r
│   │   └── Manhattan.R
│   ├── Normality.Test2
│   │   └── Normality.Test2.R
│   ├── box2
│   │   └── box.R
│   ├── genome_Circos
│   │   ├── ticks.conf
│   │   ├── circos.conf
│   │   └── fast_Circos.sh
│   ├── loess_fit
│   │   └── loess_fit.R
│   ├── syri_plotsv
│   │   ├── base.demo.config
│   │   └── syri_plotsv.sh
│   ├── synteny_circos
│   │   ├── simpletolink.py
│   │   └── circos_sys.sh
│   ├── box
│   │   └── box.R
│   └── GC_depth
│       └── depth_gc.r
├── other
│   ├── filter_fasta_non-ATCGN_characters.pl
│   ├── outlier2.py
│   ├── outlier.py
│   ├── check_pid_info.sh
│   └── count_directory_num_size.sh
└── readme.md

/python/msa_4d/test_data/test:
--------------------------------------------------------------------------------
test
--------------------------------------------------------------------------------
/deal_fasta/fa2phy/readme:
--------------------------------------------------------------------------------
Thanks to Wenjie Deng
--------------------------------------------------------------------------------
/deal_gff/fish/readme:
--------------------------------------------------------------------------------
Thanks to fanw@genomics.org.cn
--------------------------------------------------------------------------------
/deal_fasta/fasta_cut/readme:
--------------------------------------------------------------------------------
Thanks to fanw@genomics.org.cn
--------------------------------------------------------------------------------
/Comparative_genomics/kaks/blast.sh:
--------------------------------------------------------------------------------
../blast/blast-costom-outformat.sh
--------------------------------------------------------------------------------
/Comparative_genomics/short_Peptide/readme.md:
--------------------------------------------------------------------------------
GPU or CPU is ok
--------------------------------------------------------------------------------
/deal_gff/rm_overlap/readme:
--------------------------------------------------------------------------------
Thanks to lijianwen@genomics.org.cn
--------------------------------------------------------------------------------
/Comparative_genomics/deal_tree_nwk/step9.sh:
--------------------------------------------------------------------------------
./draw.r raw.tree.nwk raw.tree.nwk
--------------------------------------------------------------------------------
/deal_gff/gff.simple/readme:
--------------------------------------------------------------------------------
Thanks to hankai@genomics.cn
Thanks to songyue@genomics.cn
--------------------------------------------------------------------------------
/genome/pseudogenes/readme.md:
--------------------------------------------------------------------------------
Forked from https://github.com/kelkar/Discover_pseudogenes and modified.
--------------------------------------------------------------------------------
/transcriptome/coding_potential_calculator/readme.md:
--------------------------------------------------------------------------------
https://github.com/gao-lab/CPC2_standalone
--------------------------------------------------------------------------------
/Comparative_genomics/deal_tree_nwk/step7.reserve_species.txt:
--------------------------------------------------------------------------------
Pongo
Macaca
Ateles
Galago
--------------------------------------------------------------------------------
/Comparative_genomics/deal_tree_nwk/step8.raw.tree.format_9.nwk:
--------------------------------------------------------------------------------
((((Homo,Pongo),Macaca),Ateles),Galago);
--------------------------------------------------------------------------------
/Comparative_genomics/deal_tree_nwk/step5.all.leaves.txt:
--------------------------------------------------------------------------------
Homo
Pongo
Macaca
Ateles
Galago
--------------------------------------------------------------------------------
/Comparative_genomics/deal_tree_nwk/step8.raw.tree.format_6.nwk:
--------------------------------------------------------------------------------
((((Homo,Pongo):0.28,Macaca):0.13,Ateles):0.38,Galago);
--------------------------------------------------------------------------------
/genome/Anno_RNA/minimap2/readme.md:
--------------------------------------------------------------------------------
sam2gff.pl from https://github.com/gpertea/gscripts/blob/master/sam2gff.pl
--------------------------------------------------------------------------------
/Comparative_genomics/deal_tree_nwk/step8.raw.tree.format_8.nwk:
--------------------------------------------------------------------------------
((((Homo,Pongo)NoName,Macaca)NoName,Ateles)NoName,Galago);
--------------------------------------------------------------------------------
/Comparative_genomics/deal_tree_nwk/step1.sh:
--------------------------------------------------------------------------------
## print the tree shape on the terminal
tree_deal.py -s raw.tree.nwk > step1.txt
--------------------------------------------------------------------------------
/Comparative_genomics/deal_tree_nwk/step6.raw.tree.delete_Homo.nwk:
--------------------------------------------------------------------------------
(((Macaca:0.49,Pongo:0.21)1:0.13,Ateles:0.62)1:0.38,Galago:1);
--------------------------------------------------------------------------------
/Comparative_genomics/deal_tree_nwk/step8.raw.tree.format_4.nwk:
--------------------------------------------------------------------------------
((((Homo:0.21,Pongo:0.21),Macaca:0.49),Ateles:0.62),Galago:1);
--------------------------------------------------------------------------------
/Comparative_genomics/get_gene_infomation/get.sh:
--------------------------------------------------------------------------------
## NCBI datasets CLI
datasets summary gene accession --report product ${i}
--------------------------------------------------------------------------------
/Comparative_genomics/deal_tree_nwk/raw.tree.deal.nwk:
--------------------------------------------------------------------------------
((((Homo:0.21,Pongo:0.21)1:0.28,Macaca:0.49)1:0.13,Ateles:0.62)1:0.38,Galago:1);
--------------------------------------------------------------------------------
/Comparative_genomics/deal_tree_nwk/raw.tree.nwk:
--------------------------------------------------------------------------------
((((Homo:0.21,Pongo:0.21):0.28,Macaca:0.49):0.13,Ateles:0.62):0.38,Galago:1.00);
--------------------------------------------------------------------------------
/Comparative_genomics/deal_tree_nwk/step2.sh:
--------------------------------------------------------------------------------
## reroot the tree
tree_deal.py -r -o Homo raw.tree.nwk > step2.raw.tree.reroot.Homo.nwk
--------------------------------------------------------------------------------
/Comparative_genomics/deal_tree_nwk/step5.sh:
--------------------------------------------------------------------------------
## get all terminal leaves (tip names)
tree_deal.py -t raw.tree.nwk > step5.all.leaves.txt
--------------------------------------------------------------------------------
/Comparative_genomics/deal_tree_nwk/step8.raw.tree.format_0.nwk:
--------------------------------------------------------------------------------
((((Homo:0.21,Pongo:0.21)1:0.28,Macaca:0.49)1:0.13,Ateles:0.62)1:0.38,Galago:1);
--------------------------------------------------------------------------------
/Comparative_genomics/deal_tree_nwk/step8.raw.tree.format_1.nwk:
--------------------------------------------------------------------------------
((((Homo:0.21,Pongo:0.21):0.28,Macaca:0.49):0.13,Ateles:0.62):0.38,Galago:1);
--------------------------------------------------------------------------------
/Comparative_genomics/deal_tree_nwk/step8.raw.tree.format_2.nwk:
--------------------------------------------------------------------------------
((((Homo:0.21,Pongo:0.21)1:0.28,Macaca:0.49)1:0.13,Ateles:0.62)1:0.38,Galago:1);
--------------------------------------------------------------------------------
/Comparative_genomics/deal_tree_nwk/step8.raw.tree.format_5.nwk:
--------------------------------------------------------------------------------
((((Homo:0.21,Pongo:0.21):0.28,Macaca:0.49):0.13,Ateles:0.62):0.38,Galago:1);
--------------------------------------------------------------------------------
/Comparative_genomics/deal_tree_nwk/step8.raw.tree.format_7.nwk:
--------------------------------------------------------------------------------
((((Homo:0.21,Pongo:0.21)NoName,Macaca:0.49)NoName,Ateles:0.62)NoName,Galago:1);
--------------------------------------------------------------------------------
/deal_gff/find_overlap/readme:
--------------------------------------------------------------------------------
Thanks to fanw@genomics.org.cn
Thanks to huangqf@genomics.org.cn
Thanks to qiufeng@genomics.org.cn
--------------------------------------------------------------------------------
/genome/Hic/rfy_hic2 Pipeline.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Biols0208/Self-use-bioinformatics-scripts/HEAD/genome/Hic/rfy_hic2 Pipeline.pdf
--------------------------------------------------------------------------------
/Comparative_genomics/deal_tree_nwk/raw.tree.reroot.Homo.nwk:
--------------------------------------------------------------------------------
(Homo:0.105,(Pongo:0.21,(Macaca:0.49,(Ateles:0.62,Galago:1.38)1:0.13)1:0.28)1:0.105);
--------------------------------------------------------------------------------
/Comparative_genomics/deal_tree_nwk/step6.old_new.change.txt:
--------------------------------------------------------------------------------
Homo	ren_1a
Pongo	Pongo
Macaca	Macaca
Ateles	Ateles
Galago	Galago
--------------------------------------------------------------------------------
/Comparative_genomics/deal_tree_nwk/step6.raw.tree.rename_Homo.nwk:
--------------------------------------------------------------------------------
((((ren_1a:0.21,Pongo:0.21)1:0.28,Macaca:0.49)1:0.13,Ateles:0.62)1:0.38,Galago:1);
--------------------------------------------------------------------------------
/Comparative_genomics/deal_tree_nwk/step8.sh:
--------------------------------------------------------------------------------
for i in {0..9}; do tree_deal.py --ouf ${i} raw.tree.nwk > step8.raw.tree.format_${i}.nwk; done
--------------------------------------------------------------------------------
/Comparative_genomics/deal_tree_nwk/step2.raw.tree.reroot.Homo.nwk:
--------------------------------------------------------------------------------
(Homo:0.105,(Pongo:0.21,(Macaca:0.49,(Ateles:0.62,Galago:1.38)1:0.13)1:0.28)1:0.105);
--------------------------------------------------------------------------------
/Comparative_genomics/deal_tree_nwk/step3.raw.tree.sort.decreasing.nwk:
--------------------------------------------------------------------------------
(Galago:1,(Ateles:0.62,(Macaca:0.49,(Homo:0.21,Pongo:0.21)1:0.28)1:0.13)1:0.38);
--------------------------------------------------------------------------------
/Comparative_genomics/deal_tree_nwk/step3.raw.tree.sort.increasing.nwk:
--------------------------------------------------------------------------------
((((Pongo:0.21,Homo:0.21)1:0.28,Macaca:0.49)1:0.13,Ateles:0.62)1:0.38,Galago:1);
--------------------------------------------------------------------------------
/Comparative_genomics/deal_tree_nwk/step4.raw.tree.cladogram_transform.nwk:
--------------------------------------------------------------------------------
((((Homo:0.21,Pongo:0.21)1:0.28,Macaca:0.49)1:0.13,Ateles:0.62)1:0.38,Galago:1);
--------------------------------------------------------------------------------
/Comparative_genomics/deal_tree_nwk/step8.raw.tree.format_3.nwk:
--------------------------------------------------------------------------------
((((Homo:0.21,Pongo:0.21)NoName:0.28,Macaca:0.49)NoName:0.13,Ateles:0.62)NoName:0.38,Galago:1);
--------------------------------------------------------------------------------
/Comparative_genomics/deal_tree_nwk/step4.sh:
--------------------------------------------------------------------------------
## transform branches by the cladogram method
tree_deal.py -c raw.tree.nwk > step4.raw.tree.cladogram_transform.nwk
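
## optional sanity check (a sketch, not part of the original step): the
## cladogram transform only rescales branch lengths, so the ASCII shape
## printed by "tree_deal.py -s" should match step1.txt; plain GNU diff assumed.
#tree_deal.py -s step4.raw.tree.cladogram_transform.nwk > step4.shape.txt
#diff step1.txt step4.shape.txt && echo "topology unchanged"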
--------------------------------------------------------------------------------
/Comparative_genomics/gene_family/pipline.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Biols0208/Self-use-bioinformatics-scripts/HEAD/Comparative_genomics/gene_family/pipline.png
--------------------------------------------------------------------------------
/Comparative_genomics/deal_tree_nwk/step7.delete_Homo.txt:
--------------------------------------------------------------------------------

         /-Macaca
      /-|
   /-|   \-Pongo
  |  |
--|   \-Ateles
  |
   \-Galago
--------------------------------------------------------------------------------
/picture/heatscatter/example.tsv:
--------------------------------------------------------------------------------
gene	trans
1083	270
318	330
3573	3867
570	570
855	786
537	528
9540	9540
954	738
717	606
--------------------------------------------------------------------------------
/picture/heatscatter2/example.tsv:
--------------------------------------------------------------------------------
gene	trans
1083	270
318	330
3573	3867
570	570
855	786
537	528
9540	9540
954	738
717	606
--------------------------------------------------------------------------------
/Comparative_genomics/deal_tree_nwk/raw.tree.nwk.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Biols0208/Self-use-bioinformatics-scripts/HEAD/Comparative_genomics/deal_tree_nwk/raw.tree.nwk.pdf
--------------------------------------------------------------------------------
/deal_fasta/agp2fa/readme:
--------------------------------------------------------------------------------
Thanks to Sen Wang, wangsen1993@163.com
https://github.com/malonge/RagTag
https://mp.weixin.qq.com/s/QXDCZz88e6ubl4YgZKcfWQ
--------------------------------------------------------------------------------
/Comparative_genomics/Dotplot_two_genomes/hcs_fcs.paf.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Biols0208/Self-use-bioinformatics-scripts/HEAD/Comparative_genomics/Dotplot_two_genomes/hcs_fcs.paf.png
--------------------------------------------------------------------------------
/genome/Anno_RNA/GMAP/index.sh:
--------------------------------------------------------------------------------
mkdir reference

reference=$PWD/reference
species=HCS
genome=HCS.fa

/gmap/bin/gmap_build -D $reference -d ${species}_reference $genome
--------------------------------------------------------------------------------
/genome/assess/TeloExplorer.sh:
--------------------------------------------------------------------------------

genome=Juicer.FINAL.fa
type_use=animal
prefix=Telo

quartet.py TeloExplorer -i ${genome} -c ${type_use} -p ${prefix}
mv tmp ${prefix}_detail
--------------------------------------------------------------------------------
/genome/Anno_EGAPx/local2.yaml:
--------------------------------------------------------------------------------
genome: /test/data/genome.fa
taxid: 34787
annotation_provider: ABCD
annotation_name_prefix: test
locus_tag_prefix: test
# proteins: /test/data/train.pep
--------------------------------------------------------------------------------
/Comparative_genomics/deal_tree_nwk/step1.txt:
--------------------------------------------------------------------------------

            /-Homo
         /-|
      /-|   \-Pongo
     |  |
   /-|   \-Macaca
  |  |
--|   \-Ateles
  |
   \-Galago
--------------------------------------------------------------------------------
/Comparative_genomics/deal_tree_nwk/step2.txt:
--------------------------------------------------------------------------------

   /-Homo
--|
  |   /-Pongo
   \-|
     |   /-Macaca
      \-|
        |   /-Ateles
         \-|
            \-Galago
--------------------------------------------------------------------------------
/Comparative_genomics/deal_tree_nwk/step3.decreasing.txt:
--------------------------------------------------------------------------------

   /-Galago
--|
  |   /-Ateles
   \-|
     |   /-Macaca
      \-|
        |   /-Homo
         \-|
            \-Pongo
--------------------------------------------------------------------------------
/Comparative_genomics/deal_tree_nwk/step3.increasing.txt:
--------------------------------------------------------------------------------

            /-Pongo
         /-|
      /-|   \-Homo
     |  |
   /-|   \-Macaca
  |  |
--|   \-Ateles
  |
   \-Galago
--------------------------------------------------------------------------------
/Comparative_genomics/deal_tree_nwk/step6.rename_Homo.txt:
--------------------------------------------------------------------------------

            /-ren_1a
         /-|
      /-|   \-Pongo
     |  |
   /-|   \-Macaca
  |  |
--|   \-Ateles
  |
   \-Galago
--------------------------------------------------------------------------------
/Comparative_genomics/deal_tree_nwk/step7.sh:
--------------------------------------------------------------------------------
## prune the tree
tree_deal.py --prune step7.reserve_species.txt raw.tree.nwk > step6.raw.tree.delete_Homo.nwk
tree_deal.py -s step6.raw.tree.delete_Homo.nwk > step7.delete_Homo.txt
--------------------------------------------------------------------------------
/picture/DensityHeatmap/huoli.r:
--------------------------------------------------------------------------------
args <- commandArgs(T)

library(LSD)
pdf(paste(args[1], ".pdf", sep = "", collapse = ""))
rawcount <- read.table(args[1], header = T, sep="\t")
heatscatter(rawcount[,1],rawcount[,2])
--------------------------------------------------------------------------------
/picture/heatscatter/heatscatter.r:
--------------------------------------------------------------------------------
library(LSD)
args <- commandArgs (T)

pdf(paste(args[1], ".pdf", sep = "", collapse = ""))
rawcount <- read.table(args[1], header = T, sep="\t")
heatscatter(rawcount[,1],rawcount[,2])
--------------------------------------------------------------------------------
/Comparative_genomics/deal_tree_nwk/step6.sh:
--------------------------------------------------------------------------------
## rename leaves; TAB delimiter
tree_deal.py --rename step6.old_new.change.txt raw.tree.nwk > step6.raw.tree.rename_Homo.nwk
tree_deal.py -s step6.raw.tree.rename_Homo.nwk > step6.rename_Homo.txt
--------------------------------------------------------------------------------
/genome/puerge_halp/kmerdup/readme.md:
--------------------------------------------------------------------------------
1. Use protein sequences from closely related species for quick homology annotation, to help decide the blacklist and whitelist
2. Combine this with the Hi-C heatmap for a comprehensive judgment when removing duplicates
--------------------------------------------------------------------------------
/genome/TE/TEsorter.sh:
--------------------------------------------------------------------------------
#!/bin/bash
source activate /01_soft/mamba/envs/TEsorter

input=confident_TE.cons_valid.fa
cpu=5

TEsorter ${input} -p ${cpu} -rule 70-50-50
mkdir TEsorter_out
mv ${input}.rexdb* TEsorter_out
--------------------------------------------------------------------------------
/deal_fasta/split_fast_pilon/README:
--------------------------------------------------------------------------------
## each split needs at least 5 processes
python3 pilon_pipeline.py \
    --genome your_genome.fasta \
    --reads1 reads_1.fq \
    --reads2 reads_2.fq \
    --parts N \
    --fix all \
    --output pilon_output
--------------------------------------------------------------------------------
/genome/Hic/haphic/split_haphic_step3.sh:
--------------------------------------------------------------------------------
### https://github.com/zengxiaofei/HapHiC

micromamba activate haphic

genome=corrected_asm.fa
cpu=80
nchrs=23

haphic sort ${genome} HT_links.pkl split_clms final_groups/group*.txt --processes ${cpu}
--------------------------------------------------------------------------------
/Comparative_genomics/Dotplot_two_genomes/dotplot.sh:
--------------------------------------------------------------------------------
minimap2 -t 40 -x asm5 hcs_chr-genome.fa fcs_chr-genome.fa > hcs_fcs.paf

### pafCoordsDotPlotly.R needs the R packages optparse, ggplot2 and plotly

pafCoordsDotPlotly.R -i hcs_fcs.paf -o hcs_fcs.paf -m 1000 -q 1000 -s -t -l -p 16
--------------------------------------------------------------------------------
/Comparative_genomics/deal_tree_nwk/readme.md:
--------------------------------------------------------------------------------
1. print the tree shape on the terminal
2. reroot the tree
3. sort the tree nodes (0 for decreasing, 1 for increasing)
4. transform branches by the cladogram method
5. get all terminal leaves
6. rename leaves; TAB delimiter
7. prune the tree
--------------------------------------------------------------------------------
/genome/Anno_homology/GeMoMa/GeMoMa.sh:
--------------------------------------------------------------------------------
jar=GeMoMa-1.9.jar

target_genome=ABCD.fa
pep=all.pep.cdhit.fa
threads=5
out=GeMoMa

java -Xms5G -Xmx10G -jar ${jar} CLI GeMoMaPipeline threads=${threads} AnnotationFinalizer.r=NO p=false o=true t=$target_genome outdir=$out s=pre-extracted c=$pep
--------------------------------------------------------------------------------
/genome/Hic/haphic/haphic_juicer_post.sh:
--------------------------------------------------------------------------------
edited_assembly=out_JBAT.review.assembly
out_final_assembly_prefix=Juicer
liftover_agp=out_JBAT.liftover.agp
contig_genome=YZ.keep.fa

/dellfsqd2/ST_OCEAN/USER/lichen2/00_software/yahs/juicer post -o ${out_final_assembly_prefix} ${edited_assembly} ${liftover_agp} ${contig_genome}
--------------------------------------------------------------------------------
/genome/Hic/yahs/step2_juicer_post.sh:
--------------------------------------------------------------------------------
edited_assembly=out_YZ.review.assembly
out_final_assembly_prefix=out_YZ.review.final
liftover_agp=out_YZ.liftover.agp
contig_genome=YZ.asm.hic.p_ctg.fasta

/dellfsqd2/ST_OCEAN/USER/lichen2/00_software/yahs/juicer post -o ${out_final_assembly_prefix} ${edited_assembly} ${liftover_agp} ${contig_genome}
--------------------------------------------------------------------------------
/Comparative_genomics/deal_tree_nwk/step3.sh:
--------------------------------------------------------------------------------
## sort the tree nodes
## 0 for decreasing, 1 for increasing

tree_deal.py -l 0 raw.tree.nwk > step3.raw.tree.sort.decreasing.nwk
tree_deal.py -l 1 raw.tree.nwk > step3.raw.tree.sort.increasing.nwk
tree_deal.py -s step3.raw.tree.sort.increasing.nwk > step3.increasing.txt
tree_deal.py -s step3.raw.tree.sort.decreasing.nwk > step3.decreasing.txt
--------------------------------------------------------------------------------
/genome/TE/NeuralTE.sh:
--------------------------------------------------------------------------------
#!/bin/bash
source activate /micromamba/envs/NeuralTE

input=TIR.fa
outdir=$PWD/out_NeuralTE
threads_num=10

python /01_soft/NeuralTE-master/src/Classifier.py \
    --data ${input} \
    --model_path /01_soft/NeuralTE-master/models/NeuralTE_model.h5 \
    --outdir ${outdir} \
    --use_gpu_num 0 \
    --is_plant 0 \
    --thread ${threads_num}
--------------------------------------------------------------------------------
/picture/heatscatter2/heatscatter.r:
--------------------------------------------------------------------------------
library(xbox)
args <- commandArgs (T)

rawcount <- read.table(args[1], header = T, sep="\t")
pdf(paste(args[1], ".pdf", sep = "", collapse = ""))

heatpoint(rawcount[,1],rawcount[,2]) -> dat_result
str(dat_result)

head(dat_result$plot.data)
data.frame(dat_result$cor.result)
str(dat_result$lm.result)
xplot(dat_result)
--------------------------------------------------------------------------------
/genome/quick_merge_genome/quickmerge.sh:
--------------------------------------------------------------------------------
## This merge strategy is most useful for merging genomes assembled from different data sets,
## so that all of the available data is put to use;
## if every assembly was built from the same data, the improvement will be marginal.

ref_genome=$1
qur_genome=$2
threads=$3

nucmer -t ${threads} -l 100 --mum -p nd ${ref_genome} ${qur_genome}
delta-filter -r -q -l 10000 nd.delta > nd.rq.delta
quickmerge -d nd.rq.delta -q ${qur_genome} -r ${ref_genome} -hco 5.0 -c 1.5 -l 1600000 -ml 10000 -p nd
--------------------------------------------------------------------------------
/genome/Hic/haphic/split_haphic_step4.sh:
--------------------------------------------------------------------------------
### https://github.com/zengxiaofei/HapHiC

micromamba activate haphic

raw_genome=out_correct.fa
haphic_corrected_genome=corrected_asm.fa
cpu=50
nchrs=23
gap_size=500
prefix=haphic

haphic build ${haphic_corrected_genome} ${raw_genome} HiC.filtered.bam final_tours/group*.tour --corrected_ctgs corrected_ctgs.txt --Ns ${gap_size} --prefix ${prefix}
--------------------------------------------------------------------------------
/Comparative_genomics/Signal_peptide/Razor.sh:
--------------------------------------------------------------------------------
## https://github.com/Gardner-BinfLab/Razor
#micromamba activate py36

source activate /home_micromamba/envs/py36

input_pep=ABCD.pep
output_txt=Razor
cpu=10
max_scan_length=80

[ -d libs ] || ln -s /01_soft/Razor/libs/

python /01_soft/Razor/razor.py --fastafile ${input_pep} --output ${output_txt} --maxscan ${max_scan_length} --ncores ${cpu}

rm libs
--------------------------------------------------------------------------------
/genome/Mit_genome/mitoz.sh:
--------------------------------------------------------------------------------
source activate /01_software/miniconda3/envs/mitozEnv

mitoz all \
    --outprefix qingyi \
    --thread_number 20 \
    --clade Chordata \
    --species_name "Choerodon_schoenleinii" \
    --workdir AUTOPL2111250159 \
    --fq1 ABCD_1.clean.fq.gz \
    --fq2 ABCD_2.clean.fq.gz \
    --fastq_read_length 150 \
    --data_size_for_mt_assembly 2 \
    --assembler megahit \
    --memory 50 \
    --requiring_taxa Chordata
--------------------------------------------------------------------------------
/deal_gff/gene_rename/change.name.pl:
--------------------------------------------------------------------------------
#!/usr/bin/perl -w
use strict;
@ARGV||die "Usage: perl $0 <gff> <prefix> > rename.gff\n";
my ($file,$name)=@ARGV;
open IN,shift;
my $num='00000';
while(<IN>){
    chomp;
    my @a=split;
    if($a[2] eq "mRNA"){
        $num++;
        $a[8]="ID=$name$num;";
        print join "\t",@a;print "\n";
    }
    else{
        $a[8]="Parent=$name$num;";
        print join "\t",@a;print "\n";
    }
}
close IN;
--------------------------------------------------------------------------------
/Comparative_genomics/gene_family/05_phylogenetic.sh:
--------------------------------------------------------------------------------
muscle -align ./1_identify_gene_family/04_final_gene_family/final_gene_protein -output ./2_phylogenetic/gene_protein.muscle
trimal -in ./2_phylogenetic/gene_protein.muscle -out ./2_phylogenetic/gene_protein.muscle.trimal -automated1
grep '^>' ./2_phylogenetic/gene_protein.muscle | sed 's/>//' > ./2_phylogenetic/gene_protein.muscle.name
iqtree2 -s ./2_phylogenetic/gene_protein.muscle.trimal -m MFP -nt AUTO -B 1000 > ./2_phylogenetic/iqtree
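
## optional follow-up (a sketch, not part of the original step): iqtree2 writes
## the ML tree to ./2_phylogenetic/gene_protein.muscle.trimal.treefile by
## default; it can be inspected with the deal_tree_nwk helper, assuming
## tree_deal.py is on $PATH.
#tree_deal.py -s ./2_phylogenetic/gene_protein.muscle.trimal.treefile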
--------------------------------------------------------------------------------
/genome/puerge_halp/kmerdup/bowtie2_demo.sh:
--------------------------------------------------------------------------------
export PATH="/01_soft/kmerDedup/:$PATH"

prefix=YZ
cpu=5
work_dir=$PWD/..
input_dir=${work_dir}/split
output_dir=${work_dir}/mapping

[ -d ${output_dir} ] || mkdir ${output_dir}
bowtie2 --very-sensitive -k 1000 --score-min L,-0.6,-0.2 --end-to-end --reorder -L 21 --rg-id ${prefix} --rg SM:${prefix} -p ${cpu} -f ${input_dir}/ABCD -x ${work_dir}/${prefix}.format | samtools view -@ ${cpu} -F 4 -bS - > ${output_dir}/ABCD.bam
--------------------------------------------------------------------------------
/genome/Hic/haphic/split_haphic_step1.sh:
--------------------------------------------------------------------------------
### https://github.com/zengxiaofei/HapHiC

micromamba activate haphic

genome=out_correct.fa
cpu=80
nchrs=23
inflation_step=0.1
min_inflation=1
max_inflation=1.5

haphic cluster --remove_concentrated_links --remove_allelic_links 2 --threads ${cpu} --correct_nrounds 2 --correct_resolution 250 ${genome} HiC.filtered.bam ${nchrs} --inflation_step ${inflation_step} --min_inflation ${min_inflation} --max_inflation ${max_inflation}
--------------------------------------------------------------------------------
/genome/Hic/haphic/split_haphic_step2.sh:
--------------------------------------------------------------------------------
### https://github.com/zengxiaofei/HapHiC

micromamba activate haphic

genome=corrected_asm.fa
cpu=80
nchrs=23
best_inflation=1.5
ambiguous_cutoff=0.4
remove_allelic_links=2

haphic reassign --ambiguous_cutoff ${ambiguous_cutoff} --remove_allelic_links ${remove_allelic_links} --nclusters ${nchrs} --threads ${cpu} ${genome} full_links.pkl inflation_${best_inflation}/mcl_inflation_${best_inflation}.clusters.txt paired_links.clm
--------------------------------------------------------------------------------
/genome/puerge_halp/purge_haplotigs/step2_custom_set.sh:
--------------------------------------------------------------------------------
#!/bin/bash
#https://blog.csdn.net/u012110870/article/details/100171429

micromamba activate purge_haplotigs

genome=YZ_CRAQ.fa
input=hifi_aln_sorted.bam.200.gencov
low_cut=0
mid_cut=90
high_cut=200
cpu=70

purge_haplotigs contigcov -i ${input} -o coverage_stats.csv -l ${low_cut} -m ${mid_cut} -h ${high_cut}
purge_haplotigs purge -g ${genome} -c coverage_stats.csv -b hifi_aln_sorted.bam -t $cpu -a 60 -v -d
--------------------------------------------------------------------------------
/picture/QQplot/QQplot.R:
--------------------------------------------------------------------------------
library(CMplot)
library(qqman)

args <- commandArgs (T)

results_log <- read.table(args[1], header=T)
p_value=results_log$P
z = qnorm(p_value/ 2)
lambda = round(median(z^2, na.rm = TRUE) / 0.454, 3)
lambda

pdf(args[2], width = 6, height = 6)

CMplot(results_log, plot.type = "q", threshold = 0.05, signal.cex=0.5, conf.int.col="grey", file="jpg", dpi=600, file.name=args[2], file.output=TRUE, verbose=F,cex=c(0.3,0.3))

dev.off()
--------------------------------------------------------------------------------
/picture/line/line.R:
--------------------------------------------------------------------------------
library(ggplot2)
args <- commandArgs (T)

# define the plotting function
plot_line_chart <- function(input_file, output_file, pos, data_column) {
  # read the data
  data <- read.table(input_file, header = TRUE)

  # draw the line chart; print() is needed so ggplot renders inside a function
  pdf(output_file, width = 15, height = 6)
  print(ggplot(data, aes_string(x = pos, y = data_column)) + geom_line() + theme_minimal())
  dev.off()
}

# call the function to draw the line chart
plot_line_chart(input_file = args[1], output_file = args[2], pos = args[3], data_column = args[4])
--------------------------------------------------------------------------------
/genome/puerge_halp/purge_haplotigs/step1_map.sh:
--------------------------------------------------------------------------------
#!/bin/bash

#source activate home_micromamba/envs/purge_haplotigs/

genome=YZ_CRAQ.fa
ccs_fa=yz.fasta.gz
cpu=70

minimap2 -ax map-hifi -t $cpu $genome $ccs_fa --secondary=no -o hifi_aln.sam
samtools faidx ${genome}
samtools view -@ $cpu -t ${genome}.fai -bS hifi_aln.sam -o hifi_aln.bam
samtools sort -@ $cpu hifi_aln.bam > hifi_aln_sorted.bam
rm hifi_aln.sam hifi_aln.bam

purge_haplotigs readhist -b hifi_aln_sorted.bam -g $genome -t ${cpu}
--------------------------------------------------------------------------------
/genome/Anno_homology/Spaln/Spaln.sh:
--------------------------------------------------------------------------------
genome=ABCD.fa
pep=all.pep.rmdup.fa
out=spaln.gff
cpu=10
db=$PWD/seqdb/genome
sif=/singularity_all/spaln3.sif

[ -d seqdb ] || mkdir seqdb
cp ${genome} seqdb/genome.gf

cd seqdb
/usr/bin/singularity run --bind $PWD/:$PWD/ ${sif} /spaln_data/bin/spaln -W -KP -g genome.gf
cd ../

/usr/bin/singularity run --bind $PWD/:$PWD/ ${sif} /spaln_data/bin/spaln -Q7 -LS -pw -S3 -O0 -pi -yE 10 -yL 30 -t ${cpu} -D ${db} ${pep} > ${out} 2> Log.spaln

rm -rf $PWD/seqdb
--------------------------------------------------------------------------------
/picture/Manhattan/qqman_qq_mhd.r:
--------------------------------------------------------------------------------
library(qqman)
args <- commandArgs(T)

results_log <- read.table(args[1], header=T)
p_value=results_log$P
z = qnorm(p_value/ 2)
lambda = round(median(z^2, na.rm = TRUE) / 0.454, 3)
lambda

jpeg(paste(args[2], ".jpeg", sep = "", collapse = ""))

qq(results_log$P, main = "Q-Q plot of GWAS p-values : log", xlim = c(0, 7), ylim = c(0, 12), pch = 18, col = "blue4", cex = 0.5, las = 1)
manhattan(results_log,chr="CHR",bp="BP",p="P",snp="SNP", main = "Manhattan plot")

dev.off()
--------------------------------------------------------------------------------
/Comparative_genomics/gene_family_cluster/sonicparanoid.sh:
--------------------------------------------------------------------------------
#!/bin/bash

source activate /micromamba/envs/sonicparanoid

cwd=$PWD
indir=./input
outdir=$PWD/result
cpu=35
prefix=fish51
inflation=2
MIN_BITSCORE=100

sonicparanoid -i $indir -o $outdir -p $prefix -t $cpu -m sensitive -I $inflation -op -bs $MIN_BITSCORE

#mv $outdir/runs/$prefix/* $outdir
#mv $outdir/ortholog_groups $outdir/$prefix

### remove tmp files !!!!!!!!!!!!!!!!!!!!!
#rm -rf $outdir/runs $outdir/orthologs_db $outdir/alignments
--------------------------------------------------------------------------------
/genome/Mit_genome/fa2gb.py:
--------------------------------------------------------------------------------
#!/01_software/miniconda3/bin/python3

import sys
import getopt
sys.path.append('/01_software/miniconda3/lib/python3.7/site-packages/')
from Bio import SeqIO

input_handle = open(sys.argv[1], "r")
output_handle = open(sys.argv[2], "w")

sequences = list(SeqIO.parse(input_handle, "fasta"))

# assign molecule type
for seq in sequences:
    seq.annotations['molecule_type'] = 'DNA'

count = SeqIO.write(sequences, output_handle, "genbank")

output_handle.close()
input_handle.close()
--------------------------------------------------------------------------------
/genome/Hic/haphic/bwa.sh:
--------------------------------------------------------------------------------
### https://github.com/zengxiaofei/HapHiC

micromamba activate haphic
export PATH="/01_soft/HapHiC/:$PATH"
export PATH="/01_soft/HapHiC/utils/:$PATH"

genome=$PWD/chr.fa
hic1=$PWD/hic_getreads.R1.fq.gz
hic2=$PWD/hic_getreads.R2.fq.gz
cpu=60

bwa index ${genome}
bwa mem -5SP -t ${cpu} ${genome} ${hic1} ${hic2} | samblaster | samtools view - -@ ${cpu} -S -h -b -F 3340 -o HiC.bam
filter_bam HiC.bam 1 --nm 3 --threads ${cpu} | samtools view - -b -@ ${cpu} -o HiC.filtered.bam

mock_agp_file.py ${genome} > ${genome}.agp
--------------------------------------------------------------------------------
/Comparative_genomics/Signal_peptide/DeepSig.sh:
--------------------------------------------------------------------------------
### https://github.com/BolognaBiocomp/deepsig
# micromamba activate DeepPeptide

source activate /home_micromamba/envs/DeepPeptide
export DEEPSIG_ROOT=/01_software/deepsig

input_pep=FBG.correct.gff.pep
output_txt=FBG.correct.gff.pep.deepsig
organism=euk ## euk, gramp, or gramn: the organism the sequences belong to
outfmt=gff3 ## json or gff3

/usr/bin/singularity exec --bind $PWD/:$PWD/ /01_soft/singularity_all/deepsig.sif deepsig.py -f ${input_pep} -o ${output_txt} -k ${organism} -m ${outfmt}
--------------------------------------------------------------------------------
/genome/Anno_integrate/alignAssembly.config:
--------------------------------------------------------------------------------

## templated variables to be replaced exist as <__var_name__>

# database settings
DATABASE=/workdir/new.db

#######################################################
# Parameters to specify to specific scripts in pipeline
# create a key = "script_name" + ":" + "parameter"
# assign a value as done above.

#script validate_alignments_in_db.dbi
validate_alignments_in_db.dbi:--MIN_PERCENT_ALIGNED=80
validate_alignments_in_db.dbi:--MIN_AVG_PER_ID=80

#script subcluster_builder.dbi
subcluster_builder.dbi:-m=50
--------------------------------------------------------------------------------
/genome/assess/omark.sh:
--------------------------------------------------------------------------------
#https://github.com/DessimozLab/OMArk
### Attention!!! ###
## before running OMArk, create a .etetoolkit directory in your home dir.

input=EFGH
outdir=omark_assess
database=/06_database/OMArk_database/LUCA.h5

[ -d ${outdir} ] || mkdir -p ${outdir}

## source activate /micromamba/envs/OMArk
# omamer -h
# omark -h

/01_soft/mambaforge/bin/micromamba run -n OMArk omamer search --db ${database} --query ${input} --out ${outdir}/${input}.omamer
/01_soft/mambaforge/bin/micromamba run -n OMArk omark -f ${outdir}/${input}.omamer -d ${database} -o ${outdir}
--------------------------------------------------------------------------------
/genome/Anno_homology/gth/gth.sh:
--------------------------------------------------------------------------------
genome=ABCD.fa
pep=pep.fa
outgff=GTH_predict.gff

gth -intermediate -gff3out -genomic ${genome} -protein ${pep} > ${outgff}
rm ${genome}.dna.* ${pep}.protein.* *md5

awk '$3=="gene" || $3=="exon"' GTH_predict.gff | sed 's/exon/CDS/g;s/gene/mRNA/g' | awk -F ";" '{print $1";"}' > GTH_predict.bgi.gff
grep -v "#" GTH_predict.bgi.gff | gffread -C -G -K -Q -Y -M --cset -d dup -H -V -P -N -Z - -g ${genome} -o gth.deal.gff
awk '$3=="mRNA" || $3=="CDS"' gth.deal.gff | awk -F ";" '{print $1";"}' > gth.deal.bgi.gff
Covert_for_evm.pl gth.deal.bgi.gff gth > gth.gff.forevm.gff3
--------------------------------------------------------------------------------
/genome/Anno_EviAnn/EviAnn.sh:
--------------------------------------------------------------------------------
#!/bin/bash

genome=Stichopus_variegatus.fa
trans=transcripts.fa
pep=proteins.faa
cpu=15

/01_software/EviAnn-2.0.2/bin/eviann.sh -t $cpu -g $genome -e $PWD/$trans -p $PWD/$pep --partial --debug -l

[ -d out_final ] || mkdir out_final
mv ${genome}.pseudo_label.gff out_final
mv ${genome}.transcripts.fasta out_final
mv ${genome}.proteins.fasta out_final
cp *.sh.o* out_final/log.txt

#rm ${genome}.* tissue* *stringtie*.sh *success broken* makeblastdb.out blastp2.out combine.out makeblastdb.sex2mex.out blastp5.out proteins.faa.uniq miniprot.err check_cds.out
--------------------------------------------------------------------------------
/genome/Hic/haphic/split_haphic_step0.sh:
--------------------------------------------------------------------------------
### https://github.com/zengxiaofei/HapHiC

micromamba activate haphic

genome=merge.hap.fa
cpu=70
nchrs=23

# HapHiC will ignore the parameter "nchrs", it can be any integer
haphic pipeline ${genome} HiC.filtered.bam ${nchrs} --quick_view

# Correct input contigs before a quick view
#haphic pipeline ${genome} HiC.filtered.bam ${nchrs} --quick_view --correct_nrounds 2

# Partition contigs into different haplotypes in quick view mode
# haphic pipeline ${genome} HiC.filtered.bam ${nchrs} --quick_view --gfa "XXX.hap1.p_ctg.gfa,XXX.hap2.p_ctg.gfa" --correct_nrounds 2
--------------------------------------------------------------------------------
/Comparative_genomics/short_Peptide/short_Peptide_predict:
--------------------------------------------------------------------------------
source activate /home_micromamba/envs/DeepPeptide

###
ln -s /01_soft/DeepPeptide/predictor/* ./

fasta=FBG.correct.gff.pep
outdir=DeepPeptide
batch_size=100 ## a bigger batch_size needs more memory

mkdir ${outdir}

python /01_soft/DeepPeptide/predictor/predict.py --fastafile ${fasta} --output_dir ${outdir} --esm esm2 --esm_pt /01_soft/DeepPeptide/predictor/checkpoints/esm2_t33_650M_UR50D.pt
#python /01_soft/DeepPeptide/predictor/predict.py --fastafile ${fasta} --output_dir ${outdir} --esm esm1b --esm_pt /01_soft/DeepPeptide/predictor/checkpoints/esm1b_t33_650M_UR50S.pt
--------------------------------------------------------------------------------
/genome/Segmental_duplication/biser.sh:
--------------------------------------------------------------------------------
#!/bin/bash
set +o posix
set -eo pipefail

source activate /01_software/mamba/envs/biser

genome=Strongylocentrotus_purpuratus.fa
output=Strongylocentrotus_purpuratus
cpu=1
tempdir=$PWD/temp

samtools faidx hardmask.fa
biser --gc-heap 2G --hard --threads ${cpu} --output ${output}.SD.bed --keep-contigs --keep-temp --temp ${tempdir} hardmask.fa
#biser --resume ${tempdir}/biser.XXXXXXXX(change here) --gc-heap 2G --hard --threads ${cpu} --output ${output} --keep-contigs --keep-temp --no-decomposition --temp ${tempdir} hardmask.fa
rm hardmask.fa HiTE.gff HiTE.bed hardmask.fa.fai
--------------------------------------------------------------------------------
/picture/Normality.Test2/Normality.Test2.R:
--------------------------------------------------------------------------------
library("ggpubr")
args <- commandArgs (T)

indata <- read.table(args[1], header=T, sep="\t", quote="")

# define the test function
Normality_test <- function(input_file, type) {

  indata <- read.table(input_file, header=T, sep="\t", quote="")

  #pdf(output_file, width = 7, height = 6)

  #ggdensity(indata, x= type, main = "Density")

  #ggqqplot(indata, x= type) ### color = group , palette = c("#00AFBB", "#E7B800"))

  expression_to_eval <- paste0("indata$", type)
  shapiro.test(eval(parse(text = expression_to_eval)))

}

Normality_test(input_file = args[1], type = args[2])
--------------------------------------------------------------------------------
/genome/Anno_RNA/minimap2/minimap2.sh:
--------------------------------------------------------------------------------

genome=HCS_chr.fa
trans=trans.fa
cpu=40

minimap2 -ax splice:hq -uf ${genome} ${trans} -t ${cpu} > aln.sam
sam2gff.pl aln.sam > aln.sam.gff3
awk '{print $0";"}' aln.sam.gff3 | sed 's/exon/CDS/g' > aln.sam.gff3.tmp

##
export PERL5LIB=/01_software/TransDecoder-TransDecoder-v5.5.0/PerlLib:$PERL5LIB
sed 's/exon/CDS/g' aln.sam.gff3 > tmp.gff
gff2gtf_v2.pl tmp.gff aln.sam.gtf
rm tmp.gff
perl /01_software/TransDecoder-TransDecoder-v5.5.0/util/gtf_genome_to_cdna_fasta.pl aln.sam.gtf ${genome} > transcripts.fasta
/01_software/TransDecoder-TransDecoder-v5.5.0/TransDecoder.LongOrfs -m 50 -t transcripts.fasta
--------------------------------------------------------------------------------
/genome/Telomere/Telomere_tidk.sh:
--------------------------------------------------------------------------------
genome=Stichopus_variegatus.fa
out_dir=test1

## download tidk_database.csv to ~/.local/share/tidk
#/software/miniconda3/bin/tidk build

[ -d ${out_dir} ] || mkdir ${out_dir}

## explore
/software/miniconda3/bin/tidk explore --distance 0.05 --minimum 5 --maximum 7 -t 30 ${genome} > ${out_dir}/candicate_TR_unit.tsv

## find
for i in $(cat ${out_dir}/candicate_TR_unit.tsv | sed '1d' | awk '{print $1}'); do /software/miniconda3/bin/tidk search -s ${i} -o find_${i} -d ${out_dir} -w 50000 ${genome} ; done

## check
for i in $(ls ${out_dir}/*_windows.tsv) ; do awk '$3>50 || $4>50' ${i} > ${i}.check.tsv ; done
--------------------------------------------------------------------------------
/picture/Manhattan/Manhattan.R:
--------------------------------------------------------------------------------
library(CMplot)
args <- commandArgs (T)

indata <- read.table(args[1], header=T, sep="\t", quote="")
#pdf(args[2], width = 15, height = 6)

CMplot(indata, plot.type="m",
       # col=c("grey30","grey60"),
       LOG10=T,
       ylim=c(0,10),
       cex=c(0.0001,0.0001),
       threshold=c(as.numeric(args[3]),as.numeric(args[4])),
       threshold.lty=c(1,2), threshold.lwd=c(1,1), threshold.col=c("black","grey"),
       amplify=T, chr.den.col=NULL,
       signal.col=c("red","green"), signal.cex=c(0.5,0.5), signal.pch=c(19,19),
       file="jpg", dpi=600, file.name=args[2], file.output=TRUE, verbose=F)
--------------------------------------------------------------------------------
/Comparative_genomics/gene_family_cluster/orthofinder.sh:
--------------------------------------------------------------------------------
cwd=$PWD
indir=$cwd/00_data
cpu=10
prefix=Fish
### Options: blast, mmseqs, blast_gz, diamond
software=diamond

/usr/bin/singularity exec orthofinder.sif orthofinder -f $indir -t $cpu -a $cpu -S $software -n $prefix -p $cwd

## add species / remove species
# /usr/bin/singularity exec orthofinder.sif orthofinder -f $indir -t $cpu -a $cpu -S $software -n $prefix -p $cwd --fewer-files -X

### remove tmp files !!!!!!!!!!
#rm -rf $indir/Results_*/WorkingDirectory $indir/Results_*/Orthologues*/*_Trees $indir/Results_*/Orthologues*/WorkingDirectory

#mv $indir/Results_* $cwd/orthofinder_${prefix}
--------------------------------------------------------------------------------
/genome/Anno_denovo/galba.sh:
--------------------------------------------------------------------------------

Species=HCS
genome=HCS_chr.softmask.fa
pep=train.pep
thread=70

/usr/bin/singularity exec -B $PWD/:$PWD/ galba.sif galba.pl --species=${Species} --genome=${genome} --prot_seq=${pep} --AUGUSTUS_CONFIG_PATH=/0_soft/augustus/Augustus/config --threads ${thread}

## gtf2gff.pl is assumed to read the GALBA gtf on stdin and write gff3 on stdout
gtf2gff.pl <GALBA/galba.gtf >GALBA/galba.deal.gff3
change.name.pl GALBA/galba.deal.gff3 GALBA_ > GALBA/galba.bgi.gff
gffread GALBA/galba.bgi.gff -g ${genome} -x GALBA/galba.bgi.gff.cds -y GALBA/galba.bgi.gff.pep
--------------------------------------------------------------------------------
/python/zscore/zscore.py:
--------------------------------------------------------------------------------
## usage: python zscore.py input output

import sys
import pandas
from pandas import read_excel
from sklearn import preprocessing

input_file = sys.argv[1]
output_file = sys.argv[2]

dataset = pandas.read_csv(input_file, index_col=0)
# dataframe to array
values = dataset.values
# define data type
values = values.astype(float)
# stat zscore
data = preprocessing.scale(values)
# array to dataframe
df = pandas.DataFrame(data)
# name columns
df.columns = dataset.columns
# name rows
df.index = dataset.index
# output file ps: three decimal places
df.to_csv(output_file, float_format='%.3f', sep='\t')
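
## note (a sketch, not part of the original script): the same per-column
## z-score can be computed with pandas alone; ddof=0 matches sklearn's
## preprocessing.scale, which uses the population standard deviation.
# df2 = (dataset - dataset.mean()) / dataset.std(ddof=0)
# df2.to_csv(output_file, float_format='%.3f', sep='\t')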
--------------------------------------------------------------------------------
/picture/box2/box.R:
--------------------------------------------------------------------------------
args <- commandArgs (T)
library(rlang)
library(ggstatsplot)

indata <- read.table(args[1], header=T, sep="\t", quote="")
pdf(args[2], width = 6, height = 6)

ggbetweenstats(
  data = indata,
  x = !!sym(args[3]),
  y = !!sym(args[4]),
  plot.type = "boxviolin", ### "boxviolin" "box" "violin"
  p.adjust.method = "bonferroni", ### "bonferroni" "fdr" "BH" "hochberg"
  pairwise.comparisons = TRUE, ### "TRUE" "FALSE"
  pairwise.display = "significant", ### "significant" "non-significant" "everything"
  type = "nonparametric" ### "nonparametric" "parametric" "robust" "bayes"
)
--------------------------------------------------------------------------------
/Comparative_genomics/gene_family/01_blastp.sh:
--------------------------------------------------------------------------------
gffread ./00_data/C_albu.bgi.gff -g ./00_data/C_albu.fa -x ./00_data/cds.fa
seqkit translate --trim --clean ./00_data/cds.fa > ./00_data/pep.fa
makeblastdb -in ./00_data/TLR_protein.fasta -dbtype prot -out blastdb
blastp -query ./00_data/pep.fa -db blastdb -evalue 1e-05 -seg yes -outfmt '7 qseqid qstart qend sseqid sstart send qlen slen length pident evalue' -num_threads 5 > ./1_identify_gene_family/01_blastp/blastp.txt
grep -v '#' ./1_identify_gene_family/01_blastp/blastp.txt | awk '$9/$8 > 0.8 || $9/$7 > 0.8' > ./1_identify_gene_family/01_blastp/blastp.txt.filt
awk '{print $1}' ./1_identify_gene_family/01_blastp/blastp.txt.filt | sort -u > ./1_identify_gene_family/01_blastp/blast.filt.id
--------------------------------------------------------------------------------
/genome/evaluate_orf_cds/evaluate_orf_cds.sh:
--------------------------------------------------------------------------------
input=helixer.bgi.gff.cds

## CPC2
# https://github.com/gao-lab/CPC2_standalone
/01_software/CPC/CPC2_standalone-1.0.1/bin/CPC2.py -i ${input} -o ${input}.CPC.txt

## PSAURON
# https://github.com/salzberg-lab/PSAURON
# Note: internal stop codons are ignored by PSAURON. A high PSAURON score does not guarantee a sequence contains a valid ORF. This is intended behavior, as alternate frame scores are used by default to boost the power of the model.
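
## (sketch) pulling the IDs that CPC2 flags as noncoding; the label lives in
## the last column of the CPC2 table -- position assumed from CPC2 v1.0.1
## output, so check it against your version first.
# awk 'NR>1 && $NF=="noncoding"{print $1}' ${input}.CPC.txt > ${input}.noncoding.id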
10 | export MAMBA_EXE='/01_soft/mamba/bin/micromamba' 11 | export MAMBA_ROOT_PREFIX='/home_micromamba' 12 | micromamba activate 13 | source activate /home_micromamba/envs/psauron 14 | 15 | psauron -i ${input} -o ${input}.PSAURON.csv 16 | -------------------------------------------------------------------------------- /genome/puerge_halp/kmerdup/step4_refilter.sh: -------------------------------------------------------------------------------- 1 | export PATH="/01_soft/kmerDedup/:$PATH" 2 | 3 | prefix=YZ 4 | mer_len=19 5 | work_dir=$PWD 6 | bam_dir=${work_dir}/mapping 7 | cpu=30 8 | out_dir=kmerdedup_refilter 9 | mpr=0.3 10 | whitelist=white.list ## whitelist to keep 11 | blacklist=black.list ## blacklist to remove 12 | 13 | ## -mpr max duplication percentage [0.3] 14 | ## -mcv min k-mer coverage(%) [30] 15 | ## -mode <1/2> 1:ratio only; 2:ratio * cov [2] 16 | 17 | perl /01_soft/kmerDedup/kmerDedup/kmerDedup.pl -k ${prefix} -mpr ${mpr} -mcv 30 -kmer ${mer_len} -o ${out_dir} -f ${prefix}.format.fa -dum ${prefix}.dump.hash ./ -cov ${prefix}.cov.stat -s samtools -t ${cpu} -mode 2 -wtl ${whitelist} -bll ${blacklist} 18 | -------------------------------------------------------------------------------- /genome/noncoding/noncoding_predict.sh: -------------------------------------------------------------------------------- 1 | genome=genome.fa 2 | threads=48 3 | 4 | ## miscRNA from egapx 5 | 6 | ## barrnap for rRNA 7 | /01_software/barrnap-0.9/bin/barrnap --kingdom euk --threads ${threads} ${genome} > rRNA_barrnap.gff 8 | 9 | ## aragorn for tRNA 10 | #/soft/aragorn -mt -a -t -m -i ${genome} -o tRNA.tsv 11 | 12 | ## tRNAscan_SE for tRNA 13 | export PATH=$PATH:/noncoding_soft/tRNAscan-SE-2.0/bin 14 | export PERL5LIB=/noncoding_soft/tRNAscan-SE-2.0/lib:$PERL5LIB 15 | export PATH=$PATH:/noncoding_soft/tRNAscan-SE-2.0 16 | 17 | tRNAscan_SE_config=/noncoding_soft/tRNAscan-SE-2.0/tRNAscan-SE.conf 18 | ## for vert vertebrate 19 | tRNAscan-SE -q -o tRNA.tsv -m statistics.summary -f tRNA_secondary.structures -M vert -c ${tRNAscan_SE_config} ${genome} 20 | -------------------------------------------------------------------------------- /transcriptome/full_length_transcriptome/flair_analyze_NCBI_SRA_full_length_transcriptome.sh: -------------------------------------------------------------------------------- 1 | ## https://flair.readthedocs.io/en/latest/other_ways.html 2 | 3 | genome=fcs.fa 4 | gtf=fcs.gtf 5 | fq=SRR17056084.fastq 6 | cpu=50 7 | 8 | 9 | ### module numbers: align=1, correct=2, collapse=3, collapse-range=3.5, quantify=4, diffExp=5, diffSplice=6 10 | flair align -r ${fq} -g ${genome} -t ${cpu} --junction_bed out_junction.bed 11 | 12 | flair correct -q flair.aligned.bed -g ${genome} --threads ${cpu} -f ${gtf} 13 | 14 | [ -d temp_flair ] || mkdir temp_flair 15 | flair collapse -r ${fq} -q flair_all_corrected.bed -g ${genome} -o flair.output --temp_dir temp_flair -t ${cpu} --keep_intermediate -f ${gtf} --no_gtf_end_adjustment --max_ends 5 --check_splice --generate_map 16 | -------------------------------------------------------------------------------- /genome/puerge_halp/kmerdup/miniprot.sh: -------------------------------------------------------------------------------- 1 | genome=curated.fasta 2 | pep=YZ.pep 3 | prefix=EFGH_ ### prefix for IDs in GFF3 4 | cpu=70 5 | max_intron_size=20k ### max intron size [200k] 6 | splice_model=1 ### splice model: 2=mammal, 1=general, 0=none (see Detail) [1] 7 | weight_of_splice_penalty=1 ### weight of splice penalty; 0 to ignore splice signals [1] 8 | 9 | 
miniprot -G ${max_intron_size} -j ${splice_model} -t ${cpu} --gff -P ${prefix} -C ${weight_of_splice_penalty} ${genome} ${pep} --outs=0.99 > EFGH.gff 10 | 11 | grep -A 1 "##PAF" EFGH.gff | awk '$1!~/--/' | paste - - | awk '($5-$4)/$3>0.8' | awk '{print $2"\t"$3"\t"$4"\t"$5"\t"$(NF-10)"\t"$(NF-6)"\t"$(NF-5)"\t"$(NF-4)"\t"$(NF-2)}' | sed 's/;/\t/g;s/ID=//g' > filter.info 12 | -------------------------------------------------------------------------------- /genome/ragtag/ragtag.sh: -------------------------------------------------------------------------------- 1 | source activate /01_software/conda/envs/ragtag/ 2 | 3 | # scaffold with multiple references/maps 4 | ragtag.py scaffold -t 48 -o out_1 chr.rename.fa polish_contig.fa 5 | ragtag.py scaffold -t 20 -o out_2 GCF_963930695.1_fLabBer1.1_genomic.fna polish_contig.fa 6 | ragtag.py scaffold -t 20 -o out_3 GCF_963584025.1_fLabMix1.1_genomic.fna polish_contig.fa 7 | ragtag.py scaffold -t 20 -o out_4 GCF_009762535.1_fNotCel1.pri_genomic.fna polish_contig.fa 8 | 9 | ragtag.py merge out_correct.fa out_*/*.agp -o Merge1 10 | ragtag.py merge out_correct.fa out_1/*.agp out_2/*.agp out_4/*.agp out_6/*.agp -o Merge2 11 | ragtag.py merge --gap-func max out_correct.fa out_1/*.agp out_2/*.agp out_4/*.agp out_6/*.agp -o Merge3 12 | 13 | ## remove other characters 14 | # perl filter.pl ragtag.scaffold.fasta > chr.fa 15 | -------------------------------------------------------------------------------- /picture/DensityHeatmap/huoli2.r: -------------------------------------------------------------------------------- 1 | args <- commandArgs(T) 2 | 3 | library(MASS) 4 | library(LSD) 5 | library(ggplot2) 6 | library(ggthemes) 7 | 8 | #pdf(paste(args[1], ".huoli2.pdf", sep = "", collapse = "")) 9 | png(paste(args[1], ".huoli2.png", sep = "", collapse = "")) 10 | 11 | DF <- read.table(args[1], header = F, sep="\t") 12 | 13 | x <- DF$V1 14 | y <- DF$V2 15 | dens <- kde2d(x,y) 16 | 17 | gr <- data.frame(with(dens, expand.grid(x,y)), as.vector(dens$z)) 18 | names(gr) <- c("xgr", "ygr", "zgr") 19 | 20 | mod <- loess(zgr~xgr*ygr, data=gr) 21 | 22 | DF$pointdens <- predict(mod, newdata=data.frame(xgr=x, ygr=y)) 23 | 24 | p <- ggplot(DF, aes(x=x,y=y, color=pointdens)) + theme_base() + scale_colour_gradientn(colours = colorpalette('heat', 5)) 25 | p <- p + geom_point() 26 | p <- p + ggtitle('heatscatter') 27 | p 28 | -------------------------------------------------------------------------------- /genome/TE/DeepTE.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | source activate /01_soft/mamba/envs/DeepTE 3 | 4 | input_fasta=$PWD/unknow.fa 5 | species=M # P or M or F or O. P:Plants, M:Metazoans, F:Fungi, and O: Others. 
6 | tmp_dir=DeepTE_TmpDir
7 | output_dir=DeepTE_OutDir
8 | Model_dir=/01_soft/DeepTE/Model/Metazoans_model # Metazoans_model Fungi_model Others_model UNS_model
9 | script=/01_soft/DeepTE/DeepTE.py
10 | probability_threshold=0.6
11 | 
12 | mkdir ${tmp_dir}
13 | mkdir ${output_dir}
14 | 
15 | python3 /01_soft/DeepTE/DeepTE_domain.py -d ${tmp_dir} -o ${output_dir} -i ${input_fasta} -s /01_soft/DeepTE/supfile_dir --hmmscan /01_soft/bin/hmmscan
16 | 
17 | python3 ${script} -d ${tmp_dir} -o ${output_dir} -i ${input_fasta} -sp ${species} -m_dir ${Model_dir} -prop_thr ${probability_threshold} -modify ${output_dir}/opt_te_domain_pattern.txt
18 | 
-------------------------------------------------------------------------------- /genome/ragtag/filter.pl: --------------------------------------------------------------------------------
1 | use strict;
2 | use warnings;
3 | use Bio::SeqIO;
4 | 
5 | ## --help
6 | # check input
7 | my $infile = $ARGV[0] or die "Usage: perl script.pl <input_file> [output_file]\n";
8 | my $outfile = $ARGV[1];
9 | 
10 | # open input
11 | my $in = Bio::SeqIO->new(-file => $infile, -format => 'Fasta');
12 | 
13 | # output check
14 | my $out;
15 | if ($outfile) {
16 |     $out = Bio::SeqIO->new(-file => ">$outfile", -format => 'Fasta');
17 | } else {
18 |     $out = Bio::SeqIO->new(-fh => \*STDOUT, -format => 'Fasta');
19 | }
20 | 
21 | while (my $seq = $in->next_seq()) {
22 |     my $sequence = $seq->seq;
23 | 
24 |     # remove non-ATCGN
25 |     $sequence =~ s/[^ATCGNatcgn]//g;
26 | 
27 |     $seq->seq($sequence);
28 |     $out->write_seq($seq);
29 | }
30 | 
31 | $in->close();
32 | $out->close() if $outfile;
33 | 
-------------------------------------------------------------------------------- /genome/puerge_halp/purge_dups.sh: --------------------------------------------------------------------------------
1 | Use purge_dups to remove redundancy (haplotigs) from a genome assembly
2 | 1. Software installation
3 | ----------------------------------------------
4 | git clone https://github.com/dfguan/purge_dups.git
5 | cd purge_dups/src && make
6 | -----------------------------------------------
7 | 2. Run the pipeline
8 | -----------------------------------------------
9 | # Step 1
10 | minimap2 -t 5 -xasm5 -DP assembly.fa pacbio.fa.gz | gzip -c - > pb_aln.paf.gz
11 | pbcstat pb_aln.paf.gz
12 | calcuts PB.stat > cutoffs 2> calcults.log
13 | # Step 2
14 | split_fa assembly.fa > asm.split
15 | minimap2 -t 5 -xasm5 -DP asm.split asm.split | gzip -c - > assembly.fasta.split.self.paf.gz
16 | # Step 3
17 | purge_dups -2 -T cutoffs -c PB.base.cov assembly.fasta.split.self.paf.gz > dups.bed 2> purge_dups.log
18 | # Step 4
19 | get_seqs dups.bed assembly.fa
20 | -----------------------------------------------
21 | 
-------------------------------------------------------------------------------- /genome/Segmental_duplication/biser_split.sh: --------------------------------------------------------------------------------
1 | #!/bin/bash
2 | set +o posix
3 | set -eo pipefail
4 | 
5 | source activate /01_software/mamba/envs/biser
6 | 
7 | genome=Strongylocentrotus_purpuratus.fa
8 | output=Strongylocentrotus_purpuratus
9 | cpu=2
10 | tempdir=$PWD/temp
11 | 
12 | [ -d ${tempdir} ] || mkdir ${tempdir}
13 | 
14 | ## scaffold to contig and mask
15 | bedtools maskfasta -fi ${genome} -fo mask.fa -bed HiTE.bed -mc "_"
16 | scaffold_to_contig.pl mask.fa > mask_contig.fa
17 | scaffold_to_contig.pl -out contig_coor mask.fa > softmask_contig.fa.coor
18 | seqkit replace -p "_" -r "N" -s mask_contig.fa > hardmask.fa
19 | rm mask.fa mask_contig.fa
20 | 
21 | samtools faidx hardmask.fa
22 | biser --gc-heap 2G --hard --threads ${cpu} --output ${output}.SD.bed --keep-contigs --keep-temp --temp ${tempdir} hardmask.fa
23 | rm hardmask.fa HiTE.gff HiTE.bed hardmask.fa.fai
24 | 
-------------------------------------------------------------------------------- /genome/Hic/yahs/step1_ass.sh: --------------------------------------------------------------------------------
1 | #!/bin/bash
2 | set +o posix
3 | 
4 | genome=YZ_CRAQ.fa
5 | hic_fq1=YZ_clean.R1.fq.gz
6 | hic_fq2=YZ_clean.R2.fq.gz
7 | cpu=48
8 | 
9 | ### chromap map
10 | samtools faidx ${genome}
11 | chromap -i -r ${genome} -o genome.index
12 | chromap --preset hic -r ${genome} -x genome.index --remove-pcr-duplicates -1 ${hic_fq1} -2 ${hic_fq2} --SAM -o aligned.sam -t ${cpu}
13 | samtools view -bh aligned.sam | samtools sort -@ ${cpu} -n > aligned.bam
14 | rm aligned.sam
15 | 
16 | ### yahs
17 | /00_software/yahs/yahs ${genome} aligned.bam
18 | 
19 | ### juicer
20 | /00_software/yahs/juicer pre -a -o out_Juicer yahs.out.bin yahs.out_scaffolds_final.agp ${genome}.fai > Log.txt 2>&1
21 | 
22 | juicer=/00_software/juicer_tools_1.19.02.jar
23 | /01_soft/mambaforge/bin/java -Xmx36G -jar $juicer pre out_Juicer.txt out_Juicer.hic <(cat Log.txt | grep "PRE_C_SIZE" | awk '{print $2" "$3}')
24 | 
-------------------------------------------------------------------------------- /other/filter_fasta_non-ATCGN_characters.pl: --------------------------------------------------------------------------------
1 | use strict;
2 | use warnings;
3 | use Bio::SeqIO;
4 | 
5 | # check that an input file was provided
6 | my $infile = $ARGV[0] or die "Usage: perl script.pl <input_file> [output_file]\n";
7 | my $outfile = $ARGV[1]; # optional output file
8 | 
9 | # open the input file
10 | my $in = Bio::SeqIO->new(-file => $infile, -format => 'Fasta');
11 | 
12 | # write to the given output file, otherwise to standard output
13 | my $out;
14 | if ($outfile) {
15 |     $out = Bio::SeqIO->new(-file => ">$outfile", -format => 'Fasta');
16 | } else {
17 |     $out = Bio::SeqIO->new(-fh => \*STDOUT, -format => 'Fasta');
18 | }
19 | 
20 | # iterate over every sequence
21 | while (my $seq = $in->next_seq()) {
22 |     my $sequence = $seq->seq;
23 | 
24 |     # remove non-ATCGN characters
25 |     $sequence =~ s/[^ATCGNatcgn]//g;
26 | 
27 |     # update the sequence and write it to the file or standard output
28 |     $seq->seq($sequence);
29 |     $out->write_seq($seq);
30 | }
31 | 
32 | # close the input handle
33 | $in->close();
34 | $out->close() if $outfile; # close the output handle only when writing to a file
35 | 
-------------------------------------------------------------------------------- /deal_fasta/filter_err_fasta/find_err_dna.py: --------------------------------------------------------------------------------
1 | 
2 | import sys
3 | from Bio import SeqIO
4 | 
5 | def check_dna_sequences(fasta_file):
6 |     # Define legal DNA/RNA character sets
7 |     valid_chars = set("ATCGatcg")
8 | 
9 |     with open(fasta_file, "r") as file:
10 |         for record in SeqIO.parse(file, "fasta"):
11 |             sequence = str(record.seq)
12 |             invalid_chars = [char for char in sequence if char not in valid_chars]
13 |             if invalid_chars:
14 |                 print(f">{record.id}")
15 |                 print(f"{sequence}")
16 |                 # print(f"Illegal_character: {set(invalid_chars)}")
17 |                 # print("-" * 40)
18 | 
19 | if __name__ == "__main__":
20 |     if len(sys.argv) != 2:
21 |         print("Usage: python script.py <fasta_file>")
22 |         sys.exit(1)
23 | 
24 |     fasta_file_path = sys.argv[1]
25 |     check_dna_sequences(fasta_file_path)
26 | 
-------------------------------------------------------------------------------- /Comparative_genomics/blast/blast-costom-outformat.sh: --------------------------------------------------------------------------------
1 | #!/bin/bash
2 | set +o posix
3 | set -eo pipefail
4 | #export BLAST_USAGE_REPORT=false
5 | 
6 | if [[ $# == '0' ]]; then
"usage: bidui target query fasta_type soft output_name cpu" 8 | echo "note: fasta_type: nucl; prot" 9 | echo "note: soft: blastn; blastp" 10 | echo "example: bidui db.fa test.fa nucl blastn result.txt 10" 11 | echo " bidui db.fa test.fa prot blastp result.txt 10" 12 | exit 1 13 | fi 14 | 15 | target=$1 16 | query=$2 17 | fasta_type=$3 18 | soft=$4 19 | output_name=$5 20 | cpu=$6 21 | soft_path=ncbi-blast-2.13.0+/bin 22 | 23 | ${soft_path}/makeblastdb -in ${target} -dbtype ${fasta_type} -out ./blastdb/${target} -parse_seqids 24 | ${soft_path}/${soft} -task ${soft} -db ./blastdb/${target} -query ${query} -out ${output_name} -outfmt '7 qseqid qstart qend sseqid sstart send qlen slen length pident evalue' -num_threads ${cpu} 25 | -------------------------------------------------------------------------------- /picture/genome_Circos/ticks.conf: -------------------------------------------------------------------------------- 1 | # 是否显示 ticks 2 | show_ticks = yes 3 | # 是否显示 ticks 的 lables 4 | show_tick_labels = yes 5 | ## 设定 ticks 6 | 7 | ## ticks 的设置 8 | # 设定 ticks 的位置 9 | radius = 1r 10 | # 设定 ticks 的颜色 11 | color = black 12 | # 设定 ticks 的厚度 13 | thickness = 2p 14 | # 设定 ticks' label 的值的计算。将该刻度对应位置的值 * multiplier 得到能展示到圈图上的 label 值。 15 | multiplier = 1e-6 16 | # label 值的格式化方法。%d 表示结果为整数;%f 结果为浮点数; %.1f 结果为小数点后保留1位; %.2f 结果为小数点后保留2位。 17 | format = %d 18 | ## 以下设置了 2 个 ticks,前者是小刻度,后者是大刻度。 19 | 20 | # 设置每个刻度代表的长度。若其单位为u,则必须要设置chromosomes_units参数。比如设置chromosomes_units=1000000,则如下5u表示每个刻度代表5M长度 21 | spacing = 1u 22 | # 设置 tick 的长度 23 | size = 5p 24 | 25 | 26 | spacing = 5u 27 | size = 15p 28 | # 由于设置的是大刻度,以下用于设置展示 ticks' label。 29 | show_label = yes 30 | # 设置 ticks' label 的字体大小 31 | label_size = 20p 32 | # 设置 ticks' label 离 ticks 的距离 33 | label_offset = 10p 34 | format = %d 35 | 36 | 37 | -------------------------------------------------------------------------------- /deal_fasta/filter_err_fasta/find_err_pep.py: -------------------------------------------------------------------------------- 1 | 2 | import sys 3 | from Bio import SeqIO 4 | 5 | def check_protein_sequences(fasta_file): 6 | # Define legal protein character sets 7 | valid_chars = set("ACDEFGHIKLMNPQRSTVWYacdefghiklmnpqrstvwy") 8 | 9 | with open(fasta_file, "r") as file: 10 | for record in SeqIO.parse(file, "fasta"): 11 | sequence = str(record.seq) 12 | invalid_chars = [char for char in sequence if char not in valid_chars] 13 | if invalid_chars: 14 | print(f">{record.id}") 15 | print(f"{sequence}") 16 | # print(f"Illegal_character: {set(invalid_chars)}") 17 | # print("-" * 40) 18 | 19 | if __name__ == "__main__": 20 | if len(sys.argv) != 2: 21 | print("Usage: python script.py ") 22 | sys.exit(1) 23 | 24 | fasta_file_path = sys.argv[1] 25 | check_protein_sequences(fasta_file_path) 26 | -------------------------------------------------------------------------------- /readme.md: -------------------------------------------------------------------------------- 1 | The script sources are complex, and we thank the authors of some scripts in the corresponding directories. 2 | If there are any omissions, please inform us in the issue. 3 | 4 | The main creators include: 5 | 6 | 7 | 8 | 9 | 10 | 11 | 12 | 13 | 14 | 15 | 16 | 17 | 18 | 19 | 20 | 21 | 22 | 23 | 24 | 25 | 26 | 27 | 28 | 29 | 30 | 31 | 32 | 33 | 34 | 35 | 36 | 37 | 38 | 39 | 40 | 41 | 42 | 43 |
| Chinese Name | Name | E-mail | Blog / Other |
| ------------ | ---- | ------ | ------------ |
| 李硕 | Biols | shiyeyishang@outlook.com | https://bioinformls.com https://www.researchgate.net/profile/Shuo_Li37 |
| 十七岁天菜少年 | | 18137879861@163.com | |
| 韩圣磊 | hanshenglei | 17860712133@163.com | |
| 越来越好 | better | | |
-------------------------------------------------------------------------------- /picture/loess_fit/loess_fit.R: --------------------------------------------------------------------------------
1 | # load the required packages
2 | library(ggplot2)
3 | 
4 | args <- commandArgs(T)
5 | 
6 | # input and output paths
7 | input_file_path <- args[1]
8 | output_file_path <- args[2]
9 | 
10 | # read the data file
11 | data <- read.table(input_file_path, header = FALSE, col.names = c("Dist", "Mean_r"))
12 | 
13 | # locally weighted polynomial regression (LOESS)
14 | loess_fit <- loess(Mean_r ~ Dist, data = data, span = 0.75)
15 | 
16 | # fitted values
17 | data$Loess_Fit <- predict(loess_fit, newdata = data$Dist)
18 | 
19 | # save the LOESS fit to a CSV file
20 | write.csv(data, output_file_path, row.names = FALSE)
21 | 
22 | # draw the plot
23 | plot <- ggplot(data, aes(x = Dist, y = Mean_r)) +
24 | geom_point(size = 1) + # raw data points
25 | geom_line(aes(y = Loess_Fit), color = 'blue', linewidth = 1) + # LOESS smooth
26 | labs(title = "LD Decay with LOESS",
27 | x = "Distance",
28 | y = expression(Mean~r)) +
29 | theme_minimal()
30 | 
31 | # save the plot as a PNG file
32 | ggsave(args[3], plot = plot, width = 10, height = 6)
33 | 
-------------------------------------------------------------------------------- /genome/Mit_genome/check_species_by_mit_pep.sh: --------------------------------------------------------------------------------
1 | input_mit_pep=test.pep
2 | cpu=48
3 | evalue=0.0001
4 | matrix=BLOSUM62
5 | query_cover=30
6 | subject_cover=30
7 | database=mitochondrion
8 | 
9 | ## https://ftp.ncbi.nlm.nih.gov/refseq/release/mitochondrion/
10 | ## https://ftp.ncbi.nlm.nih.gov/pub/taxonomy/
11 | ## https://ftp.ncbi.nlm.nih.gov/pub/taxonomy/accession2taxid/
12 | 
13 | ### index
14 | # diamond makedb --in mitochondrion.1.protein.faa -d mitochondrion --taxonnodes nodes.dmp --taxonnames names.dmp --taxonmap prot.accession2taxid.gz
15 | 
16 | diamond blastp -d ${database} -q ${input_mit_pep} -o ${input_mit_pep}.guess_sp.txt -p ${cpu} --ultra-sensitive --evalue ${evalue} --quiet --matrix ${matrix} --masking 1 --comp-based-stats 1 --max-hsps 0 --query-cover ${query_cover} --subject-cover ${subject_cover} --outfmt 6 qseqid sseqid evalue pident staxids sscinames sphylums
17 | 
18 | sort -k 1,1 -k 4nr,4 ${input_mit_pep}.guess_sp.txt | awk '!a[$1]++{print $0}' > ${input_mit_pep}.guess_sp2.txt
19 | 
-------------------------------------------------------------------------------- /Comparative_genomics/gene_family/02_hmm.sh: --------------------------------------------------------------------------------
1 | hmmsearch --domtblout ./1_identify_gene_family/02_hmm/TIR.hmm.out ./00_data/TIR.hmm ./00_data/pep.fa
2 | grep -v '#' ./1_identify_gene_family/02_hmm/TIR.hmm.out | awk '($7 + 0) < 1e-05'|cut -f1 -d ' ' |sort -u > ./1_identify_gene_family/02_hmm/TIR.hmm_gene.id
3 | hmmsearch --domtblout ./1_identify_gene_family/02_hmm/LRR.hmm.out ./00_data/LRR.hmm ./00_data/pep.fa
4 | grep -v '#' ./1_identify_gene_family/02_hmm/LRR.hmm.out | awk '($7 + 0) < 1e-05'|cut -f1 -d ' ' |sort -u > ./1_identify_gene_family/02_hmm/LRR.hmm_gene.id
5 | find ./1_identify_gene_family/02_hmm -name '*_gene.id' -exec cat {} \; |sort |uniq -c |awk '$1 == 2 {print $2}' > ./1_identify_gene_family/02_hmm/hmm.id
6 | comm -12 ./1_identify_gene_family/02_hmm/hmm.id ./1_identify_gene_family/01_blastp/blast.filt.id > ./1_identify_gene_family/result/01_target_gene.id
7 | seqkit grep -f ./1_identify_gene_family/result/01_target_gene.id ./00_data/pep.fa > ./1_identify_gene_family/result/02_target_gene.pep
8 | 
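## Hedged sanity-check sketch: comm -12 above needs both id lists lexicographically
## sorted (they are, via sort -u); counting the lists shows how many candidates pass
## each filter and how many survive the final intersection.
# wc -l ./1_identify_gene_family/02_hmm/hmm.id ./1_identify_gene_family/01_blastp/blast.filt.id ./1_identify_gene_family/result/01_target_gene.id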
-------------------------------------------------------------------------------- /genome/Anno_EGAPx/egapx.03.1.sh: --------------------------------------------------------------------------------
1 | #!/bin/bash
2 | ## source a python env with the PyYAML module installed
3 | source /01_soft/egapx/egapx/bin/activate
4 | 
5 | ## You can use singularity to pull the image from https://hub.docker.com/r/ncbi/egapx/tags, then put the ncbi-egapx-0.2-alpha.img in NXF_SINGULARITY_CACHEDIR
6 | export NXF_SINGULARITY_CACHEDIR=/01_soft/egapx-0.3.1-alpha/NXF_SINGULARITY_CACHEDIR
7 | export JAVA_HOME=/01_software/jdk-11.0.1
8 | export TMPDIR=$PWD
9 | 
10 | file_path=local2.yaml
11 | outdir=Output
12 | main_script=/01_soft/egapx-0.3.1-alpha/ui/egapx.py
13 | workdir=$PWD/workdir
14 | 
15 | [ -d egapx_config ] || mkdir -p egapx_config && cp /01_soft/egapx/egapx_config/singularity.config egapx_config/singularity.config
16 | 
17 | ### the cache file you can download from https://ftp.ncbi.nlm.nih.gov/genomes/TOOLS/EGAP/support_data/
18 | python3 ${main_script} ${file_path} -e singularity -w ${workdir} -o ${outdir} -lc /01_soft/egapx-0.3.0-alpha/support_data
19 | 
20 | ## rm tmp
21 | rm -rf ${workdir} .nextflow
22 | 
-------------------------------------------------------------------------------- /picture/syri_plotsv/base.demo.config: --------------------------------------------------------------------------------
1 | /* -------------------------------------------------
2 |  * Nextflow config file
3 |  * -------------------------------------------------
4 |  */
5 | 
6 | process {
7 |     errorStrategy = { ( task.exitStatus == 143 || task.exitStatus == 137 ) ? 'retry' : 'finish' }
8 |     maxRetries = 3
9 |     maxErrors = '-1'
10 | 
11 |     withName: 'ALIGN.*' {
12 |         cpus = {ABCD * task.attempt }
13 |         memory = { EFGH.GB * task.attempt }
14 |         time = { 24.h * task.attempt }
15 |     }
16 |     withName: 'FIXCHR.*|SYRI.*|PLOTSR.*' {
17 |         cpus = {1 * task.attempt }
18 |         memory = { 4.GB * task.attempt }
19 |         time = { 24.h * task.attempt }
20 |     }
21 |     withName: 'SEQTK.*' {
22 |         cpus = {1 * task.attempt }
23 |         memory = { 2.GB * task.attempt }
24 |         time = { 1.h * task.attempt }
25 |     }
26 |     /*
27 |     withName: SEQKIT_GET_LENGTH {
28 |         cpus = {1 * task.attempt }
29 |         memory = { 1.GB * task.attempt }
30 |         time = { 30.min * task.attempt }
31 | 
32 |     }
33 |     */
34 | }
35 | 
-------------------------------------------------------------------------------- /genome/Anno_RNA/minimap2/step3.sh: --------------------------------------------------------------------------------
1 | export PATH=/01_software/TransDecoder-TransDecoder-v5.5.0/util/:$PATH
2 | export PERL5LIB=/01_software/TransDecoder-TransDecoder-v5.5.0/PerlLib:$PERL5LIB
3 | 
4 | ###
5 | cat pfam.qsub/pfam.1.domtblout > pfam.domtblout
6 | for i in `seq 2 200`
7 | do
8 | less pfam.qsub/pfam.${i}.domtblout | grep -v '^#' >> pfam.domtblout
9 | done
10 | cat blast.qsub/*outfmt6 | awk '$3>60' > blastp.outfmt6
11 | 
12 | ###
13 | trans=transcripts.fasta
14 | gtf=transcripts.gtf ## assumed name: the GTF matching ${trans} from the alignment/assembly step; adjust to your file
15 | /01_software/TransDecoder-TransDecoder-v5.5.0/TransDecoder.Predict -t ${trans} --retain_pfam_hits pfam.domtblout --retain_blastp_hits blastp.outfmt6
16 | 
17 | gtf_to_alignment_gff3.pl ${gtf} > ${gtf}.gff3
18 | cdna_alignment_orf_to_genome_orf.pl ${trans}.transdecoder.gff3 ${gtf}.gff3 ${trans} > ${trans}.transdecoder.genome.gff3
19 | awk '$3=="CDS" || $3=="mRNA" {print $0";"} ' ${trans}.transdecoder.genome.gff3 > ${trans}.transdecoder.genome.gff3.tmp
20 | Covert_for_evm.pl ${trans}.transdecoder.genome.gff3.tmp TransDecoder | awk '!a[$1"\t"$4"\t"$5]++{print $0}' > ${trans}.transdecoder.genome.gff3.forevm.gff3
21 | 
-------------------------------------------------------------------------------- /genome/assess/busco-5.5.sh: --------------------------------------------------------------------------------
1 | #!/bin/bash
2 | 
3 | #### attention ####
4 | 
5 | # The lineage database must be odb10 (stored under ~/06_database/)
6 | # augustus_species: ~/01_software/minimamba/envs/augustus/config/species
7 | # species list: ~/06_database/specie.txt
8 | 
9 | #### attention ####
10 | 
11 | ### version 4.1.2
12 | #source activate ~/conda/envs/busco-4/
13 | ### version 5.3
14 | #source activate ~/miniconda3/envs/busco5
15 | ### version 5.5.0
16 | source activate ~/home_micromamba/envs/busco5.5.0
17 | 
18 | input=pep.fa
19 | cpu=5
20 | model=prot ##trans prot geno
21 | output=BUSCO
22 | evalue=1e-03
23 | species_model=zebrafish
24 | db=actinopterygii_odb10
25 | database=~/06_database
26 | 
27 | busco --offline -l ${database}/${db} -e ${evalue} -m ${model} -c ${cpu} -i ${input} -o ${output} --augustus_species ${species_model}
28 | 
29 | rm -rf ${output}/logs/ ${output}/run_${db}/busco_sequences/ ${output}/run_${db}/hmmer_output/ ${output}/short_summary.specific.${db}.BUSCO.json ${output}/run_${db}/short_summary.json busco_downloads
30 | mv ${output}/run_${db}/* ${output}/
31 | rm -rf ${output}/run_${db}
32 | 
-------------------------------------------------------------------------------- /genome/Anno_RNA/GMAP/map.sh: --------------------------------------------------------------------------------
1 | 
2 | reference=$PWD/reference
3 | species=HCS
4 | transcript=all.rename.cdhit99.trans.fa
5 | cpu=10
6 | min_identity=0.7
7 | max_intronlength_middle=20000
8 | total_intron_length=100000
9 | output_format=gff3_gene
10 | output_name=GMAP
11 | 
12 | ### To handle the "program received signal SIGSEGV" error in isolation, split the transcript fasta
13 | 
14 | fastaDeal.pl --cuts 100 ${transcript}
15 | ls ${transcript}.cut > id
16 | for i in $(cat id)
17 | do
18 | echo "/gmap-2021-08-25/bin/gmap -D $reference -d ${species}_reference --min-identity ${min_identity} --canonical-mode 2 --max-intronlength-middle ${max_intronlength_middle} --totallength ${total_intron_length} -t $cpu --input-buffer-size=20 --output-buffer-size=20 --allow-close-indels=2 --tolerant --truncate --split-large-introns --suboptimal-score=0.9 -f $output_format $PWD/${transcript}.cut/${i} > $PWD/${transcript}.cut/${output_name}.${i}.gff 2> $PWD/${transcript}.cut/${output_name}.${i}.log" >> all.run.sh
19 | done
20 | 
21 | ### Do not deliver tasks in parallel
22 | qsub -cwd -l vf=10G,p=10 -binding linear:10 -q XXX -P XXX all.run.sh
23 | 
-------------------------------------------------------------------------------- /Comparative_genomics/kaks/collinearity_kaks.sh: --------------------------------------------------------------------------------
1 | ### https://github.com/reubwn/collinearity/tree/v1.0
2 | export PATH="/01_soft/MCScanX/:$PATH"
3 | export PATH="/01_soft/collinearity:$PATH"
4 | 
5 | pep=2.gff.pep
6 | cds=2.gff.cds
7 | gff=2.gff
8 | cpu=30
9 | 
10 | ##
11 | diamond makedb --in ${pep} -d ${pep}
12 | diamond blastp -e 1e-2 -p 8 -q ${pep} -d ${pep} -a ${pep}.vs.self
13 | diamond view -a ${pep}.vs.self.daa -o Xyz.blast
14 | awk '$3=="mRNA"' ${gff} | awk '{print $1"\t"$9"\t"$4"\t"$5}' | sed 's/ID=//g;s/;//g' > Xyz.gff
15 | 
16 | ##
17 | [ -d result ] || mkdir result
18 | cp Xyz* result
19 | MCScanX result/Xyz
20 | duplicate_gene_classifier result/Xyz
21 | 
22 | ##
23 | add_kaks_to_MCScanX.pl -i result/Xyz.collinearity -p ${pep} -c ${cds} -t ${cpu}
24 | calculate_collinearity_metric.pl -i result/Xyz.collinearity -g Xyz.gff -k result/Xyz.collinearity.kaks
25 | calculate_collinearity_breakpoints.pl -i result/Xyz.collinearity -g Xyz.gff -s result/Xyz.collinearity.score -k result/Xyz.collinearity.kaks -b
26 | calculate_collinearity_palindromes.pl -i result/Xyz.collinearity -g Xyz.gff -k result/Xyz.collinearity.kaks
27 | 
-------------------------------------------------------------------------------- /picture/synteny_circos/simpletolink.py: --------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 | # simple2links
3 | 
4 | from sys import argv
5 | 
6 | simple_file = argv[1]
7 | 
8 | ref_bed = simple_file.split(".")[0] + ".bed"
9 | qry_bed = simple_file.split(".")[1] + ".bed"
10 | 
11 | ref_dict = {line.split("\t")[3]:line.split("\t")[0:3] for line in open(ref_bed)}
12 | qry_dict = {line.split("\t")[3]:line.split("\t")[0:3] for line in open(qry_bed)}
13 | 
14 | fo = open(simple_file + "_link.txt", "w")
15 | 
16 | for line in open(simple_file):
17 |     if line.startswith("#"):
18 |         continue
19 |     items = line.strip().split("\t")
20 |     ref_start_gene = items[0]
21 |     ref_end_gene = items[1]
22 |     qry_start_gene = items[2]
23 |     qry_end_gene = items[3]
24 | 
25 |     ref_chr, ref_start = ref_dict[ref_start_gene][0:2]
26 |     ref_end = ref_dict[ref_end_gene][2]
27 |     qry_chr, qry_start = qry_dict[qry_start_gene][0:2]
28 |     qry_end = qry_dict[qry_end_gene][2]
29 | 
30 |     circos_input = [ref_chr, ref_start, ref_end, qry_chr, qry_start, qry_end]
31 |     fo.writelines('\t'.join(circos_input) + '\n')
32 | 
33 | fo.close()
34 | 
-------------------------------------------------------------------------------- /genome/Anno_homology/miniprot/miniprot.sh: --------------------------------------------------------------------------------
1 | genome=EFGH.fa
2 | pep=pep.fa
3 | prefix=EFGH_ ### prefix for IDs in GFF3
4 | cpu=20
5 | max_intron_size=10k ### max intron size [200k]
6 | splice_model=1 ### splice model: 2=mammal, 1=general, 0=none (see Detail) [1]
7 | weight_of_splice_penalty=1 ### weight of splice penalty; 0 to ignore splice signals [1]
8 | 
9 | miniprot -G ${max_intron_size} -j ${splice_model} -t ${cpu} --gff -P ${prefix} -C ${weight_of_splice_penalty} ${genome} ${pep} --outs=0.99 > EFGH.gff
10 | 
11 | grep -A 1 "##PAF" EFGH.gff | awk '$1!~/--/' | paste - - | awk '($5-$4)/$3>0.9' | awk '{print $2"\t"$3"\t"$4"\t"$5"\t"$(NF-10)"\t"$(NF-6)"\t"$(NF-5)"\t"$(NF-4)"\t"$(NF-2)}' | sed 's/;/\t/g;s/ID=//g' > filter.info
12 | cat EFGH.gff | grep -v -e "^#" -e stop_codon | gffread -C -G -K -Q -Y -M --cset -d dup.log -H -V -P -N -Z - -g ${genome} -o EFGH.gff.gffread
13 | fishInWinter.pl -bf table -ff gff - EFGH.gff.gffread | awk '{print $0";"}' | sed "s/miniprot/miniprot_EFGH/1" > EFGH.gff.gffread.gff
14 | Covert_for_evm.pl EFGH.gff.gffread.gff miniprot_EFGH > EFGH.gff.gffread.gff.forevm.gff3
15 | 
-------------------------------------------------------------------------------- /genome/Anno_EGAPx/fix_mRNA_coordinate.pl: --------------------------------------------------------------------------------
1 | #!/usr/bin/perl
2 | 
3 | ### By Sunshai (sunhai@genomic.cn)
4 | 
5 | use strict;
6 | use warnings;
7 | 
8 | my ($gff, $out) = @ARGV;
9 | 
10 | open FL, $gff;
11 | my %end;
12 | while (<FL>) {
13 |     chomp;
14 |     my @tmp = split;
15 | 
16 |     if ($tmp[2] eq 'CDS') {
17 |         my $id = $1 if ($tmp[8] =~ /Parent=([^;\s]+)/);
18 |         #print "$id\n";
19 |         if (!exists $end{$id}{'start'} or $tmp[3] <= $end{$id}{'start'}) {
20 |             $end{$id}{'start'} = $tmp[3];
21 |         }
22 |         if (!exists $end{$id}{'end'} or $tmp[4] >= $end{$id}{'end'}) {
23 |             $end{$id}{'end'} = $tmp[4];
24 |         };
25 |     };
26 | };
27 | close FL;
28 | 
29 | open FL, $gff;
30 | open FLS, ">$out";
31 | while (<FL>) {
32 |     chomp;
33 |     my @tmp = split;
34 |     if ($tmp[2] eq 'mRNA') {
35 |         my $id = $1 if ($tmp[8] =~ /ID=([^;\s]+)/);
36 |         next if (!exists $end{$id}{'start'});
37 |         $tmp[3] = $end{$id}{'start'};
38 |         $tmp[4] = $end{$id}{'end'};
39 |     };
40 |     print FLS join("\t", @tmp), "\n";
41 | };
42 | close FLS;
43 | close FL;
44 | 
-------------------------------------------------------------------------------- /genome/Anno_integrate/evm_auto.sh: --------------------------------------------------------------------------------
1 | #!/bin/sh
2 | 
3 | species=S_japo
4 | genome=genome.fa
5 | cpu=48
6 | weight=weights.txt
7 | denovo=denovo.gff
8 | pep=pep.gff
9 | RNA=RNA.gff
10 | repeats=all.repeat.gff
11 | segmentSize=1000000
12 | overlapSize=200000
13 | min_intron_length=20
14 | 
15 | export PERL5LIB=/01_software/TransDecoder-TransDecoder-v5.5.0/PerlLib:$PERL5LIB
16 | 
17 | cat ${pep} ${RNA} ${denovo} | grep -v "^#" | cut -f 1 | sed '/^\s*$/d' | awk '!a[$1]++' | seqtk subseq -l 100 ${genome} - > ${genome}.filter.fa
18 | 
19 | /01_software/EVidenceModeler-v2.1.0/EVidenceModeler \
20 |     --sample_id ${species} \
21 |     --genome ${genome}.filter.fa \
22 |     --weights ${weight} \
23 |     --gene_predictions ${denovo} \
24 |     --protein_alignments ${pep} \
25 |     --transcript_alignments ${RNA} \
26 |     --segmentSize ${segmentSize} \
27 |     --overlapSize ${overlapSize} \
28 |     --CPU ${cpu} \
29 |     --repeats ${repeats} \
30 |     --min_intron_length ${min_intron_length}
31 | 
-------------------------------------------------------------------------------- /deal_gff/pick_longest_gene/fix_mRNA_coordinate.pl: --------------------------------------------------------------------------------
1 | #!/usr/bin/perl
2 | 
3 | ### By Sunshai (sunhai@genomic.cn)
4 | 
5 | use strict;
6 | use warnings;
7 | 
8 | my ($gff, $out) = @ARGV;
9 | 
10 | open FL, $gff;
11 | my %end;
12 | while (<FL>) {
13 |     chomp;
14 |     my @tmp = split;
15 | 
16 |     if ($tmp[2] eq 'CDS') {
17 |         my $id = $1 if ($tmp[8] =~ /Parent=([^;\s]+)/);
18 |         #print "$id\n";
19 |         if (!exists $end{$id}{'start'} or $tmp[3] <= $end{$id}{'start'}) {
20 |             $end{$id}{'start'} = $tmp[3];
21 |         }
22 |         if (!exists $end{$id}{'end'} or $tmp[4] >= $end{$id}{'end'}) {
23 |             $end{$id}{'end'} = $tmp[4];
24 |         };
25 |     };
26 | };
27 | close FL;
28 | 
29 | open FL, $gff;
30 | open FLS, ">$out";
31 | while (<FL>) {
32 |     chomp;
33 |     my @tmp = split;
34 |     if ($tmp[2] eq 'mRNA') {
35 |         my $id = $1 if ($tmp[8] =~ /ID=([^;\s]+)/);
36 |         next if (!exists $end{$id}{'start'});
37 |         $tmp[3] = $end{$id}{'start'};
38 |         $tmp[4] = $end{$id}{'end'};
39 |     };
40 |     print FLS join("\t", @tmp), "\n";
41 | };
42 | close FLS;
43 | close FL;
44 | 
-------------------------------------------------------------------------------- /genome/Anno_RNA/minimap2/step2.sh: --------------------------------------------------------------------------------
1 | trans=transcripts.fasta
2 | cpu=2
3 | ### /06_database/SwissProt/uniprot_sprot
4 | blastp_db=/01_genome/Jipidongwu_db/jipidongwu
5 | pfam_db=/06_database/Pfam/Pfam-A.hmm
6 | blastp_soft=/01_soft/ncbi-blast-2.15.0+/bin/blastp
7 | diamond_soft=/00_tools/diamond
8 | evalue=1e-3
9 | hmmscan_soft=/01_soft/mambaforge/bin/hmmscan
10 | 
11 | ln -s ${trans}.transdecoder_dir/longest_orfs.pep .
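## longest_orfs.pep is produced by TransDecoder.LongOrfs on ${trans} in the preceding
## step (not shown here); it is split into 200 chunks below so the Pfam and blastp
## searches can run as parallel jobs.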
12 | fastaDeal.pl --cutf 200 longest_orfs.pep
13 | 
14 | mkdir blast.qsub pfam.qsub
15 | for i in `seq 1 200`
16 | do
17 | #echo "${blastp_soft} -query ../longest_orfs.pep.cut/longest_orfs.pep.${i} -db ${blastp_db} -max_target_seqs 1 -outfmt 6 -evalue ${evalue} -num_threads ${cpu} > blastp.${i}.outfmt6 ; echo done " > blast.qsub/blast.${i}.sh
18 | echo "diamond blastp --evalue ${evalue} --outfmt 6 -d ${blastp_db} -q ../longest_orfs.pep.cut/longest_orfs.pep.${i} -o blastp.${i}.outfmt6 --threads $cpu --max-target-seqs 5 --more-sensitive -b 0.5 " > blast.qsub/blast.${i}.sh
19 | echo "${hmmscan_soft} --cpu ${cpu} --domtblout pfam.${i}.domtblout ${pfam_db} ../longest_orfs.pep.cut/longest_orfs.pep.${i} ; echo done " > pfam.qsub/pfam.${i}.sh
20 | done
21 | 
-------------------------------------------------------------------------------- /Comparative_genomics/gene_family/04_final_gene_family.sh: --------------------------------------------------------------------------------
1 | cp ./00_data/final_gene_id ./1_identify_gene_family/04_final_gene_family/final_gene_id
2 | seqkit grep -f ./1_identify_gene_family/04_final_gene_family/final_gene_id ./00_data/pep.fa > ./1_identify_gene_family/04_final_gene_family/final_gene_protein
3 | seqkit grep -f ./1_identify_gene_family/04_final_gene_family/final_gene_id ./00_data/cds.fa > ./1_identify_gene_family/04_final_gene_family/final_gene_cds
4 | seqkit fx2tab --length --name ./1_identify_gene_family/04_final_gene_family/final_gene_cds | awk '{print $1, $2}'> ./1_identify_gene_family/04_final_gene_family/final_gene_cds_length
5 | grep 'CDS' ./00_data/C_albu.bgi.gff| cut -f9 | cut -d ';' -f1 | cut -d '=' -f2 | sort | uniq -c|awk '{print $2, $1}' > ./1_identify_gene_family/04_final_gene_family/gene_cds_number
6 | grep -f ./1_identify_gene_family/04_final_gene_family/final_gene_id ./1_identify_gene_family/04_final_gene_family/gene_cds_number |sort >./1_identify_gene_family/04_final_gene_family/final_gene_cds_number
7 | pfam_scan.pl -fasta ./1_identify_gene_family/04_final_gene_family/final_gene_protein -dir ./Pfam/ > ./1_identify_gene_family/04_final_gene_family/final_gene_domain
8 | 
-------------------------------------------------------------------------------- /other/outlier2.py: --------------------------------------------------------------------------------
1 | import sys
2 | import numpy as np
3 | import pandas as pd
4 | 
5 | def remove_outliers_iqr(data_series, m=1):
6 |     q1 = data_series.quantile(0.25)
7 |     q3 = data_series.quantile(0.75)
8 |     iqr = q3 - q1
9 |     lower_bound = q1 - m * iqr
10 |     upper_bound = q3 + m * iqr
11 |     return data_series[(data_series >= lower_bound) & (data_series <= upper_bound)]
12 | 
13 | def main(input_file, output_file):
14 |     # read the data file
15 |     data = pd.read_csv(input_file, sep='\t', header=None)
16 | 
17 |     # column 1 holds sample names, column 2 the values
18 |     sample_names = data[0]
19 |     values = data[1]
20 | 
21 |     # drop outliers
22 |     cleaned_values = remove_outliers_iqr(values)
23 | 
24 |     # collect the results in a new DataFrame
25 |     cleaned_data = pd.DataFrame({'Sample': sample_names, 'Value': cleaned_values})
26 | 
27 |     # write the results to a new file
28 |     cleaned_data.to_csv(output_file, sep='\t', index=False)
29 | 
30 | if __name__ == "__main__":
31 |     if len(sys.argv) != 3:
32 |         print("Usage: python script.py <input_file> <output_file>")
33 |         sys.exit(1)
34 | 
35 |     input_file_path = sys.argv[1]
36 |     output_file_path = sys.argv[2]
37 | 
38 |     main(input_file_path, output_file_path)
39 | 
-------------------------------------------------------------------------------- /picture/box/box.R: 
--------------------------------------------------------------------------------
1 | library(ggplot2)
2 | library(readr)
3 | 
4 | args <- commandArgs (T)
5 | 
6 | file_path <- args[1]
7 | x_column <- args[2]
8 | y_column <- args[3]
9 | 
10 | # read the data, using the first row as the header
11 | data <- read_delim(file_path, col_names = TRUE)
12 | 
13 | # check that both requested columns exist
14 | if (!(x_column %in% colnames(data)) || !(y_column %in% colnames(data))) {
15 | stop("One or both of the specified columns do not exist in the data.")
16 | }
17 | 
18 | # select the two columns to analyse
19 | selected_data <- data[, c(x_column, y_column)]
20 | 
21 | # inspect the selected data
22 | #head(selected_data)
23 | 
24 | # define a gradient color scheme
25 | color_scheme <- colorRampPalette(c("blue", "red"))(100) # generate 100 gradient colors
26 | 
27 | pdf(args[4], width = 15, height = 6)
28 | 
29 | # draw a combined box/violin plot with ggplot2
30 | p <- ggplot(selected_data, aes_string(x = x_column, y = y_column)) +
31 | # geom_violin(trim = FALSE, fill = color_scheme[1]) + # violin layer, gradient color
32 | stat_boxplot(geom = "errorbar",width=0.3) +
33 | geom_boxplot(width = 0.5, fill = color_scheme[2], outlier.fill = "grey", outlier.shape = 21) + # box layer, gradient color
34 | labs(x = x_column, y = y_column) +
35 | theme_classic() + # clean theme
36 | # theme_minimal() +
37 | theme(axis.text.x = element_text(angle = 45, hjust = 1)) # rotate x-axis labels for readability
38 | p
39 | 
-------------------------------------------------------------------------------- /genome/Hic/all_hic.sh: --------------------------------------------------------------------------------
1 | ALLHiC is designed mainly for Hi-C scaffolding of polyploid and highly heterozygous genomes; the steps below use ALLHiC to scaffold a simple diploid genome (notes for complex genomes will be added later).
2 | 1. Download ALLHiC
3 | -------------------------------------------------
4 | $ git clone https://github.com/tangerzhang/ALLHiC
5 | $ cd ALLHiC
6 | $ chmod +x bin/*
7 | $ chmod +x scripts/*
8 | -------------------------------------------------
9 | 2. Run ALLHiC_pip.sh from ALLHiC; it depends on samtools and bwa (the dependency paths can be edited inside ALLHiC_pip.sh)
10 | Note: your wrapper script must export PATH=/path/ALLHiC/scripts/:/path/ALLHiC/bin:$PATH
11 | -------------------------------------------------
12 | Usage: ALLHiC_pip.sh -r reference -1 R1.fq -2 R2.fq -k group_count [-e enzyme] [-t threads] [-b bin_size]
13 | -r: reference genome
14 | -1: Lib_R1.fq.gz
15 | -2: Lib_R2.fq.gz
16 | -k: group_count
17 | -e: enzyme_sites (HindIII: AAGCTT; MboI: GATC), default: HindIII
18 | -t: threads, default: 10
19 | -b: bin_size for hic heatmap, can be divided with comma, default: 500k
20 | -------------------------------------------------
21 | If the wrapper script fails, run the steps one by one.
22 | A common cause of failure is an extra @HD header line in sorted.bam; fix it with the command below, then continue with the remaining steps:
23 | samtools view -h sorted.bam | sed -e '/@HD\tVN:1.5\tSO:unsorted\tGO:query/d' | samtools view -b -o deal_sorted.bam
24 | 
-------------------------------------------------------------------------------- /transcriptome/Enrich/make_Orgdb.sh: --------------------------------------------------------------------------------
1 | 
2 | genus=Hypophthalmichthys
3 | species=nobilis
4 | taxid=7965
5 | name=lishuo
6 | gene2path=gene2pathway.txt
7 | 
8 | ## EggNog annotation file and gene2pathway.txt
9 | sed '1,4d' eggnog-result.emapper.annotations | sed 's/^#//g' | grep -v "#" | csvtk replace -t -F -f "GOs" -p "(-)" -r "NA" | csvtk replace -t -F -f "KEGG_ko" -p "(-)" -r "NA" | csvtk replace -t -F -f "KEGG_Pathway" -p "(-)" -r "NA" | csvtk replace -t -F -f "eggNOG_OGs" -p "(-)" -r "NA" > eggNog.anno.txt
10 | 
11 | /micromamba/envs/R_env/bin/Rscript AnnotationForge_20250117.R -i eggNog.anno.txt -a ${name} -m shiyeyishang@outlook.com -g ${genus} -s ${species} -d ${taxid}
12 | DB=$(ls | grep eg.db)
13 | sed -i 's/shiyeyishang\@outlook.com/lishuo/g' ${DB}/DESCRIPTION
14 | /dellfsqd2/ST_OCEAN/USER/lishuo11/09_test/zz_tmp/home_micromamba/envs/R_env/bin/R CMD build $DB
15 | 
16 | [ -d R_lib ] || mkdir R_lib
17 | DBgz=$(ls | grep eg.db | grep "tar.gz")
18 | /micromamba/envs/R_env/bin/R CMD INSTALL ${DBgz} --library=$PWD/R_lib
19 | 
20 | ## deal gene2pathway.txt
21 | sed '1d' ${gene2path} | csvtk -t add-header -n GID,Pathway,Name > gene2pathway_forClusterProfiler.txt
22 | 
23 | rm -rf ${DB} ${DBgz} eggNog.anno.txt
24 | 
-------------------------------------------------------------------------------- /Comparative_genomics/deal_tree_nwk/draw.r: --------------------------------------------------------------------------------
1 | #!/R-4.2/bin/Rscript
2 | 
3 | ### https://cn.bio-protocol.org/bio101/e1010674
4 | ### https://cran.r-project.org/web/packages/ape/ape.pdf
5 | ### choose libPaths ###
6 | .libPaths("/R-4.2/lib/R/library/")
7 | 
8 | args <- commandArgs (T)
9 | library(ape)
10 | 
11 | tree <- read.tree(args[1])
12 | pdf(paste(args[2], ".pdf", sep = "", collapse = ""), height = 15 )
13 | 
14 | par(mfrow = c(4, 2))
15 | plot(tree, type = "p", main = "phylogram with branch lengths", sub = "A", use.edge.length = TRUE, tip.color = rainbow(5))
16 | plot(tree, type = "p", main = "phylogram without branch lengths", sub = "B", use.edge.length = FALSE, edge.width = 1:27/2)
17 | plot(tree, type = "c", main = "cladogram", sub = "C", edge.color = rainbow(27))
18 | plot(tree, type = "f", main = "fan", sub = "D", font = 3)
19 | plot(tree, type = "u", main = "unrooted", sub = "E")
20 | plot(tree, type = "r", main = "radial", sub = "F")
21 | 
22 | plot(tree, type = "p", main = "phylogram with branch lengths", sub = "G", use.edge.length = TRUE, edge.width = 2)
23 | nodelabels(bg = "lightgray", frame = "c")
24 | 
25 | plot(tree, type = "p", main = "phylogram with branch lengths", sub = "G", use.edge.length = TRUE, edge.width = 2)
26 | edgelabels()
27 | 
-------------------------------------------------------------------------------- /Comparative_genomics/blast/reciprocal_best_hits.sh: --------------------------------------------------------------------------------
1 | ### https://github.com/glarue/reciprologs
2 | ### https://rdrr.io/github/drostlab/homologr/man/diamond_reciprocal_best_hits.html
3 | 
4 | ### !!!!!!!!!!
5 | ### Attention: cp pep.fa into the workdir !!!!!!! Do NOT symlink it !!!!!!!! Otherwise the blast db will be generated in the directory of the raw pep.fa !!!!!
6 | ### !!!!!!!!!!
7 | 8 | ### mmseqs 1:1 or 1:many 9 | mmseqs easy-rbh cse.pep dre.pep mmseqs.rbh.txt tmp --threads 5 10 | cut -f 1,2 mmseqs.rbh.txt | awk 'NR==FNR{a[$1]=$0}NR!=FNR{print $0"\t"a[$1]}' cse.gene - | csvtk -t cut -f 2,1,5 - | guanlian2 dre.gene - | csvtk -t cut -f 2,1,3,6- > mmseqs.rbh.txt.diff 11 | 12 | ### blast_rbh.py 13 | blast_rbh.py --threads=5 -c 30 -i 50 -a prot -t blastp -o blast_rbh.tsv cse.pep dre.pep 14 | 15 | ### reciprologs && diamond && networkx 16 | reciprologs -p 5 --chain -q 30 -o diamond.rbh.txt --one_to_one --logging cse.pep dre.pep diamondp 17 | #reciprologs -p 10 --chain -q 30 -o diamond_more.rbh.txt cse.pep dre.pep diamondp 18 | 19 | ### reciprologs && blastp && networkx 20 | reciprologs -p 10 --chain -q 30 -o BLASTP.rbh.txt --one_to_one --logging cse.pep dre.pep blastp 21 | 22 | ### diamond && evalue 23 | diamond_rbh.R -a cse.fa -b dre.fa -c 10 -e 1E-3 -m 5 -M ultra-sensitive -o diamond.R.rbh.csv 24 | -------------------------------------------------------------------------------- /Comparative_genomics/gene_family/03_miniprot.sh: -------------------------------------------------------------------------------- 1 | blastp -query ./00_data/TLR_protein.fasta -subject ./1_identify_gene_family/result/02_target_gene.pep -evalue 1e-05 -seg yes -outfmt "6 qseqid"| sort -u > ./1_identify_gene_family/03_miniprot/new_gene_protein_id 2 | seqkit grep -f ./1_identify_gene_family/03_miniprot/new_gene_protein_id ./00_data/TLR_protein.fasta > ./1_identify_gene_family/03_miniprot/new_gene_protein_fasta 3 | cat ./1_identify_gene_family/result/02_target_gene.pep >> ./1_identify_gene_family/03_miniprot/new_gene_protein_fasta 4 | miniprot --gff -I ./00_data/C_albu.fa ./1_identify_gene_family/03_miniprot/new_gene_protein_fasta > ./1_identify_gene_family/03_miniprot/miniprot.gff 5 | awk '{if($0 ~ /^##/ && $11 != "" && $12 != "" && ($11 + 0)/($12 + 0) > 0.8) {found=1; next} else {if ($0 ~ /^##/) {found = 0}}} /mRNA/ {if (found == 1) print} /CDS/ {if (found == 1) print}' ./1_identify_gene_family/03_miniprot/miniprot.gff > ./1_identify_gene_family/03_miniprot/miniprot.filt.gff 6 | gffread -C -G -K -Q -Y -M -d dup ./1_identify_gene_family/03_miniprot/miniprot.filt.gff > ./1_identify_gene_family/03_miniprot/miniprot.filt.gff2 7 | gffread ./1_identify_gene_family/03_miniprot/miniprot.filt.gff2 -g ./00_data/C_albu.fa -x ./1_identify_gene_family/03_miniprot/pre.cds -------------------------------------------------------------------------------- /genome/pseudogenes/step2.sh: -------------------------------------------------------------------------------- 1 | ## https://github.com/kelkar/Discover_pseudogenes 2 | 3 | protein=use.pep 4 | cds=use.cds 5 | genome=genome.mask.for_anno.fa.cut/ABCD 6 | output_prefix=pse_ABCD 7 | Percent=60 8 | 9 | [ -d TMP ] || mkdir TMP 10 | 11 | ### step1 12 | exonerate --percent ${Percent} --model protein2genome --showquerygff yes --showtargetgff yes -q ${protein} -t ${genome} --ryo "RYO\t%qi\t%ti\t%ql\t%tl\t%qal\t%qab\t%qae\t%tal\t%tab\t%tae\t%et\t%ei\t%es\t%em\t%pi\t%ps\t%g\nTransitionStart\n%V{%Pqs\t%Pts\t%Pqb\t%Pqe\t%Ptb\t%Pte\t%Pn\t%Pl\n}TransitionEnd\nTargetSeq\n%qs\nAligned Sequences\n>Q\n%qas\n>T\n%tas\nCoding Sequences\n>Q\n%qcs\n>T\n%tcs\n" > TMP/${output_prefix}.exonerate.txt 13 | 14 | ### step2 15 | grep -P "Query:|Target:|(Query range:)|(Target range:)|(^[A-Z]\tTAA\t)|(\sframeshift\s)|(^[A-Z]\tTAG\t)|(^[A-Z]\tTGA\t)|(^\*)" TMP/${output_prefix}.exonerate.txt > TMP/${output_prefix}.exonerate.erros.txt 16 | 17 | ### step3 18 | perl 
/exonerate-2.2.0-x86_64/bin/Exonerate_to_evm_gff3.pl TMP/${output_prefix}.exonerate.txt > TMP/${output_prefix}.exonerate.gff 19 | 20 | ### step4 21 | perl /exonerate-2.2.0-x86_64/bin/tabulate_stops_frameshifts.pl ${cds} TMP/${output_prefix}.exonerate.gff TMP/${output_prefix}.exonerate.erros.txt > TMP/${output_prefix}.pseudogenes.txt 22 | -------------------------------------------------------------------------------- /Comparative_genomics/gene_family_cluster/sonicparanoid2.sh: -------------------------------------------------------------------------------- 1 | source activate /home_micromamba/envs/sonicparanoid2/ 2 | 3 | cwd=$PWD 4 | indir=$cwd/data 5 | outdir=$cwd/sonicparanoid2_out 6 | cpu=30 7 | prefix=3spe 8 | MIN_ARCH_MERGING_COV=0.75 ## When merging graph- and arch-based orhtologs consider only new-orthologs with a protein coverage greater or equal than this value. 9 | INFLATION=1.5 10 | MIN_BITSCORE=40 ## Consider only alignments with bitscores above min-bitscore. 11 | ## Increasing this value can be a good idea when comparing very closely related species. 12 | ## Increasing this value will reduce the number of paralogs (and orthologs) generate. 13 | ## higher min-bitscore values reduce the execution time for all-vs-all. Default=40 14 | 15 | sonicparanoid -i $indir -o $outdir -p $prefix -t $cpu -m sensitive -at -ka -ca -op -d --min-arch-merging-cov ${MIN_ARCH_MERGING_COV} -bs ${MIN_BITSCORE} --inflation ${INFLATION} 16 | 17 | mv $outdir/runs/$prefix/* $outdir 18 | mv $outdir/ortholog_groups $outdir/$prefix 19 | 20 | ### remove tmp file !!!!!!!!!!!!!!!!!!!!! 21 | rm -rf $outdir/runs $outdir/orthologs_db $outdir/alignments $outdir/arch_orthology $outdir/merged_tables 22 | -------------------------------------------------------------------------------- /Comparative_genomics/get_gene_infomation/deal.sh: -------------------------------------------------------------------------------- 1 | cat All.RAW.txt | sed "s/\"//g" | grep -v "total_count: 0" | sed 's/description:/\tdescription:/1;s/,gene_id:/\tgene_id:/1;s/,protein_count:/\tprotein_count:/1;s/,symbol:/\tsymbol:/1;s/,tax_id:/\ttax_id:/1;s/,taxname:/\ttaxname:/1;s/,transcript_count:/\t/1' | cut -f 2- | sed 's/,cds:/\tcds:/1' | cut -f 1-6,8- | sed "s/:{accession_version:/:/1;s/\[//g;s/\]//g" | sed "s/:{begin:/:/1" | sed 's/}},/\t/1;s/,end:/___/1' | sed 's/genomic_accession_version:/\tgenomic_accession_version:/1' | sed "s/:{accession_version:/:/1;s/\[//g;s/\]//g" | sed "s/:{begin:/:/1" | sed 's/}},/\t/1;s/,end:/___/1' | sed 's/genomic_accession_version:/\tgenomic_accession_version:/1' | cut -f 1-7,9- | sed "s/:{begin:/:/1;s/,end:/___/1" | sed "s/},sequence_name/\tsequence_name/1" | sed 's/},/\t/1' | sed 's/,query:/\t/1;s/},total_count:/\t/g' | awk -F "\t" '{print $2"\t"$(NF-1)"\t"$4"\t"$1"\t"$3"\t"$5"\t"$6"\t"$7"\t"$8"\t"$9"\t"$10}' | sed 's/description://g;s/symbol://g;s/gene_id://g;s/protein_count://g;s/tax_id://g;s/taxname://g;s/cds://g;s/genomic_accession_version://g;s/sequence_name://g;s/\t\t/\t/g' | sed "s/ /________/g" | csvtk add-header -t -n gene_id,pep_id,symbol,description,Alternative_splicing_count,tax_id,species,cds_infomation,chromosome_information,chromosome_information2 | sed 's/________/ /g' > All.tsv 2 | -------------------------------------------------------------------------------- /genome/pseudogenes/step1.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | set +o posix 3 | set -eo pipefail 4 | 5 | protein=use.pep 6 | cds=use.cds 7 | genome=HCS.fa 8 | 
output_prefix=hcs
9 | cpu=48
10 | 
11 | ## find candidate regions
12 | miniprot -t ${cpu} --gff -P ${output_prefix} ${genome} ${protein} --outs=0.6 > miniprot.gff
13 | 
14 | ## make bed of candidate regions, flanked by 100k
15 | samtools faidx ${genome}
16 | cut -f 1,2 ${genome}.fai | sort -k 1,1 > ${genome}.genome.size
17 | awk '$3=="mRNA"' miniprot.gff | cut -f 1,4,5 | sort -k 1,1 -k 2n,2 | bedtools flank -i - -g ${genome}.genome.size -b 100000 | sort -k 1,1 -k 2n,2 | bedtools merge -i - | sort -k 1,1 -k 2n,2 | bedtools complement -i - -g ${genome}.genome.size | sort -k 1,1 -k 2n,2 > miniprot.flank_100k.complement.bed
18 | 
19 | ## mask genome
20 | bedtools maskfasta -fi ${genome} -fo ${genome}.mask.fa -bed miniprot.flank_100k.complement.bed
21 | 
22 | ## split genome
23 | cut -f 1 miniprot.gff | grep -v "#" | awk '!a[$0]++' | seqtk subseq ${genome}.mask.fa - | seqkit seq -w 60 > ${genome}.mask.for_anno.fa
24 | rm ${genome}.fai ${genome}.mask.fa
25 | fastaDeal.pl --cuts 1 ${genome}.mask.for_anno.fa
26 | ls ${genome}.mask.for_anno.fa.cut/ > chr.id
27 | 
28 | ## remove tmp (the .fai and .mask.fa were already removed above)
29 | rm miniprot.flank_100k.complement.bed ${genome}.genome.size ${genome}.mask.for_anno.fa
30 | pigz --best miniprot.gff
31 | 
-------------------------------------------------------------------------------- /genome/Anno_integrate/annotCompare.config: --------------------------------------------------------------------------------
1 | 
2 | ## templated variables to be replaced exist as <__var_name__>
3 | 
4 | # Pathname of an SQLite database
5 | # If the environment variable DSN_DRIVER=mysql then it is the name of a MySQL database
6 | DATABASE=/workdir/new.db
7 | 
8 | #######################################################
9 | # Parameters to specify to specific scripts in pipeline
10 | # create a key = "script_name" + ":" + "parameter"
11 | # assign a value as done above.
12 | 
13 | 
14 | #script cDNA_annotation_comparer.dbi
15 | cDNA_annotation_comparer.dbi:--MIN_PERCENT_OVERLAP=50
16 | cDNA_annotation_comparer.dbi:--MIN_PERCENT_PROT_CODING=30
17 | cDNA_annotation_comparer.dbi:--MIN_PERID_PROT_COMPARE=60
18 | cDNA_annotation_comparer.dbi:--MIN_PERCENT_LENGTH_FL_COMPARE=60
19 | cDNA_annotation_comparer.dbi:--MIN_PERCENT_LENGTH_NONFL_COMPARE=60
20 | cDNA_annotation_comparer.dbi:--MIN_FL_ORF_SIZE=<__MIN_FL_ORF_SIZE__>
21 | cDNA_annotation_comparer.dbi:--MIN_PERCENT_ALIGN_LENGTH=50
22 | cDNA_annotation_comparer.dbi:--MIN_PERCENT_OVERLAP_GENE_REPLACE=70
23 | cDNA_annotation_comparer.dbi:--STOMP_HIGH_PERCENTAGE_OVERLAPPING_GENE=<__STOMP_HIGH_PERCENTAGE_OVERLAPPING_GENE__>
24 | cDNA_annotation_comparer.dbi:--TRUST_FL_STATUS=<__TRUST_FL_STATUS__>
25 | cDNA_annotation_comparer.dbi:--MAX_UTR_EXONS=<__MAX_UTR_EXONS__>
26 | cDNA_annotation_comparer.dbi:--GENETIC_CODE=<__GENETIC_CODE__>
27 | 
-------------------------------------------------------------------------------- /genome/assess/CRAQ.sh: --------------------------------------------------------------------------------
1 | ### https://github.com/JiaoLaboratory/CRAQ.git
2 | ### https://mp.weixin.qq.com/s/Qqj6AlgyImW9U9tTKF-3Cw
3 | 
4 | ### AQI > 90, reference quality; AQI from 80-90, high quality; AQI from 60-80, draft quality; and AQI < 60, low quality.
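### Descriptive note: CRAQ reports two indicators under ${outdir}/runAQI_out, R-AQI (regional, from small clip-based errors, CREs) and S-AQI (structural, from CSEs); the quality bands above apply to both.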
5 | 
6 | genome=YZ.asm.hic.p_ctg.fasta
7 | sms_fq=yz.fastq.gz
8 | ngs_fq=YZ-C-3_R1.fq.gz,YZ-C-3_R2.fq.gz
9 | cpu=20
10 | outdir=Result_CRAQ
11 | 
12 | /01_soft/CRAQ/bin/craq -g ${genome} -sms ${sms_fq} -ngs ${ngs_fq} -x map-hifi --plot T --break T --thread ${cpu} --output_dir ${outdir}
13 | 
14 | ### Get user-specified regional (i.e. window=50000) AQI score.
15 | cat ${outdir}/runAQI_out/strER_out/out_final.CSE.bed ${outdir}/runAQI_out/locER_out/out_final.CRE.bed > ${outdir}/runAQI_out/CRE_CSE.bed
16 | window=50000
17 | perl /01_soft/CRAQ/src/regional_AQI.pl ${outdir}/seq.size ${window} ${window} ${outdir}/runAQI_out/CRE_CSE.bed > ${outdir}/runAQI_out/plot_AQI.out
18 | ### plot; presenting only the scaffold ids you want is ok, see CRAQcircos.py --help
19 | python /01_soft/CRAQ/src/CRAQcircos.py --genome_size ${outdir}/seq.size --genome_error_loc ${outdir}/runAQI_out/CRE_CSE.bed --genome_score ${outdir}/runAQI_out/plot_AQI.out --output ${outdir}/runAQI_out/plot_AQI.out.pdf
20 | 
21 | ### !!!! remove intermediate files !!!!
22 | # rm -rf ${outdir}/SRout/*sort* ${outdir}/SRout/*tmp ${outdir}/SRout/Nonmap.loc ${outdir}/LRout/*sort* ${outdir}/LRout/Nonmap.loc
23 | 
-------------------------------------------------------------------------------- /other/outlier.py: --------------------------------------------------------------------------------
1 | import sys
2 | import pandas as pd
3 | 
4 | def remove_outliers_std(data_series, std_multiplier=3):
5 |     """
6 |     Remove outliers using the standard-deviation rule.
7 |     :param data_series: pandas.Series of numeric values.
8 |     :param std_multiplier: multiple of the standard deviation used as the outlier threshold, default 3.
9 |     :return: Series with outliers removed.
10 |     """
11 |     mean = data_series.mean()
12 |     std_dev = data_series.std()
13 |     lower_bound = mean - std_multiplier * std_dev
14 |     upper_bound = mean + std_multiplier * std_dev
15 |     return data_series[(data_series >= lower_bound) & (data_series <= upper_bound)]
16 | 
17 | def main(input_file, output_file):
18 |     # read the data file
19 |     data = pd.read_csv(input_file, sep='\t')
20 | 
21 |     # column 1 holds sample names, column 2 the values
22 |     sample_names = data.iloc[:, 0]
23 |     values = data.iloc[:, 1]
24 | 
25 |     # drop outliers
26 |     cleaned_values = remove_outliers_std(values)
27 | 
28 |     # collect the results in a new DataFrame
29 |     cleaned_data = pd.DataFrame({'Sample': sample_names, 'Values': cleaned_values})
30 | 
31 |     # write the results to a new file
32 |     cleaned_data.to_csv(output_file, sep='\t', index=False)
33 |     print(f"Outliers removed; results saved to {output_file}")
34 | 
35 | if __name__ == "__main__":
36 |     if len(sys.argv) != 3:
37 |         print("Usage: python script.py <input_file> <output_file>")
38 |         sys.exit(1)
39 | 
40 |     input_file_path = sys.argv[1]
41 |     output_file_path = sys.argv[2]
42 | 
43 |     main(input_file_path, output_file_path)
44 | 
-------------------------------------------------------------------------------- /genome/Hic/haphic/haphic.sh: --------------------------------------------------------------------------------
1 | ### https://github.com/zengxiaofei/HapHiC
2 | 
3 | micromamba activate haphic
4 | 
5 | export PATH="/01_soft/samblaster/:$PATH"
6 | export PATH="/01_soft/HapHiC/:$PATH"
7 | export PATH="/01_soft/HapHiC/utils/:$PATH"
8 | 
9 | genome=$PWD/YZ.keep.fa
10 | hic1=$PWD/YZ_clean.R1.fq.gz
11 | hic2=$PWD/YZ_clean.R2.fq.gz
12 | cpu=20
13 | chromosome_num=23
14 | 
15 | #bwa index ${genome}
16 | bwa mem -5SP -t ${cpu} ${genome} ${hic1} ${hic2} | samblaster | samtools view - -@ ${cpu} -S -h -b -F 3340 -o HiC.bam
17 | filter_bam HiC.bam 1 --nm 3 --threads ${cpu} | samtools view - -b -@ ${cpu} -o HiC.filtered.bam
18 | 
19 | # pipeline (one-shot)
20 | # haphic pipeline ${genome} HiC.filtered.bam ${chromosome_num} --max_inflation 10 --threads ${cpu}
21 | 
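## Note: the commented one-shot "haphic pipeline" call above runs the same stages as steps 1-4 below (cluster, reassign, sort, build).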
--------------------------------------------------------------------------------
/genome/Hic/haphic/haphic.sh:
--------------------------------------------------------------------------------
### https://github.com/zengxiaofei/HapHiC

micromamba activate haphic

export PATH="/01_soft/samblaster/:$PATH"
export PATH="/01_soft/HapHiC/:$PATH"
export PATH="/01_soft/HapHiC/utils/:$PATH"

genome=$PWD/YZ.keep.fa
hic1=$PWD/YZ_clean.R1.fq.gz
hic2=$PWD/YZ_clean.R2.fq.gz
cpu=20
chromosome_num=23

#bwa index ${genome}
bwa mem -5SP -t ${cpu} ${genome} ${hic1} ${hic2} | samblaster | samtools view - -@ ${cpu} -S -h -b -F 3340 -o HiC.bam
filter_bam HiC.bam 1 --nm 3 --threads ${cpu} | samtools view - -b -@ ${cpu} -o HiC.filtered.bam

# pipeline (one-shot alternative to steps 1-4 below)
# haphic pipeline ${genome} HiC.filtered.bam ${chromosome_num} --max_inflation 10 --threads ${cpu}

# step1
haphic cluster --threads ${cpu} --max_inflation 10 ${genome} HiC.filtered.bam ${chromosome_num}

# step2
x=`grep "recommend_inflation" HapHiC_cluster.log | awk -F "inflation from" '{print $2}' | awk -F " " '{print $1}'`
haphic reassign --nclusters ${chromosome_num} --threads ${cpu} ${genome} full_links.pkl inflation_$x/mcl_inflation_$x.clusters.txt paired_links.clm

# step3
haphic sort ${genome} HT_links.pkl split_clms final_groups/group*.txt --processes ${cpu}

# step4
haphic build ${genome} ${genome} HiC.filtered.bam final_tours/group*.tour

# plot
haphic plot scaffolds.raw.agp HiC.filtered.bam
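## Note (sketch): step2 depends on the inflation value parsed from the step1
## log; if the grep matches nothing, $x is empty and `haphic reassign` fails
## with a confusing path error. A cheap guard right after the x= line:
##   [ -n "$x" ] || { echo "no recommend_inflation in HapHiC_cluster.log" >&2; exit 1; }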
--------------------------------------------------------------------------------
/genome/puerge_halp/kmerdup/step3_filter.sh:
--------------------------------------------------------------------------------
export PATH="/01_soft/kmerDedup/:$PATH"

prefix=YZ
mer_len=19
work_dir=$PWD
bam_dir=${work_dir}/mapping
cpu=30

## collect bowtie2 alignment rates from the per-split job stderr logs (grep "" prefixes each line with its file name)
grep "" shell/*sh.e* | sed 's/:/\t/g;s/\//\t/g;s/.fa.sh./\t/g' | cut -f 2,4 > alignment_rate.txt
ls ${bam_dir}/*.bam > bam.list
samtools merge -@ ${cpu} -n -f -b bam.list ${prefix}.keymer_map.bam
BamDeal statistics Coverage -i ${prefix}.keymer_map.bam -r ${prefix}.format.fa -q 0 -o ${prefix}.cov

## -mpr max duplication percentage [0.3]
## -mcv min k-mer coverage(%) [30]
## -mode <1/2> 1:ratio only; 2:ratio * cov [2]

perl /01_soft/kmerDedup/kmerDedup/kmerDedup.pl -k ${prefix} -mpr 0.3 -mcv 30 -kmer ${mer_len} -o kmerdedup_mpr3 -f ${prefix}.format.fa -bam ${prefix}.keymer_map.bam -cov ${prefix}.cov.stat -s samtools -t ${cpu} -mode 2
cp kmerdedup_mpr3/${prefix}.dump.hash ./

## plot
sed '1d' kmerdedup_mpr3/${prefix}.all.xls | cut -f 1,2,7,8 | sort -k 3nr,3 | awk '{print NR","$3","$1","$2","$4}' | csvtk add-header -n num,cov,name,length,state > ${prefix}.all.deal.csv
csvtk plot line --height 6 --width 20 -x 1 -y 2 ${prefix}.all.deal.csv > ${prefix}.all.deal.csv.png
csvtk pretty ${prefix}.all.deal.csv > ${prefix}.all.deal.pretty.csv

### remove tmp
#rm -rf ${prefix}.count.jf ${prefix}.dump.all ${prefix}.filt.fa split mapping shell *bt2 ${prefix}.keymer_map.bam
--------------------------------------------------------------------------------
/other/check_pid_info.sh:
--------------------------------------------------------------------------------
#!/bin/bash
####################################################

read -p "Enter the process name or PID to query: " NAME_PID

# make sure something was entered
if [ -z "${NAME_PID}" ]; then
    echo "Please enter a valid process name or PID!!"
    exit 1
fi

# if the input is numeric treat it as a PID, otherwise as a process name
if [[ "${NAME_PID}" =~ ^[0-9]+$ ]]; then
    PIDS=(${NAME_PID})
else
    PIDS=($(pgrep -x "${NAME_PID}"))
    if [ -z "$PIDS" ]; then
        echo "No such process name!!"
        exit 1
    fi
fi

for PID in "${PIDS[@]}"; do
    # skip PIDs that no longer exist
    if ! ps -p $PID &> /dev/null ; then
        echo "This PID does not exist!!"
        continue
    fi

    # fetch and display basic information about the process
    echo "------------------------------------------------"
    printf "%-20s %s\n"      "PID:"                  "$PID"
    printf "%-20s %s\n"      "Command:"              "$(ps -p $PID -o cmd=)"
    printf "%-20s %s%%\n"    "CPU usage:"            "$(ps -p $PID -o %cpu=)"
    printf "%-20s %s%%\n"    "Memory usage:"         "$(ps -p $PID -o %mem=)"
    printf "%-20s %s\n"      "Owner:"                "$(ps -p $PID -o user=)"
    printf "%-20s %s\n"      "State:"                "$(ps -p $PID -o stat=)"
    printf "%-20s %.2f MB\n" "Virtual memory:"       "$(echo "$(ps -p $PID -o vsz=) / 1024" | bc -l)"
    printf "%-20s %.2f MB\n" "Resident memory (RSS):" "$(echo "$(ps -p $PID -o rss=) / 1024" | bc -l)"
    printf "%-20s %s\n"      "Elapsed run time:"     "$(ps -p $PID -o etime=)"
    printf "%-20s %s\n"      "Start time:"           "$(ps -p $PID -o lstart=)"
    echo "------------------------------------------------"
done
--------------------------------------------------------------------------------
/genome/Anno_EGAPx/fix_phase.py:
--------------------------------------------------------------------------------
#!/dellfsqd2/ST_OCEAN/USER/lishuo11/01_soft/mambaforge/bin/python3

import sys
gff = sys.argv[1]

with open(gff,'r') as infile:
    with open('mrna_'+gff,'w') as outfile:
        ls_gene = []
        for line in infile:
            chrom = line.split('\t')[0]
            sam = line.split('\t')[1]
            id = line.split('\t')[5]
            info = line.split('\t')[8]
            phase = line.split('\t')[7]
            region = line.split('\t')[2]
            pos_ne = line.split('\t')[6]
            start = line.split('\t')[3]
            end = line.split('\t')[4]
            if region == 'mRNA':
                min = 'NA'
                max = 'NA'
                for ls in ls_gene:
                    if ls[2] == 'CDS':
                        if min == 'NA' or int(ls[3]) < int(min):
                            min = ls[3]
                        if max == 'NA' or int(ls[4]) > int(max):
                            max = ls[4]
                for ls in ls_gene:
                    if ls[2] == 'mRNA':
                        ls[3] = min
                        ls[4] = max
                    outfile.write('\t'.join(ls))
                ls_gene = []
            ls = [chrom, sam, region, start, end, id, pos_ne, phase, info]
            ls_gene.append(ls)

        # flush the last mRNA block
        min = 'NA'
        max = 'NA'
        for ls in ls_gene:
            if ls[2] == 'CDS':
                if min == 'NA' or int(ls[3]) < int(min):
                    min = ls[3]
                if max == 'NA' or int(ls[4]) > int(max):
                    max = ls[4]
        for ls in ls_gene:
            if ls[2] == 'mRNA':
                ls[3] = min
                ls[4] = max
            outfile.write('\t'.join(ls))
outfile.close()
infile.close()
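# What this script does: records are read in blocks of one mRNA line followed
# by its CDS lines, and each mRNA's start/end is reset to the min/max
# coordinates of its own CDS features. It expects the 9-column layout produced
# by the companion deal script (feature ID sitting in column 6), not plain GFF3.
# Usage sketch:  python fix_phase.py in.gff   # writes mrna_in.gff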
--------------------------------------------------------------------------------
/deal_gff/pick_longest_gene/fix_phase.py:
--------------------------------------------------------------------------------
#!/dellfsqd2/ST_OCEAN/USER/lishuo11/01_soft/mambaforge/bin/python3

import sys
gff = sys.argv[1]

with open(gff,'r') as infile:
    with open('mrna_'+gff,'w') as outfile:
        ls_gene = []
        for line in infile:
            chrom = line.split('\t')[0]
            sam = line.split('\t')[1]
            id = line.split('\t')[5]
            info = line.split('\t')[8]
            phase = line.split('\t')[7]
            region = line.split('\t')[2]
            pos_ne = line.split('\t')[6]
            start = line.split('\t')[3]
            end = line.split('\t')[4]
            if region == 'mRNA':
                min = 'NA'
                max = 'NA'
                for ls in ls_gene:
                    if ls[2] == 'CDS':
                        if min == 'NA' or int(ls[3]) < int(min):
                            min = ls[3]
                        if max == 'NA' or int(ls[4]) > int(max):
                            max = ls[4]
                for ls in ls_gene:
                    if ls[2] == 'mRNA':
                        ls[3] = min
                        ls[4] = max
                    outfile.write('\t'.join(ls))
                ls_gene = []
            ls = [chrom, sam, region, start, end, id, pos_ne, phase, info]
            ls_gene.append(ls)

        # flush the last mRNA block
        min = 'NA'
        max = 'NA'
        for ls in ls_gene:
            if ls[2] == 'CDS':
                if min == 'NA' or int(ls[3]) < int(min):
                    min = ls[3]
                if max == 'NA' or int(ls[4]) > int(max):
                    max = ls[4]
        for ls in ls_gene:
            if ls[2] == 'mRNA':
                ls[3] = min
                ls[4] = max
            outfile.write('\t'.join(ls))
outfile.close()
infile.close()
--------------------------------------------------------------------------------
/genome/Hic/haphic/re_draw.sh:
--------------------------------------------------------------------------------
### https://github.com/zengxiaofei/HapHiC

micromamba activate haphic

export PATH="/01_soft/HapHiC/:$PATH"
export PATH="/01_soft/HapHiC/utils/:$PATH"

agp=chr.fa.agp
matrix=contact_matrix.pkl
cpu=30
bin_size=1000        ## bin size for generating contact matrix, default: 500 (kbp)
cmap=viridis         ## define the colormap for the heatmap, default: white,red. It can be any built-in sequential colormap from Matplotlib (refer to:
                     ## https://matplotlib.org/stable/users/explain/colors/colormaps.html). You can create a custom colormap by listing colors separated by commas
normalization=KR     ## method for matrix normalization, default: KR {KR,log10,none}
ncols=5              ## number of scaffolds per row in `separate_plots.pdf`, default: 5
origin=top_left      ## set the origin of each heatmap, default: bottom_left {bottom_left,top_left,bottom_right,top_right}
border_style=outline ## border style for scaffolds, default: grid {grid,outline}
figure_width=15      ## figure width, default: 15 (cm)
figure_height=12     ## figure height, default: 12 (cm)
# specified_scaffolds=
# min_len=
# vmax_coef=
# manual_vmax=
# separate_plots     ## generate `separate_plots.pdf`, depicting the heatmap for each scaffold individually, default: False

haphic plot ${agp} ${matrix} --bin_size ${bin_size} --cmap ${cmap} --normalization ${normalization} --ncols ${ncols} --origin ${origin} --border_style ${border_style} --figure_width ${figure_width} --figure_height ${figure_height} --threads ${cpu}
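## Note (sketch): contact_matrix.pkl is written by a previous `haphic plot`
## run that was fed the filtered BAM (see draw.sh below); reusing the pickle
## here lets you tweak colormap/bin size without re-reading the BAM.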
--------------------------------------------------------------------------------
/genome/Hic/haphic/draw.sh:
--------------------------------------------------------------------------------
### https://github.com/zengxiaofei/HapHiC

micromamba activate haphic

export PATH="/01_soft/HapHiC/:$PATH"
export PATH="/01_soft/HapHiC/utils/:$PATH"

agp=chr.fa.agp
bam=HiC.filtered.bam
cpu=30
bin_size=1000        ## bin size for generating contact matrix, default: 500 (kbp)
cmap=viridis         ## define the colormap for the heatmap, default: white,red. It can be any built-in sequential colormap from Matplotlib (refer to:
                     ## https://matplotlib.org/stable/users/explain/colors/colormaps.html). You can create a custom colormap by listing colors separated by commas
normalization=KR     ## method for matrix normalization, default: KR {KR,log10,none}
ncols=5              ## number of scaffolds per row in `separate_plots.pdf`, default: 5
origin=top_right     ## set the origin of each heatmap, default: bottom_left {bottom_left,top_left,bottom_right,top_right}
border_style=outline ## border style for scaffolds, default: grid {grid,outline}
figure_width=15      ## figure width, default: 15 (cm)
figure_height=12     ## figure height, default: 12 (cm)
# specified_scaffolds=
# min_len=
# vmax_coef=
# manual_vmax=
# separate_plots     ## generate `separate_plots.pdf`, depicting the heatmap for each scaffold individually, default: False

haphic plot ${agp} ${bam} --bin_size ${bin_size} --cmap ${cmap} --normalization ${normalization} --ncols ${ncols} --origin ${origin} --border_style ${border_style} --figure_width ${figure_width} --figure_height ${figure_height} --threads ${cpu}

## rm *bam
--------------------------------------------------------------------------------
/Comparative_genomics/gene_family/06_rna_seq.sh:
--------------------------------------------------------------------------------
seqkit shuffle ./00_data/cds.fa -o ./00_data/cds.fa.shuffle
OLDIFS="$IFS"
IFS=$'\n'
for line in $(cat ./00_data/rna_data_list); do
    IFS="$OLDIFS"
    read -r name rna_seq_data1 rna_seq_data2 <<< "$line"
    fastp -i ./00_data/"$rna_seq_data1" -o ./3_rna_seq/01_fastp/"$rna_seq_data1" -I ./00_data/"$rna_seq_data2" -O ./3_rna_seq/01_fastp/"$rna_seq_data2"
    salmon index -t ./00_data/cds.fa.shuffle -i ./3_rna_seq/02_salmon/cds.fa.index -p 20 -k 31
    salmon quant -i ./3_rna_seq/02_salmon/cds.fa.index --validateMappings -l A -p 8 -1 ./3_rna_seq/01_fastp/"$rna_seq_data1" -2 ./3_rna_seq/01_fastp/"$rna_seq_data2" -o ./3_rna_seq/02_salmon/"$rna_seq_data1".quant
    grep -f ./1_identify_gene_family/04_final_gene_family/final_gene_id ./3_rna_seq/02_salmon/"$rna_seq_data1".quant/quant.sf | sort > ./3_rna_seq/03_gene_family.quant/"$rna_seq_data1".quant
    echo "$name" > ./3_rna_seq/04_visualization/"$name".sf.TPM ; awk '{print $4}' ./3_rna_seq/03_gene_family.quant/"$rna_seq_data1".quant >> ./3_rna_seq/04_visualization/"$name".sf.TPM
    IFS=$'\n'
done
IFS="$OLDIFS"
echo 'gene name' > ./3_rna_seq/04_visualization/gene_name ; awk '{print $1}' ./1_identify_gene_family/04_final_gene_family/final_gene_id >> ./3_rna_seq/04_visualization/gene_name
find ./3_rna_seq/04_visualization/ -name '*.sf.TPM' | paste -sd ' ' > ./3_rna_seq/04_visualization/TPM_file_path
paste ./3_rna_seq/04_visualization/gene_name $(cat ./3_rna_seq/04_visualization/TPM_file_path) > ./3_rna_seq/04_visualization/final_matrix
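# Note: the salmon index is rebuilt from the same cds.fa.shuffle on every
# loop iteration; hoisting the `salmon index` line above the for loop gives
# the same index once and saves time on multi-sample runs.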
--------------------------------------------------------------------------------
/genome/evaluate_genome_size/evaluate_genome_size.sh:
--------------------------------------------------------------------------------
#!/bin/sh
set +o posix

prefix=yu
NGS_fq1=ABCD_1.clean.fq.gz
NGS_fq2=ABCD_2.clean.fq.gz
mer_len=19
cpu=30
ploidy=2
read_len=150

## count k-mers
jellyfish count -m ${mer_len} -s 1000000000 -t ${cpu} -C -o ${prefix}.count.jf <(pigz -p 5 -d -c ${NGS_fq1}) <(pigz -p 5 -d -c ${NGS_fq2})
#jellyfish count -m ${mer_len} -s 1000000000 -t ${cpu} -C -o ${prefix}.count *.fq

## if jellyfish gives you more than one count file, you need to merge them first
# jellyfish merge -v -o ${prefix}.count.jf ${prefix}.count_*

jellyfish stats -o ${prefix}.stats ${prefix}.count.jf
jellyfish histo -t ${cpu} ${prefix}.count.jf > ${prefix}.histo

<<another_soft2
## alternative estimators (kept as a commented-out heredoc); convert the
## histogram to TSV first, e.g.: awk '{print $1"\t"$2}' ${prefix}.histo > ${prefix}.histo.tsv
Genomeye -k ${mer_len} ${prefix}.histo.tsv > Genomeye.result
## check kmernum: cat ${prefix}.stats
## kmernum: 118233500882 expected_depth_for_unique_kmer: 76
gce -g 118233500882 -f ${prefix}.histo.tsv -m 1 -D 1 -c 76 >O.gce.table 2>O.gce.log
tail O.gce.log
another_soft2
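## Quick arithmetic from the numbers quoted above (a sketch, not part of the
## original pipeline): genome size ~= total_kmer_num / peak_kmer_depth
echo "118233500882 76" | awk '{printf "genome size ~ %.2f Gb\n", $1/$2/1e9}'   # ~1.56 Gb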
--------------------------------------------------------------------------------
/deal_fasta/fa2phy/fa2phy.py:
--------------------------------------------------------------------------------
#!/usr/bin/env python2
import sys
# usage
USAGE = "\nusage: python2 fa2phy.py [input fasta file] [output phy file]\n"
def parseFasta(filename):
    fas = {}
    id = None
    with open(filename, 'r') as fh:
        for line in fh:
            if line[0] == '>':
                header = line[1:].rstrip()
                id = header.split()[0]
                fas[id] = []
            else:
                fas[id].append(line.rstrip())
    for id, seq in fas.iteritems():
        fas[id] = ''.join(seq)
    return fas
if len(sys.argv) != 3:
    print USAGE
    sys.exit()
fas = parseFasta(sys.argv[1])
outfile = sys.argv[2]
sequence_list = []  # to keep the input order of sequences
sequence_dict = {}
for rec in fas:
    sequence_list.append(rec)
    sequence_dict[rec] = fas[rec]
# check the length of the alignment:
alignment_length = 0
for gene in sequence_dict:
    if (alignment_length != 0) and (len(sequence_dict[gene]) != alignment_length):
        print "Error in alignment length, exit on error !!!"
        sys.exit()
    else:
        alignment_length = len(sequence_dict[gene])
number_of_seq = len(sequence_dict)
longest_id = sorted(sequence_dict.keys(), key = lambda k: len(k))[-1]
# write the alignment in Phylip format
phyfile = open(outfile, "w")
phyfile.write(str(number_of_seq)+" "+str(alignment_length)+"\n")
for gene in sequence_list:
    phyfile.write(gene.ljust(len(longest_id), ' ') + " " + sequence_dict[gene] + "\n")
phyfile.close()
--------------------------------------------------------------------------------
/genome/assess/GCI_pb.sh:
--------------------------------------------------------------------------------
##https://github.com/yeeus/GCI

genome=curated.fasta
threads=70
pb_ccs_fq=ccs.fq.gz
kmer=17
## NOTE: mapquik_density and mapquik_lmer are referenced by the mapquik step
## below but are never set in this script; define them (mapquik -d / -l)
## before using that step.

## Map HiFi and/or ONT reads to the assembly with minimap2
minimap2 -t $threads -ax map-hifi $genome $pb_ccs_fq > align.sam ## map ONT reads with -ax map-ont
samtools view -@ $threads -Sb align.sam | samtools sort -@ $threads -o align.bam
samtools index align.bam
rm align.sam

## winnowmap
/01_soft/meryl-1.4.1/bin/meryl count k=${kmer} output merylDB ${genome}
/01_soft/meryl-1.4.1/bin/meryl print greater-than distinct=0.9998 merylDB > mat_repetitive_k${kmer}.txt
/01_soft/Winnowmap/bin/winnowmap -k ${kmer} -W mat_repetitive_k${kmer}.txt -ax map-pb $genome $pb_ccs_fq > align2.sam
samtools view -@ $threads -Sb align2.sam | samtools sort -@ $threads -o align2.bam
samtools index align2.bam
rm align2.sam

## mapquik
/usr/bin/singularity exec --bind $PWD:$PWD mapquik.sif mapquik --low-memory --parallelfastx --threads ${threads} -p mapquik -k ${kmer} -d ${mapquik_density} -l ${mapquik_lmer} ${pb_ccs_fq} --reference ${genome}

## veritymap
python /01_soft/VerityMap/veritymap/main.py --reads ${pb_ccs_fq} -d hifi-diploid -o veritymap ${genome} -t ${threads}

# We recommend inputting one bam and one paf file produced by two different tools (for example, one bam file from winnowmap and one paf file from minimap2)
# PDF is recommended because a PNG file may lose some detail, although GCI outputs png files by default

## !!!! select by yourself !!!!
/01_soft/GCI/GCI.py -r ${genome} --hifi align.bam align2.bam -t $threads -p -it pdf
--------------------------------------------------------------------------------
/genome/TE/HiTEv3.2.sh:
--------------------------------------------------------------------------------
## https://github.com/CSU-KangHu/HiTE#cmd

genome=$PWD/Anneissia_japonica.fa ### absolute path
out_dir=$PWD/Hite_out             ### absolute path
cpu=10
curated_lib=/06_database/fish_te/animal_fish.rmdup.lib ## Provide a fully trusted curated library, which will be used to pre-mask highly homologous sequences in the genome.
isplant=1         ## Is it a plant genome, 1: true, 0: false.
isremove_nested=1 ## Whether to remove nested TEs, 1: true, 0: false.
isrecover=1       ## Whether to enable recovery mode to avoid starting from the beginning, 1: true, 0: false.
isdomain=1        ## Whether to obtain TE domains; HiTE uses RepeatPeps.lib from RepeatMasker to obtain TE domains, 1: true, 0: false.
isannotate=1      ## Whether to annotate the genome using the TE library generated, 1: true, 0: false.
isintact_anno=1   ## Whether to generate annotation of full-length TEs, 1: true, 0: false.
isBM_RM2=1        ## Whether to conduct benchmarking of RepeatModeler2, 1: true, 0: false.
isBM_HiTE=1       ## Whether to conduct benchmarking of HiTE, 1: true, 0: false.

## for help
# /usr/bin/singularity exec /01_soft/singularity_all/HiTE.sif python /01_soft/HiTEv3.2/main.py -h

/usr/bin/singularity exec /01_soft/singularity_all/HiTE.sif \
    python /01_soft/HiTEv3.2/main.py \
    --genome ${genome} \
    --thread ${cpu} \
    --outdir ${out_dir} \
    --chunk_size 200 \
    --plant ${isplant} \
    --remove_nested ${isremove_nested} \
    --domain ${isdomain} \
    --recover ${isrecover} \
    --annotate ${isannotate} \
    --intact_anno ${isintact_anno} \
    --BM_RM2 ${isBM_RM2} \
    --BM_HiTE ${isBM_HiTE} \
    --curated_lib ${curated_lib}
--------------------------------------------------------------------------------
/genome/puerge_halp/kmerdup/step1_prepare.sh:
--------------------------------------------------------------------------------
export PATH="/01_soft/kmerDedup/:$PATH"

prefix=YZ
genome=curated.fasta
mer_len=19
cpu=70
counter_len=8
NGS_fq1=YZ_clean.R1.fq.gz
NGS_fq2=YZ_clean.R2.fq.gz
ccs_fa=yz.fasta.gz

## step1 count k-mers
jellyfish count -m ${mer_len} -s 100M -t ${cpu} -c ${counter_len} -C -o ${prefix}.count <(pigz -p 5 -d -c ${NGS_fq1}) <(pigz -p 5 -d -c ${NGS_fq2}) <(zcat ${ccs_fa})

## if jellyfish gives you more than one count file, you need to merge them first
# jellyfish merge -v -o ${prefix}.count.jf ${prefix}.count_*

mv ${prefix}.count ${prefix}.count.jf

## step2 stat and histo (can be skipped)
jellyfish stats -o ${prefix}.stats ${prefix}.count.jf
jellyfish histo -t ${cpu} ${prefix}.count.jf | perl -lane 'my ($dpt, $cnt) = split(/\s+/, $_); my $nn = $dpt * $cnt;print "$dpt\t$cnt\t$nn"' > ${prefix}.histo

## plot; currently not working, known problem
# Rscript /01_software/genomescope/genomescope2.0/genomescope.R -i ${prefix}.histo -o ${prefix} -k ${mer_len} -n ${prefix}.model_2

## step3 dump k-mers
jellyfish dump -c -t -o ${prefix}.dump.all ${prefix}.count.jf
perl /01_soft/kmerDedup/kmerDedup/kmerFilter.pl -d ${prefix}.dump.all -o ${prefix}.filt.fa -l 3 -u 100000
perl /01_soft/kmerDedup/kmerDedup/splitFasta.pl -f ${prefix}.filt.fa -o split -k ${prefix}.kmer

## step4 map k-mers
perl /01_soft/kmerDedup/kmerDedup/fa2fa.pl -f ${genome} -o ${prefix}.format.fa -c F -n F -l 1000

bowtie2-build --threads ${cpu} ${prefix}.format.fa ${prefix}.format
ls split/ > split.id
[ -d shell ] || mkdir shell

for i in $(cat split.id); do sed "s/ABCD/${i}/g" bowtie2_demo.sh > shell/bowtie2_${i}.sh; done
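# bowtie2_demo.sh is a template job script: the literal placeholder ABCD is
# substituted with each split k-mer file name, producing one mapping shell per
# split under shell/ (their stderr logs feed step3's alignment_rate.txt).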
--------------------------------------------------------------------------------
/Comparative_genomics/gene_cluster/Galeon.sh:
--------------------------------------------------------------------------------
#!/bin/bash
## https://github.com/molevol-ub/galeon
# micromamba activate Galeon
source activate /micromamba/envs/Galeon

export PATH=/01_soft/galeon/GALEON_masterScripts:$PATH
export PATH=/mambaforge/envs/R-4.2/bin/:$PATH

genome=fcs.fa
EVM=fcs.EVM.bgi.filter.gff
ID=dom.select.id
famliy_name=FRED
GVALUE=100 ## Galeon estimates the expected number of genes per stretch of bases and, from the -g input value, the probability of finding 2 or more genes within a window of GVALUE size (i.e. GVALUE kb); genes co-occurring in such windows are treated as one cluster in the following analyses.
cpu=48

fishInWinter.pl ${ID} ${EVM} > ${ID}.gff
gffread ${ID}.gff -g ${genome} -x ${ID}.gff.cds -y ${ID}.gff.pep

[ -d GFFs ] || mkdir GFFs
[ -d Proteins ] || mkdir Proteins

awk '$3=="mRNA"' ${ID}.gff | cut -f 1,4,5,9 | sed 's/ID=//g;s/;//g' > GFFs/${famliy_name}_fam.bed2
sed "s/U$//g" ${ID}.gff.pep > Proteins/${famliy_name}_fam.fasta

mafft --auto --thread ${cpu} ${ID}.gff.pep > Proteins/${famliy_name}_fam.aln

GALEON_ControlScript.py clusterfinder -a GFFs/ -p Proteins/ -e enabled -pm True -F WithinFamilies -g ${GVALUE} -emx_pos Lower -c two -t FastTree -f orange -outdir cluster_${famliy_name} -log Log_dir

GALEON_GetEvoStats.py -clust cluster_${famliy_name} -prot Proteins/ -coords GFFs

perl /dellfsqd2/ST_OCEAN/USER/lishuo11/01_soft/galeon/GALEON_masterScripts/Scripts/Get_scaffold_length.pl ${genome}

## -sfilter ALL|NUM|FILE The summary plots will represent the results for a "NUM" number of largest scaffolds; a list of scaffolds of interest provided as a single column in an input "FILE"; or "ALL" scaffolds (often too many, so the resulting summary plots may not be informative).

GALEON_SummaryFiles.py -fam ${famliy_name} -clust cluster_${famliy_name} -coords GFFs -ssize ChrSizes.txt -sfilter 10
--------------------------------------------------------------------------------
/genome/Anno_denovo/helixer.sh:
--------------------------------------------------------------------------------
## https://github.com/weberlab-hhu/Helixer

species=hcs
genome=$PWD/HCS_chr.fa
out_gff=$PWD/helixer.predict.gff
lineage=invertebrate # {vertebrate,land_plant,fungi,invertebrate}
model=/Helixer_models/models/invertebrate/invertebrate_v0.3_m_0100.h5
# fungi:        /Helixer_models/models/fungi/fungi_v0.3_a_0100.h5
# invertebrate: /Helixer_models/models/invertebrate/invertebrate_v0.3_m_0100.h5
# land_plant:   /Helixer_models/models/land_plant/land_plant_v0.3_a_0080.h5
# vertebrate:   /Helixer_models/models/vertebrate/vertebrate_v0.3_m_0080.h5
batch_size=8 # a larger batch_size needs more GPU memory; {5,6} should work on a single GPU card.
TMP=$PWD/TMP

### get help
# /usr/bin/singularity exec --bind $PWD/:$PWD/ helixer.sif Helixer.py -h

### download the best models to !!!!!! your homedir .local/share/Helixer !!!!!
#/usr/bin/singularity exec --bind $PWD/:$PWD/ helixer.sif fetch_helixer_models.py

### main script; "--nv" enables GPU support, CPU-only also works.
[ -d TMP ] || mkdir TMP
/usr/bin/singularity run --bind $PWD/:$PWD/ --nv sif/helixer.sif Helixer.py \
    --subsequence-length 213840 \
    --overlap-offset 106920 \
    --overlap-core-length 160380 \
    --batch-size ${batch_size} \
    --lineage ${lineage} \
    --temporary-dir ${TMP} \
    --species ${species} \
    --model-filepath ${model} \
    --fasta-path ${genome} \
    --gff-output-path ${out_gff}

grep -v "#" ${out_gff} | awk '$3=="mRNA" || $3=="CDS"' | awk -F "[;\t]" '{if ($3~/mRNA/) print $0"\t"$9";"; else print $0"\t"$10";" }' | cut -f 1-8,10 > tmp.gff
fix_mRNA_coordinate.pl tmp.gff helixer.bgi.gff
gffread helixer.bgi.gff -g ${genome} -x helixer.bgi.gff.cds -y helixer.bgi.gff.pep
rm -rf tmp.gff ${TMP}
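## Note on the window parameters used above (keep these ratios if you rescale):
##   overlap-offset      = subsequence-length / 2     (213840 / 2    = 106920)
##   overlap-core-length = 0.75 * subsequence-length  (213840 * 0.75 = 160380)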
--------------------------------------------------------------------------------
/picture/synteny_circos/circos_sys.sh:
--------------------------------------------------------------------------------
#!/bin/bash
set +o posix
set -eo pipefail

export PATH="/01_software/lastal/bin/:$PATH"
export PATH="/01_software/latex/bin/x86_64-linux/:$PATH"
python=/01_soft/mamba/env/jcvi/bin/python

## uses ${sp1}.bed ${sp2}.bed ${sp1}.pep ${sp2}.pep ${sp1}.genome.fa ${sp2}.genome.fa
sp1=O_curv
sp2=O_mela

### dotplot
${python} -m jcvi.compara.catalog ortholog --dbtype prot --cpus=1 --no_strip_names ${sp1} ${sp2}

### make .simple
${python} -m jcvi.compara.synteny screen --simple ${sp1}.${sp2}.anchors ${sp1}.${sp2}.anchors.new
simpletolink.py ${sp1}.${sp2}.anchors.simple

### uses ${sp1}.genome.fa ${sp2}.genome.fa
iTools Fatools stat -InPut ${sp1}.genome.fa -OutPut ${sp1}.genome.fa.chrlist
iTools Fatools stat -InPut ${sp2}.genome.fa -OutPut ${sp2}.genome.fa.chrlist

grep -v "#" ${sp1}.genome.fa.chrlist | cut -f 1,2 | fishInWinter.pl -bf table -ff table <( cut -f 1 ${sp1}.bed | awk '!a[$0]++' ) - | awk '{print "chr - " "'"$sp1"'" "_" $1 " " "'"$sp1"'" "_" $1 " 0 " $2 " " "chr"NR}' > karyotype_sp1.txt
grep -v "#" ${sp2}.genome.fa.chrlist | cut -f 1,2 | fishInWinter.pl -bf table -ff table <( cut -f 1 ${sp2}.bed | awk '!a[$0]++' ) - | awk '{print "chr - " "'"$sp2"'" "_" $1 " " "'"$sp2"'" "_" $1 " 0 " $2 " " "chr"NR}' > karyotype_sp2.txt

awk '{print "'"$sp1"'" "_" $1 "\t" $2 "\t" $3 "\t" "'"$sp2"'" "_" $4 "\t" $5 "\t" $6}' ${sp1}.${sp2}.anchors.simple_link.txt > anchors.simple_link.rename.txt

generate_circos_configs.py anchors.simple_link.rename.txt
cp circos_config/* .
/usr/bin/singularity exec --bind $PWD:$PWD /01_soft/singularity_all/circos.sif circos -conf circos_config_output/circos.conf

rm *.ssp *.tis *.sds *.des *.prj *.suf *.bck *.chrlist
--------------------------------------------------------------------------------
/genome/assess/compleasm.sh:
--------------------------------------------------------------------------------
#!/bin/bash
## https://github.com/huangnengCSU/compleasm

### Important parameters
input_genome=genome.fa
cpu=48
database_prefix=actinopterygii
min_bestScore=0.95 # output if score at least FLOAT*bestScore [0.95]
database_path=
compleasm=~/01_software/compleasm/compleasm.py

### Threshold parameters # Tip: thresholds that are too low may give falsely high results

min_diff=0.2           # The threshold for the best matching and second best matching. default=0.2
min_identity=0.6       # The identity threshold for valid mapping results. default=0.4
min_length_percent=0.6 # The fraction of protein for valid mapping results. default=0.6
min_complete=0.9       # The length threshold for complete genes. default=0.9

### other parameters
output_dir=00_assessment
mode=busco # lite or busco
# lite:  do not use hmmsearch to filter the protein alignments.
# busco: run hmmsearch on all candidate predicted proteins to purify the miniprot alignment and improve accuracy.

### CMD
/usr/bin/singularity exec compleasm.sif python ${compleasm} run -a ${input_genome} -o ${output_dir} -l ${database_prefix} -t ${cpu} -m ${mode} --outs ${min_bestScore} -L ${database_path} --min_diff ${min_diff} --min_identity ${min_identity} --min_length_percent ${min_length_percent} --min_complete ${min_complete}

rm -rf ${output_dir}/*_odb10/*.done ${output_dir}/*_odb10/hmmer_output
pigz --best ${output_dir}/*_odb10/miniprot_output.gff
pigz --best ${output_dir}/*_odb10/translated_protein.fasta
pigz --best ${output_dir}/*_odb10/gene_marker.fasta

### for more help
## /usr/bin/singularity exec compleasm.sif python ${compleasm} -h
--------------------------------------------------------------------------------
/deal_gff/pick_longest_gene/deal.sh:
--------------------------------------------------------------------------------
#!/bin/bash
set +o posix
set -eo pipefail

species=Branchiostoma_floridae
genome=GCF_000003815.2_Bfl_VNyyK_genomic.fna
pep=GCF_000003815.2_Bfl_VNyyK_protein.faa
gff=GCF_000003815.2_Bfl_VNyyK_genomic.gff

## keep only the longest transcript per gene (writes clean.gff)
pick_longest_ncbi.pl ${gff}

## build a gene/mRNA/protein annotation table from the cleaned GFF
awk -F "Dbxref" '{print $1}' clean.gff | sed 's/rna-//g;s/cds-//g;s/gene-//g' | awk '$3=="CDS"' | cut -f 9 | awk '!a[$0]++' | sed 's/ID=//g;s/;Parent=/\t/g;s/;//g' | awk -F "\t" 'NR==FNR{a[$1]=$2}NR!=FNR{print $0"\t"a[$1]}' <(grep '>' ${pep} | sed 's/>//g' | awk -F "[" '{print $1}' | sed 's/ $//g' | sed 's/ /\t/1') - | awk '{print $2"\t"$0}' | awk -F "\t" 'NR==FNR{a[$1]=$2}NR!=FNR{print $0"\t"a[$1]}' <(awk -F "Dbxref" '{print $1}' clean.gff | sed 's/rna-//g;s/cds-//g;s/gene-//g' | awk '$3=="mRNA"' | cut -f 9 | sed 's/ID=//g;s/;Parent=/\t/g;s/;//g') - | cut -f 2- | awk -F "\t" '{print $1"\t"$2"\t"$4"\t"$3}' > ${species}.ncbi.anno

## rewrite the attribute column into BGI-style ID=/Parent= tags
awk -F "Dbxref" '{print $1}' clean.gff | sed 's/rna-//g;s/cds-//g;s/gene-//g' | awk -F ";" '{print $1";"}' | awk '{print $9"\t"$0}' | sed 's/ID=//1;s/;//1' | awk -F "\t" 'NR==FNR{a[$1]=$2}NR!=FNR{print $0"\t"a[$1]}' <(awk -F "\t" '{print $2"\t"$1}' ${species}.ncbi.anno | sed 's/\t/\tID=/g' ) - | cut -f 2- | sed 's/ID=/Parent=/1;s/\t$//g' | awk -F "\t" '{if ($3~/mRNA/) print $0";"; else print $0"\t"$9}' | cut -f 1-8,10 > clean.deal.gff

fix_mRNA_coordinate.pl clean.deal.gff clean.deal.fix.gff

gff3_sort -g clean.deal.fix.gff | grep -v "#" > ${species}.bgi.gff

## extract CDS, translate, and pull the matching NCBI proteins for comparison
gffread ${species}.bgi.gff -g ${genome} -x ${species}.bgi.gff.cds
seqkit translate --trim --clean ${species}.bgi.gff.cds > ${species}.bgi.gff.pep
grep ">" ${species}.bgi.gff.cds | awk '{print $1}' | sed "s/>//g" | seqtk subseq ${pep} - | awk '{print $1}' | seqkit seq -w 0 > ${species}.bgi.gff.pep2

rm clean*gff
--------------------------------------------------------------------------------
/genome/assess/LAI.sh:
--------------------------------------------------------------------------------
#!/bin/bash

#https://github.com/oushujun/LTR_retriever
#https://github.com/oushujun/LTR_FINDER_parallel/tree/v1.1
#https://github.com/oushujun/LTR_HARVEST_parallel
#https://www.jianshu.com/p/ed289822c825
#https://github.com/wangziwei08/LTR-insertion-time-estimation
#https://www.jianshu.com/p/f962d5c40fdf ### LTR_retriever

genome=hcs.chr.genome.fa
threads=20
substitution_mutations_rate=1e-8

### software paths
gt_software=/01_software/gt-1.6.2-Linux_x86_64-64bit-complete/bin/gt
finder=/01_software/LTR_FINDER_parallel/bin/LTR_FINDER.x86_64-1.0.7/ltr_finder
LTR_FINDER_parallel=/01_software/LTR_FINDER_parallel/LTR_FINDER_parallel
LTR_HARVEST_parallel=/01_software/LTR_HARVEST_parallel/LTR_HARVEST_parallel

## step1 LTR_HARVEST
perl ${LTR_HARVEST_parallel} -seq ${genome} -gt ${gt_software} -threads ${threads}

## step2 LTR_FINDER
perl ${LTR_FINDER_parallel} -seq ${genome} -harvest_out -finder ${finder} -t ${threads}

## step3 LTR_retriever
source activate /01_soft/mamba/envs/LTR_retriever
LTR_retriever -genome ${genome} -inharvest ${genome}.harvest.combine.scn -infinder ${genome}.finder.combine.scn -threads ${threads} -u ${substitution_mutations_rate}

## step4 prepare the plotting input
sed '1d' ${genome}.pass.list | awk -F "[:\t]" '{print $1","$(NF-2)","$NF}' | sed 's/-0/0/g' > LTR_time.csv
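## Reminder of the dating relation LTR_retriever applies (hedged note):
##   T = K / (2 * u),  K = divergence between the paired LTRs,
##   u = per-site substitution rate passed via -u above (1e-8 here)
## e.g. K = 0.02  ->  T = 0.02 / (2 * 1e-8) = 1,000,000 years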
--------------------------------------------------------------------------------
/deal_gff/pick_longest_gene/pick_longest_ncbi.pl:
--------------------------------------------------------------------------------
#!/usr/bin/perl -w
use strict;

my $gff=shift or die "usage: perl $0 <in.gff>";
my $out="clean.gff";

my %length;
my %gene_cds;
open I,"< $gff";
while (<I>) {
    next if(/^#/);
    my @a=split(/\s+/);
    my ($chr,$source,$type,$start,$end,$score,$strand,$phase,$info)=@a;
    if($type eq "CDS"){
        $info=~/Parent=([^;]+)/;
        my $cds_id=$1;
        $length{$cds_id}+=$end-$start+1;
    }
    elsif($type eq "mRNA"){
        $info=~/ID=([^;]+).*Parent=([^;]+)/;
        my ($cds_id,$gene_id)=($1,$2);
        $gene_cds{$gene_id}{$cds_id}=1;
    }
}
close I;

foreach my $gene_id(keys %gene_cds){
    foreach my $cds_id(keys %{$gene_cds{$gene_id}}){
        if(exists $length{$cds_id}){
            $gene_cds{$gene_id}{$cds_id}=$length{$cds_id};
        }
        else {
            delete $gene_cds{$gene_id}{$cds_id};
        }
    }
}

my %keep;
foreach my $gene_id(sort keys %gene_cds){
    my @protein_id=sort {$gene_cds{$gene_id}{$b} <=> $gene_cds{$gene_id}{$a}} keys %{$gene_cds{$gene_id}};
    my $selected=$protein_id[0];
#   $selected=~s/\.\d$//;
    $keep{$selected}=1;
}

open O,"> $out";
open I,"< $gff";
while (<I>) {
    chomp;
    next if(/^#/);
    my @a=split(/\s+/);
    my ($chr,$source,$type,$start,$end,$score,$strand,$phase,$info)=@a;
    if($type eq "CDS"){
        $info=~/Parent=([^;]+)/;
        my $cds_id=$1;
        next unless($keep{$cds_id});
        s/transcript://g;
        print O "$_\n";
    }
    elsif($type eq "mRNA"){
        $info=~/ID=([^;]+).*Parent=([^;]+)/;
        my ($cds_id,$gene_id)=($1,$2);
        next unless($keep{$cds_id});
        s/transcript://g;
        print O "$_\n";
    }
}
close I;
close O;
--------------------------------------------------------------------------------
/genome/Genome_error_correction/Pilon&racon.sh:
--------------------------------------------------------------------------------
For a draft genome assembled from PacBio long reads, the sequencing error rate means the assembly may contain erroneous regions; the long-read and short-read data that were generated can then be used to polish the draft.
Hybrid assembly with both short- and long-read data is also an option at assembly time (to be added after further testing; not written up yet).
racon polishes with long reads, while pilon uses short reads. Incidentally, NextPolish accepts either; its short-read polishing seems weaker than pilon's, though opinions differ.
1. Software installation
--------------------------------------------------
# mamba is the recommended way to install both tools
mamba install pilon
mamba install racon
# minimap2 and bwa are also needed; install them yourself
--------------------------------------------------
2. Genome polishing (three rounds each, long reads first, then short reads)
--------------------------------------------------
# long-read polishing, round 1
minimap2 -ax map-pb -t 24 assembly.fa pacbio.read.fasta | gzip -c - > minimap1.sam.gz
racon -t 24 -u pacbio.read.fasta minimap1.sam.gz assembly.fa > racon1.fasta
# long-read polishing, round 2
minimap2 -ax map-pb -t 24 racon1.fasta pacbio.read.fasta | gzip -c - > minimap2.sam.gz
racon -t 24 -u pacbio.read.fasta minimap2.sam.gz racon1.fasta > racon2.fasta
# long-read polishing, round 3
minimap2 -ax map-pb -t 24 racon2.fasta pacbio.read.fasta | gzip -c - > minimap3.sam.gz
racon -t 24 -u pacbio.read.fasta minimap3.sam.gz racon2.fasta > racon3.fasta
# short-read polishing, round 1 (pilon requires a sorted, indexed BAM)
bwa index racon3.fasta
bwa mem -t 24 racon3.fasta illmunia_R1.fq.gz illmunia_R2.fq.gz | samtools sort -@ 24 - -o bwamem1.bam
samtools index bwamem1.bam
pilon --genome racon3.fasta --frags bwamem1.bam --changes --diploid --outdir ./pilon.out --output pilon1
# short-read polishing, round 2
bwa index ./pilon.out/pilon1.fasta
bwa mem -t 24 ./pilon.out/pilon1.fasta illmunia_R1.fq.gz illmunia_R2.fq.gz | samtools sort -@ 24 - -o bwamem2.bam
samtools index bwamem2.bam
pilon --genome ./pilon.out/pilon1.fasta --frags bwamem2.bam --changes --diploid --outdir ./pilon.out --output pilon2
# short-read polishing, round 3
bwa index ./pilon.out/pilon2.fasta
bwa mem -t 24 ./pilon.out/pilon2.fasta illmunia_R1.fq.gz illmunia_R2.fq.gz | samtools sort -@ 24 - -o bwamem3.bam
samtools index bwamem3.bam
pilon --genome ./pilon.out/pilon2.fasta --frags bwamem3.bam --changes --diploid --outdir ./pilon.out --output pilon3
--------------------------------------------------------
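# Sketch: the three racon rounds above differ only in file names, so a loop
# version (same tools and flags) avoids copy/paste drift:
asm=assembly.fa
for i in 1 2 3; do
    minimap2 -ax map-pb -t 24 ${asm} pacbio.read.fasta | gzip -c - > minimap${i}.sam.gz
    racon -t 24 -u pacbio.read.fasta minimap${i}.sam.gz ${asm} > racon${i}.fasta
    asm=racon${i}.fasta
done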
--------------------------------------------------------------------------------
/deal_fasta/six_frame_translate/translate_seq.py:
--------------------------------------------------------------------------------
import sys
from collections import OrderedDict
from Bio import SeqIO
from Bio.Data import CodonTable

def six_frame_translate(inFa, fout=sys.stdout, seqfmt='fasta', transl_table=1):
    d_length = OrderedDict()
    for rc in SeqIO.parse(open(inFa), seqfmt):
        for seq, suffix0 in zip([rc.seq, rc.seq.reverse_complement()], ['aa', 'rev_aa']):
            for frame in range(0,3):
                nucl_seq = seq[frame:]
                try: aa_seq = translate_seq(nucl_seq, table=transl_table)
                except CodonTable.TranslationError: continue # Codon 'XGA' is invalid
                suffix = '|{}{}'.format(suffix0, frame+1)
                print('>{}{}\n{}'.format(rc.id, suffix, aa_seq), file=fout)
        d_length[rc.id] = len(rc.seq)
    return d_length
def _six_frame_translate(rc, transl_table=1):
    for seq, suffix0 in zip([rc.seq, rc.seq.reverse_complement()], ['aa', 'rev_aa']):
        for frame in range(0,3):
            nucl_seq = seq[frame:]
            try: aa_seq = translate_seq(nucl_seq, table=transl_table)
            except CodonTable.TranslationError: continue # Codon 'XGA' is invalid
            suffix = '|{}{}'.format(suffix0, frame+1)
            yield rc.id, suffix, aa_seq

def translate_seq(inSeq, **kargs):
    aa = inSeq.translate(**kargs)
    return aa
def translate_cds(inSeq, transl_table=1, **kargs):
    for key in list(kargs.keys()):
        if not key in {'to_stop', 'stop_symbol', 'gap'}:
            del kargs[key]
    try:
        aa = translate_seq(inSeq, cds=True, table=transl_table, **kargs)
    except CodonTable.TranslationError as e:
        aa = translate_seq(inSeq, table=transl_table, **kargs)
    return aa

def main(inFa, outSeq=sys.stdout):
    for rc in SeqIO.parse(open(inFa), 'fasta'):
        print('>{}\n{}'.format(rc.id, translate_seq(rc.seq)), file=outSeq)

if __name__ == '__main__':
    import sys
    inFa = sys.argv[1]
    if inFa == 'six_frame_translate':
        inFa = sys.argv[2]
        six_frame_translate(inFa)
    else:
        main(inFa)
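# Usage sketch:
#   python translate_seq.py in.fa                       # frame-1 translation of each record
#   python translate_seq.py six_frame_translate in.fa   # all six reading frames, ids suffixed |aa1..|rev_aa3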
--------------------------------------------------------------------------------
/deal_gff/gff.simple/gff.simple.pl:
--------------------------------------------------------------------------------
#!/usr/bin/perl -w
## gff.simple.pl -gff in.gff > out.gff
use strict;
use Getopt::Long;
my ($infile,$tag,$mrna);
GetOptions(
    "gff:s"=>\$infile,
    "tag:s"=>\$tag,
    "mrna:s"=>\$mrna
);
$tag ||= 'ID';
$mrna ||='CDS';
my %CDS;
my %gid;
open IN, "$infile" || die "$!\n";
while(<IN>){
    chomp;
    next if (/^#/);
    next if (/^\s*$/);
    my @info=split(/\s+/, $_);
    if ($info[2]=~/$mrna/){
        if ($info[8]=~/$tag=([^;]+)/){
#       if ($info[9]=~/\"(\S+?)\";/){
            #if (exists $gid{$1}){
            #   next;
            #}else{
            #   $gid{$1}++;
            #}
            my $key=$1;
            ($info[3],$info[4])=($info[4],$info[3]) if ($info[3]>$info[4]);
#           push @{$CDS{$key}}, [@info];
            push @{$CDS{$key}}, [$info[0],$info[1],$info[2],$info[3],$info[4],$info[5],$info[6],$info[7],$info[9]];
        }
    }
}
close IN;

foreach my $id (sort keys %CDS){
#   @{$CDS{$id}}=sort {$a->[3] <=> $b->[3]} @{$CDS{$id}};
#   my $ms=$CDS{$id}[0][3];
#   my $me=$CDS{$id}[-1][4];
#   my $strand =$CDS{$id}[0][6];
#   print "$CDS{$id}[0][0]\t$CDS{$id}[0][1]\tmRNA\t$ms\t$me\t\.\t$strand\t\.\tID=$id;\n";
    my $newc='';
    if ($CDS{$id}[0][6] eq '+'){
        @{$CDS{$id}}=sort {$a->[3] <=> $b->[3]} @{$CDS{$id}};
        #$CDS{$id}[-1][4] +=3;
        print "$CDS{$id}[0][0]\t$CDS{$id}[0][1]\tmRNA\t$CDS{$id}[0][3]\t$CDS{$id}[-1][4]\t\.\t$CDS{$id}[0][6]\t\.\tID=$id;\n";
        for (my $i=0; $i<@{$CDS{$id}}; $i++){
            $CDS{$id}[$i][8]="Parent=$id;";
            $newc=join "\t", @{$CDS{$id}[$i]};
            print "$newc\n";
        }
    }
    if ($CDS{$id}[0][6] eq '-'){
        @{$CDS{$id}}=reverse (sort {$a->[3] <=> $b->[3]} @{$CDS{$id}});
        #$CDS{$id}[-1][3] -=3;
        print "$CDS{$id}[0][0]\t$CDS{$id}[0][1]\tmRNA\t$CDS{$id}[-1][3]\t$CDS{$id}[0][4]\t\.\t$CDS{$id}[0][6]\t\.\tID=$id;\n";
        for (my $i=0; $i<@{$CDS{$id}}; $i++){
            $CDS{$id}[$i][8]="Parent=$id;";
            $newc=join "\t", @{$CDS{$id}[$i]};
            print "$newc\n";
        }
    }
}
--------------------------------------------------------------------------------
/genome/relernn/All.prediction.sh:
--------------------------------------------------------------------------------
source activate /micromamba/envs/ReLERNN
SIMULATE="ReLERNN_SIMULATE"
TRAIN="ReLERNN_TRAIN"
PREDICT="ReLERNN_PREDICT"
BSCORRECT="ReLERNN_BSCORRECT"
SEED="42"
MU="6e-9"
GENTIME="1"
URTR="1"
DIR="./ABCD_output/"
VCF="./ABCD.vcf"
GENOME="./S_maxi.fa.fai.bed"
CPU="10"
Maxwinsize="500"
Minsites="10"
batchsize="10"
#MASK="./accessibility_mask.bed"

## prepare
#awk '$2>5000000' S_maxi.fa.fai | awk '{print $1"\t0\t"$2}' > S_maxi.fa.fai.bed
bcftools view -S ABCD -m 2 -M 2 S_maxi.SNP.filter.vcf.gz | vcftools --vcf - --maf 0.05 --max-maf 0.95 --max-missing 0.1 --stdout --recode | bcftools annotate --remove QUAL,FILTER,INFO,^FORMAT/GT | grep -v contig > ABCD.vcf

# Simulate data
${SIMULATE} \
    --vcf ${VCF} \
    --genome ${GENOME} \
    --projectDir ${DIR} \
    --assumedMu ${MU} \
    --upperRhoThetaRatio ${URTR} \
    --nTrain 13000 \
    --nVali 2000 \
    --nTest 100 \
    --forceDiploid \
    --maxSites ${Maxwinsize} \
    -t ${CPU} \
    --seed ${SEED}

# Train network
${TRAIN} \
    --projectDir ${DIR} \
    --nEpochs 2 \
    --nValSteps 2 \
    -t ${CPU} \
    --seed ${SEED}

# Predict
${PREDICT} \
    --vcf ${VCF} \
    --projectDir ${DIR} \
    --seed ${SEED} \
    --minSites ${Minsites} \
    --batchSizeOverride ${batchsize} \
    --phased

# Parametric Bootstrapping
${BSCORRECT} \
    --projectDir ${DIR} \
    --nSlice 2 \
    --nReps 2 \
    --seed ${SEED} \
    -t ${CPU}

## remove tmp (rsync an empty dir over the huge output dirs to delete them quickly; assumes /rsync_tmp/ is empty)
rm ${VCF}
rsync --delete-before -a /rsync_tmp/ ${DIR}/train/
rsync --delete-before -a /rsync_tmp/ ${DIR}/splitVCFs/
rsync --delete-before -a /rsync_tmp/ ${DIR}/vali/
rsync --delete-before -a /rsync_tmp/ ${DIR}/test/
rsync --delete-before -a /rsync_tmp/ ${DIR}/networks/
rm -rf ${DIR}/train/ ${DIR}/splitVCFs/ ${DIR}/vali/ ${DIR}/test/ ${DIR}/networks/
--------------------------------------------------------------------------------
/Comparative_genomics/gene_family_cluster/broccoli.sh:
--------------------------------------------------------------------------------
indir=data
houzhui=.fa   ## file-name suffix of the input proteomes
cpu=10
path_fasttree=/00_tools/FastTree
path_diamond=/00_tools/diamond

broccoli.py -dir ${indir} -ext ${houzhui} -threads ${cpu} -path_diamond ${path_diamond} -path_fasttree ${path_fasttree}
--------------------------------------------------------------------------------
/picture/genome_Circos/circos.conf:
--------------------------------------------------------------------------------
karyotype = karyotype.txt
chromosomes_units = 1000000

<ideogram>
## spacing between ideograms

<spacing>
# gap between chromosomes; here each gap is 0.5% of the circumference
default = 0.005r
# a specific gap between two given chromosomes can also be set, e.g.
<pairwise chr1;chr2>
spacing = 10u
</pairwise>
</spacing>

# ideogram position: place the ideograms at 90% of the radius
radius = 0.90r
# ideogram thickness, in r (relative) or p (pixel) units
thickness = 20p
# whether to fill the ideograms; the fill colour is taken from the last column of the karyotype file
fill = yes
# outline colour and thickness of the ideograms; with no parameter or thickness 0 there is no outline
stroke_color = dgrey
stroke_thickness = 1p
## label display
# whether to show labels (column 4 of the karyotype file); if yes, label_radius must be set, otherwise circos errors out and produces nothing
show_label = yes
# label font
label_font = default
# label position
label_radius = 1r+90p
# label font size
label_size = 30
# label orientation; yes is the easy-to-read direction
label_parallel = yes

</ideogram>

<<include ticks.conf>>

<plots>

<plot>
type = histogram
file = DNA_TE_density.txt
fill_color = 219,105,104
r1 = 0.98r
r0 = 0.88r
</plot>

<plot>
type = histogram
file = LINE_TE_density.txt
fill_color = 77,151,205
r1 = 0.88r
r0 = 0.78r
</plot>

<plot>
type = histogram
file = SINE_TE_density.txt
fill_color = 211,161,196
r1 = 0.78r
r0 = 0.68r
</plot>

<plot>
type = histogram
file = LTR_TE_density.txt
fill_color = 147,204,130
r1 = 0.68r
r0 = 0.58r
</plot>

<plot>
type = heatmap
file = gene_density.txt
color = oranges-8-seq
r1 = 0.56r
r0 = 0.46r
</plot>

<plot>
show = yes
type = line
max = 0.5
min = 0.1
glyph = rectangle
glyph_size = 10
file = GC_content.txt
r1 = 0.46r
r0 = 0.26r
color = red
stroke_color = dred
stroke_thickness = 2
</plot>

</plots>

<image>
<<include etc/image.conf>>
</image>

<<include etc/colors_fonts_patterns.conf>>
<<include etc/housekeeping.conf>>
--------------------------------------------------------------------------------
/Comparative_genomics/kaks/genelist_kaks.sh:
--------------------------------------------------------------------------------
#!/bin/bash

if [[ $# == '0' ]]; then
    echo " This shell calculates Ka/Ks for a given gene list"
    echo "usage: sh genelist_kaks.sh genelist cpu outdir_tmp all.pep all.cds"
    echo " "
    echo "example: sh genelist_kaks.sh tlr9_gene 6 test_dir all.pep all.cds "
    echo " "
    echo "attention: 1. The outdir_tmp is automatically generated! "
    echo "           2. Change to a different outdir_tmp name before using it !!! "
    exit 1
fi

### rename parameters
GO_term=$1
cpu=$2
outdir_tmp=$3
all_pep=$4
all_cds=$5

### software
seqtk=/bin/seqtk
ParaAT=/bin/ParaAT.pl
blast_shell=blast.sh

### step1: get gene list
cp ${GO_term} ${GO_term}.all.gene.list

### step2: get cds & pep
${seqtk} subseq ${all_cds} ${GO_term}.all.gene.list > ${GO_term}.all.gene.list.cds
${seqtk} subseq ${all_pep} ${GO_term}.all.gene.list > ${GO_term}.all.gene.list.pep

### step3: blastn and get gene pairs (keep hits covering >60% of either sequence)
sh ${blast_shell} ${GO_term}.all.gene.list.cds ${GO_term}.all.gene.list.cds nucl blastn result.txt ${cpu}
grep -v "#" result.txt | awk '$9/$8>0.6||$9/$7>0.6' | cut -f 1,4 | awk '$1!=$2' | awk '!a[$1,$2] && !a[$2,$1]++' > ${GO_term}.gene.pair

### step4: make cpu file
echo ${cpu} > procpu

### step5: calculate Ka/Ks
${ParaAT} -h ${GO_term}.gene.pair -n ${GO_term}.all.gene.list.cds -a ${GO_term}.all.gene.list.pep -m clustalw2 -p procpu -f axt -g -k -o ${outdir_tmp}

### step6: deal with the output
cat ./${outdir_tmp}/*kaks | awk 'NR==1;NR>=1 { print $0| "grep -v Sequence"}' > ${GO_term}.all.kaks.result.xls
#less all.kaks.result.xls |cut -f 5|grep -v 'NA' > kaks.list

### step7: remove tmpfiles
rm -rf *all.gene.list* blastdb procpu ${outdir_tmp} result.txt

### ParaAT.pl help
# ParaAT.pl -h test.homologs -n test.cds -a test.pep -p proc -o output -f axt
#--------------------------------
#-h, homologous gene-pair list file
#-n, nucleotide (CDS) sequence file
#-a, protein sequence file
#-p, thread file               ## the file holds the thread count, default 6
#-m, alignment tool            ## muscle
#-g, remove codons containing alignment gaps
#-k, run KaKs_Calculator       ## compute Ka/Ks values
#-o, output directory
#-f, output alignment format
--------------------------------------------------------------------------------
/deal_fasta/agp2fa/agp2fa.pl:
--------------------------------------------------------------------------------
#!/usr/bin/perl
use strict;

# parse input options
if (not $ARGV[0] or $ARGV[0] eq "-h") {
    die "
Description: read AGP produced by cluster2agp.pl and output the chromosome-level scaffold sequences in multi-line FASTA format.

Author: Sen Wang, wangsen1993\@163.com, 2021/7/26.

Usage: perl agp2fa.pl gfa.cluster.agp contigs.fasta > gfa.cluster.agp.fasta
\n";
}

# read cluster.agp
my (%scaffold, %contig);
open IN, "<$ARGV[0]" or die "Cannot open $ARGV[0]!\n";
while (<IN>) {
    chomp;
    my @f = split(/\t/, $_);
    if ($f[0] ne $f[5]) {
        push @{$scaffold{$f[0]}}, "$f[5]$f[8]";
    } else {
        $contig{$f[0]} = $f[5];
    }
}
close IN;

# read contigs.fasta
my %seqs;
my $header;
open IN, "<$ARGV[1]" or die "Cannot open $ARGV[1]!\n";
while (<IN>) {
    chomp;
    if (/^>(\w+)/) {
        $header = $1;
    } else {
        $seqs{$header} .= $_;
    }
}
close IN;

# output chromosome-level scaffold sequences
foreach my $s (sort keys %scaffold) {
    print ">$s\n";
    my $seq = "";
    foreach my $c (@{$scaffold{$s}}) {
        my $ctg = substr($c, 0, length($c) - 1);
        my $strand = substr($c, -1, 1);
        if ($strand eq '+') {
            die "Cannot get the sequence of $ctg! check $ARGV[1]!\n" if not $seqs{$ctg};
            $seq .= $seqs{$ctg};
        } elsif ($strand eq '-') {
            die "Cannot get the sequence of $ctg! check $ARGV[1]!\n" if not $seqs{$ctg};
            my $tem = $seqs{$ctg};
            $tem = reverse($tem);
            $tem =~ tr/ATCG/TAGC/;
            $seq .= $tem;
        } else {
            $ctg =~ /(\d+)/;
            $seq .= "N" x $1;
        }
    }
    for (my $i = 0; $i < length($seq); $i += 60) {
        my $sub = substr($seq, $i, 60);
        print "$sub\n";
    }
}
foreach my $c (sort keys %contig) {
    print ">$c\n";
    my $seq = $seqs{$c};
    die "Cannot get the sequence of $c! check $ARGV[1]!\n" if not $seqs{$c};
    for (my $i = 0; $i < length($seq); $i += 60) {
        my $sub = substr($seq, $i, 60);
        print "$sub\n";
    }
}
--------------------------------------------------------------------------------
/Comparative_genomics/kaks/go_kaks.sh:
--------------------------------------------------------------------------------
#!/bin/bash

if [[ $# == '0' ]]; then
    echo " This shell selects the genes in a GO term and calculates Ka/Ks"
    echo "usage: sh go_kaks.sh GO_term cpu outdir_tmp all_go_term_file all.pep all.cds"
    echo " "
    echo "example: sh go_kaks.sh GO_1990452 6 test_dir all.difgoall all.pep all.cds "
    echo " "
    echo "attention: 1. The outdir_tmp is automatically generated! "
    echo "           2. Change to a different outdir_tmp name before using it !!! "
    exit 1
fi

### rename parameters
GO_term=$1
cpu=$2
outdir_tmp=$3
all_go_term_file=$4
all_pep=$5
all_cds=$6

### software
seqtk=/bin/seqtk
ParaAT=/bin/ParaAT.pl
blast_shell=blast.sh

### step1: get gene list
grep -w ${GO_term} ${all_go_term_file} | sort | awk '{print $NF}' | sed 's/,/\n/g' > ${GO_term}.all.gene.list

### step2: get cds & pep
${seqtk} subseq ${all_cds} ${GO_term}.all.gene.list > ${GO_term}.all.gene.list.cds
${seqtk} subseq ${all_pep} ${GO_term}.all.gene.list > ${GO_term}.all.gene.list.pep

### step3: blastn and get gene pairs (keep hits covering >60% of either sequence)
sh ${blast_shell} ${GO_term}.all.gene.list.cds ${GO_term}.all.gene.list.cds nucl blastn result.txt ${cpu}
grep -v "#" result.txt | awk '$9/$8>0.6||$9/$7>0.6' | cut -f 1,4 | awk '$1!=$2' | awk '!a[$1,$2] && !a[$2,$1]++' > ${GO_term}.gene.pair

### step4: make cpu file
echo ${cpu} > procpu

### step5: calculate Ka/Ks
${ParaAT} -h ${GO_term}.gene.pair -n ${GO_term}.all.gene.list.cds -a ${GO_term}.all.gene.list.pep -m clustalw2 -p procpu -f axt -g -k -o ${outdir_tmp}

### step6: deal with the output
cat ./${outdir_tmp}/*kaks | awk 'NR==1;NR>=1 { print $0| "grep -v Sequence"}' > ${GO_term}.all.kaks.result.xls
#less all.kaks.result.xls |cut -f 5|grep -v 'NA' > kaks.list

### step7: remove tmpfiles
rm -rf *all.gene.list* blastdb procpu ${outdir_tmp} result.txt

### ParaAT.pl help
# ParaAT.pl -h test.homologs -n test.cds -a test.pep -p proc -o output -f axt
#--------------------------------
#-h, homologous gene-pair list file
#-n, nucleotide (CDS) sequence file
#-a, protein sequence file
#-p, thread file               ## the file holds the thread count, default 6
#-m, alignment tool            ## muscle
#-g, remove codons containing alignment gaps
#-k, run KaKs_Calculator       ## compute Ka/Ks values
#-o, output directory
#-f, output alignment format
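## Note: go_kaks.sh and genelist_kaks.sh share steps 2-7 verbatim; they differ
## only in step1 (grep the genes annotated with one GO term here, versus
## taking a ready-made gene list there).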
--------------------------------------------------------------------------------
/picture/genome_Circos/fast_Circos.sh:
--------------------------------------------------------------------------------
#!/bin/bash
set +o posix

genome=Stichopus_variegatus.fa
bgi_gff=Stichopus_variegatus.bgi.gff
chr_id=chr.id
repeatmask_out=HiTE.update.out

## get chromosome information
seqtk subseq ${genome} ${chr_id} | seqkit seq -w 100 > tmp_chr.fa
fishInWinter.pl -bf table -ff table ${chr_id} ${bgi_gff} > tmp_chr.gff

## stat
iTools Fatools stat -InPut ${genome} -OutPut ${genome}.chrlist
grep -v "#" ${genome}.chrlist | grep chr | awk '{print "chr - "$1" "$1" 0 "$2" "$1}' > karyotype.txt
awk '$3=="mRNA"' tmp_chr.gff | awk '{print $1"\t"$4"\t"$5}' | sort -k 1V,1 -k 2n,2 > gene.bed
cut -f 3,6 -d " " karyotype.txt | awk '{print $1"\t"$2}' > chr.length

## make windows (keep the total below 2500 windows)
bedtools makewindows -g chr.length -n 70 > chr.window

## stat
bedtools coverage -a chr.window -b gene.bed | cut -f 1-4 | sort -k 1V,1 -k 2n,2 > gene_density.txt
bedtools nuc -fi tmp_chr.fa -bed chr.window | cut -f 1,2,3,5 | sed '1d' > GC_content.txt

## deal with TEs
grep "DNA/" ${repeatmask_out} | awk '{print $5"\t"$6"\t"$7}' | fishInWinter.pl -bf table -ff table ${chr_id} - | sort -k 1V,1 -k 2n,2 | bedtools merge -i - > DNA.TE.bed
grep "LINE/" ${repeatmask_out} | awk '{print $5"\t"$6"\t"$7}' | fishInWinter.pl -bf table -ff table ${chr_id} - | sort -k 1V,1 -k 2n,2 | bedtools merge -i - > LINE.TE.bed
grep "SINE/" ${repeatmask_out} | awk '{print $5"\t"$6"\t"$7}' | fishInWinter.pl -bf table -ff table ${chr_id} - | sort -k 1V,1 -k 2n,2 | bedtools merge -i - > SINE.TE.bed
grep "LTR/" ${repeatmask_out} | awk '{print $5"\t"$6"\t"$7}' | fishInWinter.pl -bf table -ff table ${chr_id} - | sort -k 1V,1 -k 2n,2 | bedtools merge -i - > LTR.TE.bed
bedtools coverage -a chr.window -b DNA.TE.bed | cut -f 1-4 | sort -k 1V,1 -k 2n,2 > DNA_TE_density.txt
bedtools coverage -a chr.window -b LINE.TE.bed | cut -f 1-4 | sort -k 1V,1 -k 2n,2 > LINE_TE_density.txt
bedtools coverage -a chr.window -b SINE.TE.bed | cut -f 1-4 | sort -k 1V,1 -k 2n,2 > SINE_TE_density.txt
bedtools coverage -a chr.window -b LTR.TE.bed | cut -f 1-4 | sort -k 1V,1 -k 2n,2 > LTR_TE_density.txt

rm tmp_chr.fa tmp_chr.gff DNA.TE.bed LINE.TE.bed SINE.TE.bed LTR.TE.bed *.fai

## circos
cp /02_pilple/circos/circos.conf .
cp /02_pilple/circos/ticks.conf .

/usr/bin/singularity exec --bind $PWD:$PWD /01_soft/singularity_all/circos.sif circos -conf circos.conf
--------------------------------------------------------------------------------
/python/zscore/zscore2.py:
--------------------------------------------------------------------------------
# Usage: python zscore_large_file_parallel.py input_file output_file num_threads chunksize

import sys
import pandas as pd
import numpy as np
from concurrent.futures import ThreadPoolExecutor

def calculate_zscore(values, mean, std):
    """Compute Z-scores."""
    return (values - mean) / std

def process_chunk(chunk, mean, std, last_column_name):
    """Process one chunk: compute Z-scores and add them as a new column."""
    chunk[f"{last_column_name}_zscore"] = calculate_zscore(chunk[last_column_name].to_numpy(dtype=float), mean, std)
    return chunk

def calculate_zscore_in_chunks_parallel(input_file, output_file, num_threads, chunksize):
    # first compute the global mean and std of the last column (two passes over the file)
    total_sum, total_sq_sum, total_count = 0, 0, 0
    last_column_name = None

    # pass 1: global mean and standard deviation
    for chunk in pd.read_csv(input_file, sep='\t', chunksize=chunksize):
        if last_column_name is None:
            last_column_name = chunk.columns[-1]
        last_column_values = chunk[last_column_name].to_numpy(dtype=float)  # last column as a NumPy array
        total_sum += np.sum(last_column_values)
        total_sq_sum += np.sum(last_column_values**2)
        total_count += len(last_column_values)

    # global mean and std (std via E[x^2] - E[x]^2)
    mean = total_sum / total_count
    std = np.sqrt(total_sq_sum / total_count - mean**2)

    # pass 2: compute Z-scores per chunk in the thread pool and write out
    with open(output_file, 'w') as f_out:
        header_written = False  # controls whether the header still needs writing
        with ThreadPoolExecutor(max_workers=num_threads) as executor:
            for chunk in pd.read_csv(input_file, sep='\t', chunksize=chunksize):
                # submit the chunk to the thread pool
                future = executor.submit(process_chunk, chunk, mean, std, last_column_name)
                processed_chunk = future.result()

                # write out; the header is written only the first time
                processed_chunk.to_csv(f_out, sep='\t', index=False, header=not header_written)
                header_written = True

if __name__ == "__main__":
    # parse command-line arguments
    if len(sys.argv) < 5:
        print("Usage: python zscore_large_file_parallel.py <input_file> <output_file> <num_threads> <chunksize>")
        sys.exit(1)

    input_file = sys.argv[1]
    output_file = sys.argv[2]
    num_threads = int(sys.argv[3])  # user-defined thread count
    chunksize = int(sys.argv[4])    # user-defined chunk size

    # run
    calculate_zscore_in_chunks_parallel(input_file, output_file, num_threads, chunksize)
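# Usage sketch: 4 worker threads, 100000 rows per chunk
#   python zscore2.py input.tsv output.tsv 4 100000
# Note: each submitted future is awaited immediately, so chunks are in effect
# processed one at a time; real parallelism would require submitting a window
# of chunks before collecting their results (while preserving output order).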
-------------------------------------------------------------------------------- /picture/syri_plotsv/syri_plotsv.sh: --------------------------------------------------------------------------------
1 | ## https://github.com/nschan/nf-plotsv
2 |
3 | ## Use relative paths
4 | ref_genome=final-father-chr-mela.fa
5 | ref_genome_chr_num=24
6 | query_genome=final-mother-chr-mela.fa
7 | query_genome_chr_num=24
8 | cpu_for_minimap=20
9 | mem_for_minimap=100
10 | plot_all_chr_num=24
11 | plotsr_Space_for_homologous_chromosome=0.7
12 | plotsr_height=10
13 | plotsr_width=7
14 | plotsr_font_size=8
15 | plotsr_minimum_size_of_SR_to_be_plotted=5000
16 |
17 | export PATH="/00_tools/:$PATH"
18 |
19 | ## rename chromosomes
20 | iTools Fatools stat -InPut ${ref_genome} -OutPut ${ref_genome}.chrlen
21 | iTools Fatools stat -InPut ${query_genome} -OutPut ${query_genome}.chrlen
22 | grep -v "#" ${ref_genome}.chrlen | sort -k 2nr,2 | head -n ${ref_genome_chr_num} | awk '{print $1}' | seqtk subseq ${ref_genome} - | seqtk rename - Chr | seqkit seq -w 100 - > ref.rename.fa
23 | grep -v "#" ${query_genome}.chrlen | sort -k 2nr,2 | head -n ${query_genome_chr_num} | awk '{print $1}' | seqtk subseq ${query_genome} - | seqtk rename - Chr | seqkit seq -w 100 - > query.rename.fa
24 |
25 | ## make samplesheet
26 | echo "name,fasta" >> samplesheet.csv
27 | ## these names will also be used in the final picture.
28 | echo "ref,$PWD/ref.rename.fa" >> samplesheet.csv
29 | echo "query,$PWD/query.rename.fa" >> samplesheet.csv
30 |
31 | ## run
32 | cp /01_soft/nf-plotsv/configs/base.demo.config base.config
33 | sed -i "s/ABCD/${cpu_for_minimap}/g;s/EFGH/${mem_for_minimap}/g" base.config
34 | nextflow -config $PWD/base.config run /01_soft/nf-plotsv --samplesheet samplesheet.csv -profile local --reference ref_genome --ref_genome $PWD/ref.rename.fa --subset_pattern Chr[1-9] --reorient true
35 |
36 | ## re-draw
37 | cp work/*/*/plotsr_infile.tsv plotsv/syri_pairwise/plotsr_infile.tsv
38 | cp work/*/*/files.txt plotsv/syri_pairwise/files.txt
39 | cp /nf-plotsv/assets/plotsr_config.conf plotsv/syri_pairwise/plotsr_config.conf
40 | for i in `seq 1 ${plot_all_chr_num}` ; do echo Chr${i} >> plotsv/syri_pairwise/chr.order ; done
41 |
42 | cd plotsv/syri_pairwise/
43 | cp ../align_pairwise/*fa ./
44 | sr=$(cat files.txt)
45 | singularity run /01_soft/singularity_all/fixchr-syri-plotsr.sif plotsr --genomes plotsr_infile.tsv ${sr} --cfg plotsr_config.conf -o replot.pdf -S ${plotsr_Space_for_homologous_chromosome} -W ${plotsr_width} -H ${plotsr_height} -f ${plotsr_font_size} -s ${plotsr_minimum_size_of_SR_to_be_plotted} --chrord chr.order
46 | pigz --best -p ${cpu_for_minimap} *syri.out
47 | pigz --best -p ${cpu_for_minimap} *syri.vcf
48 | pigz --best -p ${cpu_for_minimap} *.fa
49 | cd ../../
50 |
51 | ## rm tmp
52 | rm -rf .nextflow* work ref.rename.fa query.rename.fa *chrlen *.chrlist samplesheet.csv base.config plotsv/prepare_genomes plotsv/align_pairwise
53 |
-------------------------------------------------------------------------------- /Comparative_genomics/blast/diamond_rbh.R: --------------------------------------------------------------------------------
1 |
2 | library(optparse)
3 | option_list <- list(
4 | make_option(c("-v", "--info"), type = "character", default=F, metavar="info",
5 | help="The first version to find Reciprocal best hits using diamond.\n\t\t!!! Example: diamond_rbh.R -a a.pep -b b.pep -c 10 -e 1E-3 -m 1 -M ultra-sensitive !!!"),
6 | make_option(c("-a", "--speA"), type = "character", default=NULL, metavar="pepA.fa",
7 | help="pep.fasta of Species A"),
8 | make_option(c("-b", "--speB"), type="character", default=NULL, metavar="pepB.fa",
9 | help="pep.fasta of Species B"),
10 | make_option(c("-o", "--output"), type="character", default='output.csv', metavar="output",
11 | help="RBH result"),
12 | make_option(c("-c", "--cpu"), type="integer", default=1, metavar="Number",
13 | help="cpu number. default: 1"),
14 | make_option(c("-e", "--evalue"), default=1E-3, metavar="Number",
15 | help="Expectation value. default: 1E-3"),
16 | make_option(c("-m", "--max"), type="integer", default=1, metavar="Number",
17 | help="maximum number of aligned sequences that shall be retained. default: 1"),
18 | make_option(c("-M", "--model"), default='ultra-sensitive', metavar="aligned model",
19 | help="sensitivity_mode: default: ultra-sensitive
20 | fast : fastest alignment mode, but least sensitive (default). Designed for finding hits of >70% identity.
21 | mid-sensitive : fast alignments between the fast mode and the sensitive mode in sensitivity.
22 | sensitive : fast alignments, but full sensitivity for hits of >40% identity.
23 | more-sensitive : more sensitive than the sensitive mode.
24 | very-sensitive : sensitive alignment mode.
25 | ultra-sensitive : most sensitive alignment mode (sensitivity as high as BLASTP).")
26 | )
27 |
28 | opt_parser = OptionParser(option_list=option_list);
29 | opt = parse_args(opt_parser);
30 |
31 | if (is.null(opt$speA)){
32 | print_help(opt_parser)}
33 |
34 | speA = opt$speA
35 | speB = opt$speB
36 | cpu = opt$cpu
37 | evalue = opt$evalue
38 | max = opt$max
39 | model = opt$model
40 | output = opt$output
41 |
42 | library("homologr")
43 |
44 | rec_best_hits <- diamond_reciprocal_best_hits(
45 | query = speA,
46 | subject = speB,
47 | is_subject_db = FALSE,
48 | format = "fasta",
49 | sensitivity_mode = model,
50 | out_format = "csv",
51 | evalue = evalue,
52 | max_target_seqs = max,
53 | cores = cpu,
54 | hard_mask = TRUE,
55 | diamond_exec_path = "",
56 | add_makedb_options = NULL,
57 | add_diamond_options = NULL,
58 | output_path = getwd()
59 | )
60 |
61 | write.csv(rec_best_hits,file=output,quote=F,row.names = F)
62 |
-------------------------------------------------------------------------------- /deal_gff/gff.simple/EVMtoBGI.py: --------------------------------------------------------------------------------
1 | import sys
2 | from collections import OrderedDict
3 | from types import SimpleNamespace
4 |
5 | class Record(object):
6 | def __init__(self, line):
7 | super(Record, self).__init__()
8 | self.record = line.strip('\n')
9 | lst = self.record.split('\t')
10 | self.nonA = lst[:-1]
11 | self.length = abs(int(lst[3]) - int(lst[4])) + 1
12 | self.feature = lst[2]
13 | if self.feature in ['transcript', 'primary_transcript']:
14 | self.feature = 'mRNA'
15 | attribute = {k:v for k, v in [x.split('=') for x in lst[8].split(';') if x]}
16 | attribute.update({'record':lst[8]})
17 | self.attribute = SimpleNamespace(**attribute)
18 | def __str__(self):
19 | if self.feature == 'mRNA':
20 | attribute = {'record':'='.join(['ID', self.attribute.ID]),
21 | 'ID':self.attribute.ID}
22 | elif self.feature == 'CDS':
23 | attribute = {'record':'='.join(['Parent', self.attribute.Parent]),
24 | 'Parent':self.attribute.Parent}
25 | self.attribute = SimpleNamespace(**attribute)
26 | formal = self.nonA + [self.attribute.record + ';\n']
27 | return 
'\t'.join(formal) 28 | 29 | def makedict(d, k, v): 30 | if d.get(k): 31 | d[k].append(v) 32 | else: 33 | d[k] = [v] 34 | return d 35 | 36 | def getbest(infile): 37 | gene = OrderedDict() 38 | cds = OrderedDict() 39 | with open(infile) as r: 40 | for line in r: 41 | if not line.startswith("#") and not line.startswith("\n"): 42 | r = Record(line) 43 | if r.feature == 'mRNA': 44 | gene = makedict(gene, r.attribute.Parent, r) 45 | elif r.feature == 'CDS': 46 | cds = makedict(cds, r.attribute.Parent, r) 47 | 48 | best = OrderedDict() 49 | for geneID, mrnas in gene.items(): 50 | cdslen = [(x, sum([c.length for c in cds[x.attribute.ID]])) for x in mrnas if cds.get(x.attribute.ID)] 51 | mrnas = sorted(cdslen, key=lambda x:x[1], reverse=True) 52 | #mrnas = sorted(mrnas, key=lambda x:x.length, reverse=True) 53 | if not mrnas: 54 | continue 55 | best[geneID] = mrnas[0][0] 56 | return best, cds 57 | 58 | if __name__ == "__main__": 59 | if len(sys.argv[1:]) != 1: 60 | sys.stderr.write('usage: python {} ingff > outgff\n'.format(__file__)) 61 | sys.exit() 62 | else: 63 | infile = sys.argv[1] 64 | best, cds = getbest(infile) 65 | #print(len(best)) 66 | for geneID, mrna in best.items(): 67 | sys.stdout.write(str(mrna)) 68 | if cds.get(mrna.attribute.ID): 69 | children = cds[mrna.attribute.ID] 70 | for child in children: 71 | sys.stdout.write(str(child)) 72 | -------------------------------------------------------------------------------- /other/count_directory_num_size.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # Define the output file 4 | output_file="count_size_dir.txt" 5 | 6 | # Define the maximum depth (modifiable variable) 7 | max_depth=6 8 | 9 | # Clear or create the output file 10 | > "$output_file" 11 | 12 | echo "Counting files and their sizes in KB (including hidden files and links, ensuring recursive accumulation beyond max depth) up to depth $max_depth..." 13 | 14 | # Function to recursively count files and aggregate counts and sizes up to the parent 15 | function count_files_and_sizes { 16 | local dir="$1" 17 | local current_depth="$2" 18 | 19 | # Initialize counters for this directory 20 | local file_count=0 21 | local total_size=0 22 | 23 | # Count files and their sizes directly in this directory 24 | local direct_file_count=$(find "$dir" -maxdepth 1 \( -type f -o -type l \) | wc -l) 25 | local direct_size=$(find "$dir" -maxdepth 1 \( -type f -o -type l \) -exec du -b {} + | awk '{sum += $1} END {print sum}') 26 | direct_size=${direct_size:-0} 27 | 28 | # Add direct counts and sizes to the totals 29 | file_count=$((file_count + direct_file_count)) 30 | total_size=$((total_size + direct_size)) 31 | 32 | # If current depth is less than or equal to max depth, process subdirectories 33 | if [ "$current_depth" -lt "$max_depth" ]; then 34 | for subdir in "$dir"/* "$dir"/.*; do 35 | if [ -d "$subdir" ] && [ "$subdir" != "$dir/." ] && [ "$subdir" != "$dir/.." ] && [ ! 
-L "$subdir" ]; then 36 | # Recursively get file counts and sizes from subdirectories 37 | local sub_count_and_size=$(count_files_and_sizes "$subdir" $((current_depth + 1))) 38 | local sub_count=$(echo "$sub_count_and_size" | awk '{print $1}') 39 | local sub_size=$(echo "$sub_count_and_size" | awk '{print $2}') 40 | file_count=$((file_count + sub_count)) 41 | total_size=$((total_size + sub_size)) 42 | fi 43 | done 44 | elif [ "$current_depth" -eq "$max_depth" ]; then 45 | # If at max depth, include all files recursively from this point 46 | local deeper_count=$(find "$dir" -type f -o -type l | wc -l) 47 | local deeper_size=$(find "$dir" -type f -o -type l -exec du -b {} + | awk '{sum += $1} END {print sum}') 48 | deeper_size=${deeper_size:-0} 49 | file_count=$((file_count + deeper_count)) 50 | total_size=$((total_size + deeper_size)) 51 | fi 52 | 53 | # Convert size to KB 54 | local total_size_mb=$(echo "scale=2; $total_size / 1024" | bc) 55 | 56 | # Output this directory 57 | local abs_path=$(realpath "$dir") 58 | echo -e "$file_count\t$total_size_mb\t$abs_path" >> "$output_file" 59 | 60 | # Return the file count and total size for this directory 61 | echo "$file_count $total_size" 62 | } 63 | 64 | # Start counting from the current directory with an initial depth of 1 65 | count_files_and_sizes "." 1 66 | 67 | echo "Counting completed. Results are saved in $output_file." 68 | -------------------------------------------------------------------------------- /genome/Anno_EGAPx/toBGI.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | 3 | import sys 4 | from collections import OrderedDict 5 | from types import SimpleNamespace 6 | 7 | class Record(object): 8 | def __init__(self, line): 9 | super(Record, self).__init__() 10 | self.record = line.strip('\n') 11 | lst = self.record.split('\t') 12 | self.nonA = lst[:-1] 13 | self.length = abs(int(lst[3]) - int(lst[4])) + 1 14 | self.feature = lst[2] 15 | if self.feature in ['transcript', 'primary_transcript']: 16 | self.feature = 'mRNA' 17 | attribute = {k:v for k, v in [x.split('=') for x in lst[8].split(';') if x]} 18 | attribute.update({'record':lst[8]}) 19 | self.attribute = SimpleNamespace(**attribute) 20 | def __str__(self): 21 | if self.feature == 'mRNA': 22 | attribute = {'record':'='.join(['ID', self.attribute.ID]), 23 | 'ID':self.attribute.ID} 24 | elif self.feature == 'CDS': 25 | attribute = {'record':'='.join(['Parent', self.attribute.Parent]), 26 | 'Parent':self.attribute.Parent} 27 | self.attribute = SimpleNamespace(**attribute) 28 | formal = self.nonA + [self.attribute.record + ';\n'] 29 | return '\t'.join(formal) 30 | 31 | def makedict(d, k, v): 32 | if d.get(k): 33 | d[k].append(v) 34 | else: 35 | d[k] = [v] 36 | return d 37 | 38 | def getbest(infile): 39 | gene = OrderedDict() 40 | cds = OrderedDict() 41 | with open(infile) as r: 42 | for line in r: 43 | if not line.startswith("#") and not line.startswith("\n"): 44 | r = Record(line) 45 | if r.feature == 'mRNA': 46 | gene = makedict(gene, r.attribute.Parent, r) 47 | elif r.feature == 'CDS': 48 | cds = makedict(cds, r.attribute.Parent, r) 49 | 50 | best = OrderedDict() 51 | for geneID, mrnas in gene.items(): 52 | cdslen = [(x, sum([c.length for c in cds[x.attribute.ID]])) for x in mrnas if cds.get(x.attribute.ID)] 53 | mrnas = sorted(cdslen, key=lambda x:x[1], reverse=True) 54 | #mrnas = sorted(mrnas, key=lambda x:x.length, reverse=True) 55 | if not mrnas: 56 | continue 57 | best[geneID] = mrnas[0][0] 58 | return best, 
cds 59 | 60 | if __name__ == "__main__": 61 | if len(sys.argv[1:]) != 1: 62 | sys.stderr.write('usage: python {} ingff > outgff\n'.format(__file__)) 63 | sys.exit() 64 | else: 65 | infile = sys.argv[1] 66 | best, cds = getbest(infile) 67 | #print(len(best)) 68 | for geneID, mrna in best.items(): 69 | sys.stdout.write(str(mrna)) 70 | if cds.get(mrna.attribute.ID): 71 | children = cds[mrna.attribute.ID] 72 | for child in children: 73 | sys.stdout.write(str(child)) 74 | 75 | -------------------------------------------------------------------------------- /deal_gff/pick_longest_gene/toBGI.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | 3 | import sys 4 | from collections import OrderedDict 5 | from types import SimpleNamespace 6 | 7 | class Record(object): 8 | def __init__(self, line): 9 | super(Record, self).__init__() 10 | self.record = line.strip('\n') 11 | lst = self.record.split('\t') 12 | self.nonA = lst[:-1] 13 | self.length = abs(int(lst[3]) - int(lst[4])) + 1 14 | self.feature = lst[2] 15 | if self.feature in ['transcript', 'primary_transcript']: 16 | self.feature = 'mRNA' 17 | attribute = {k:v for k, v in [x.split('=') for x in lst[8].split(';') if x]} 18 | attribute.update({'record':lst[8]}) 19 | self.attribute = SimpleNamespace(**attribute) 20 | def __str__(self): 21 | if self.feature == 'mRNA': 22 | attribute = {'record':'='.join(['ID', self.attribute.ID]), 23 | 'ID':self.attribute.ID} 24 | elif self.feature == 'CDS': 25 | attribute = {'record':'='.join(['Parent', self.attribute.Parent]), 26 | 'Parent':self.attribute.Parent} 27 | self.attribute = SimpleNamespace(**attribute) 28 | formal = self.nonA + [self.attribute.record + ';\n'] 29 | return '\t'.join(formal) 30 | 31 | def makedict(d, k, v): 32 | if d.get(k): 33 | d[k].append(v) 34 | else: 35 | d[k] = [v] 36 | return d 37 | 38 | def getbest(infile): 39 | gene = OrderedDict() 40 | cds = OrderedDict() 41 | with open(infile) as r: 42 | for line in r: 43 | if not line.startswith("#") and not line.startswith("\n"): 44 | r = Record(line) 45 | if r.feature == 'mRNA': 46 | gene = makedict(gene, r.attribute.Parent, r) 47 | elif r.feature == 'CDS': 48 | cds = makedict(cds, r.attribute.Parent, r) 49 | 50 | best = OrderedDict() 51 | for geneID, mrnas in gene.items(): 52 | cdslen = [(x, sum([c.length for c in cds[x.attribute.ID]])) for x in mrnas if cds.get(x.attribute.ID)] 53 | mrnas = sorted(cdslen, key=lambda x:x[1], reverse=True) 54 | #mrnas = sorted(mrnas, key=lambda x:x.length, reverse=True) 55 | if not mrnas: 56 | continue 57 | best[geneID] = mrnas[0][0] 58 | return best, cds 59 | 60 | if __name__ == "__main__": 61 | if len(sys.argv[1:]) != 1: 62 | sys.stderr.write('usage: python {} ingff > outgff\n'.format(__file__)) 63 | sys.exit() 64 | else: 65 | infile = sys.argv[1] 66 | best, cds = getbest(infile) 67 | #print(len(best)) 68 | for geneID, mrna in best.items(): 69 | sys.stdout.write(str(mrna)) 70 | if cds.get(mrna.attribute.ID): 71 | children = cds[mrna.attribute.ID] 72 | for child in children: 73 | sys.stdout.write(str(child)) 74 | 75 | -------------------------------------------------------------------------------- /deal_fasta/fa2phy/fasta2phylip.pl: -------------------------------------------------------------------------------- 1 | #! 
/usr/bin/perl -w
2 |
3 | ######################################################################################
4 | # This script takes alignment sequence fasta file and converts it to phylip file
5 | # Author: Wenjie Deng
6 | # Date: 2007-01-29
7 | # Usage: perl fasta2phylip.pl <infile> <outfile>
8 | ######################################################################################
9 | use strict;
10 |
11 | my $usage = "Usage: perl fasta2phylip.pl <infile> <outfile>\n";
12 | my $infile = shift or die($usage); # input fasta file
13 | my $outFile = shift or die($usage); # output phylip file
14 | my $unixFile = $infile.".unix";
15 |
16 | ConvertToUnix ($infile, $unixFile);
17 | ChangetoPhylip($unixFile, $outFile);
18 | unlink ($unixFile);
19 | print "All done!\n";
20 |
21 | exit 0;
22 |
23 |
24 | ######################################################################################
25 | sub ConvertToUnix {
26 | my ($infile, $unixFile) = @_;
27 | open (IN, $infile) or die "Couldn't open $infile: $!\n";
28 | open (OUT, ">$unixFile") or die "Couldn't open $unixFile: $!\n";
29 | my @buffer = <IN>;
30 | close IN;
31 | my $line = "";
32 | foreach my $element (@buffer) {
33 | $line .= $element;
34 | }
35 | if ($line =~ /\r\n/) {
36 | $line =~ s/\r//g;
37 | }elsif ($line =~ /\r/) {
38 | $line =~ s/\r/\n/g;
39 | }
40 | print OUT $line;
41 | close OUT;
42 | }
43 |
44 |
45 | ######################################################################################
46 | sub ChangetoPhylip {
47 | my ($unixFile, $phylipFile) = @_;
48 | my $seqCount = 0;
49 | my $seq = my $seqName = "";
50 | open IN, $unixFile or die "Couldn't open $unixFile\n";
51 | while (my $line = <IN>) {
52 | chomp $line;
53 | next if $line =~ /^\s*$/;
54 | if ($line =~ /^>/) {
55 | $seqCount++;
56 | }elsif ($seqCount == 1) {
57 | $seq .= $line;
58 | }
59 | }
60 | close IN;
61 | my $seqLen = length $seq;
62 |
63 | open(IN, $unixFile) || die "Can't open $unixFile\n";
64 | open(OUT, ">$phylipFile") || die "Cant open $phylipFile\n";
65 | print OUT $seqCount," ",$seqLen,"\n";
66 | $seqCount = 0;
67 | $seq = "";
68 | while(my $line = <IN>) {
69 | chomp $line;
70 | next if($line =~ /^\s*$/);
71 |
72 | if($line =~ /^>(\S+)/) {
73 | if ($seqCount) {
74 | my $len = length $seq;
75 | if ($len == $seqLen) {
76 | print OUT "$seqName\t$seq\n";
77 | $seq = $seqName = "";
78 | }else {
79 | unlink $unixFile;
80 | unlink $phylipFile;
81 | die "Error: the sequence length of $seqName is not same as others.\n";
82 | }
83 | }
84 | $seqName = $1;
85 | $seqCount++;
86 | }else {
87 | $seq .= $line;
88 | }
89 | }
90 | close IN;
91 | # check the length of last sequence
92 | my $len = length $seq;
93 | if ($len == $seqLen) {
94 | print OUT "$seqName\t$seq\n";
95 | }else {
96 | unlink $unixFile;
97 | unlink $phylipFile;
98 | die "Error: the sequence length of $seqName is not same as others.\n";
99 | }
100 | close IN;
101 | close OUT;
102 | }
103 |
-------------------------------------------------------------------------------- /genome/Anno_integrate/pasa.sh: --------------------------------------------------------------------------------
1 | #!/bin/bash
2 | set +o posix
3 |
4 | ### you need to copy the *config files to your work path and edit them !!!!!!!!!!! 
5 |
6 | ### export env
7 | PASAPIPELINE=/01_soft/PASApipeline.v2.5.3
8 | export PATH=${PASAPIPELINE}:${PASAPIPELINE}/scripts/:${PASAPIPELINE}/misc_utilities/:$PATH
9 | unset PERL5LIB; export PATH=/softs/perl-5.30.2/bin:$PATH
10 | export PATH=/blast-2.2.26/bin/:$PATH ### blast
11 | export PATH=/gmap-2021-08-25/bin:$PATH ### gmap
12 | export PATH=${PASAPIPELINE}/bin:$PATH ### minimap2 pblat blat and other dependency softwares
13 | export PATH=/01_software/TransDecoder-TransDecoder-v5.5.0/:$PATH ### TransDecoder
14 | export PATH=/01_soft/PASApipeline.v2.5.3/bin:$PATH
15 |
16 | ### change parameter
17 | genome=HCS_chr.fa
18 | gff=HCS.EVM.bgi.filter.gff
19 | gff3=${gff}.gff3
20 | trans=trans.cdhit.rename.fa
21 | trans_clean=${trans}.clean
22 | cpu=73
23 | max_intron_length=2000000
24 | config1=alignAssembly.config
25 | config2=annotCompare.config
26 | align_software=minimap2 ### gmap blat minimap2 pblat
27 | stringent_alignment_overlap=30 ### overlapping transcripts must have this min % overlap to be clustered.
28 | gene_overlap=50 ### transcripts overlapping existing gene annotations are clustered. Intergenic alignments are clustered by default mechanism.
29 |
30 | ### step1: clean trans
31 | mkdir step1_clean
32 | cd step1_clean
33 | ln -s ../${trans} ./
34 | ${PASAPIPELINE}/bin/seqclean $trans -c 15 -v /00_tools/Clean-fasta/UniVec
35 | cd ../
36 | ln -s step1_clean/${trans}* ./
37 |
38 | ### step2: align
39 | perl change_gff_format.pl ${gff} ${gff3}
40 | ${PASAPIPELINE}/bin/samtools faidx ./${trans}
41 | ${PASAPIPELINE}/Launch_PASA_pipeline.pl --config $config1 --annot_compare --ALT_SPLICE --create --replace --run --genome ./$genome --transcripts ./$trans_clean --ALIGNERS $align_software -T -u ./$trans --CPU $cpu --MAX_INTRON_LENGTH ${max_intron_length} --TRANSDECODER --stringent_alignment_overlap ${stringent_alignment_overlap} --annots ${gff3}
42 |
43 | ## step3: compare
44 | ${PASAPIPELINE}/scripts/build_comprehensive_transcriptome.dbi -c $config1 -t $trans_clean --min_per_ID 95 --min_per_aligned 30
45 | ## annot_compare_R1
46 | ${PASAPIPELINE}/Launch_PASA_pipeline.pl -c $config2 -g ${genome} -t $trans_clean -A -L --annots $gff3 --CPU $cpu
47 | ## annot_compare_R2 ### Attention here
48 | recent_update_file=$(ls -t *gene_structures_post_PASA_updates.*.gff3 | head -n 1)
49 | ${PASAPIPELINE}/Launch_PASA_pipeline.pl -c $config2 -g ${genome} -t $trans_clean -A -L --annots $recent_update_file --CPU $cpu
50 |
51 | ## step4: alt_splice_analysis
52 | ${PASAPIPELINE}/Launch_PASA_pipeline.pl -c $config2 -g ${genome} -t $trans_clean --CPU $cpu --ALT_SPLICE
53 |
54 | ## step5: find_orfs_in_pasa_assemblies ### Attention here
55 | DBname_assemblies_fasta=$(ls *assemblies.fasta)
56 | DBname_pasa_assemblies_gff3=$(ls *pasa_assemblies.gff3)
57 | ${PASAPIPELINE}/scripts/pasa_asmbls_to_training_set.dbi --pasa_transcripts_fasta $DBname_assemblies_fasta --pasa_transcripts_gff3 $DBname_pasa_assemblies_gff3
58 |
-------------------------------------------------------------------------------- /transcriptome/Enrich/AnnotationForge_20250117.R: --------------------------------------------------------------------------------
1 | #!/01_software/miniconda3/envs/R-4.0/bin/Rscript
2 |
3 | ### https://www.jianshu.com/p/45f1e8c9b79c
4 |
5 | ### help doc
6 | library(optparse)
7 | option_list <- list(
8 | make_option(c("-v", "--info"), type = "character", default=F, metavar="info",
9 | help="Build OrgDb packages for non-model organisms.\n\t\t!!! 
Example: ./AnnotationForge.R -i eggNog.anno.txt -a LS -m shiyeyishang@outlook.com -g Cynoglossus -s se -d 244447 !!!"), 10 | make_option(c("-i", "--input"), type = "character", default=NULL, 11 | help="annotation from eggNOG"), 12 | make_option(c("-a", "--author"), type="character", default=NULL, 13 | help="author"), 14 | make_option(c("-m", "--mail"), type="character", default=NULL, 15 | help="e-mail"), 16 | make_option(c("-g", "--genus"), type="character", default=NULL, 17 | help="genus"), 18 | make_option(c("-s", "--species"), type="character", default=NULL, 19 | help="species"), 20 | make_option(c("-d", "--taxid"), type="character", default=NULL, 21 | help="Taxonomy ID from NCBI") 22 | ) 23 | 24 | opt_parser = OptionParser(option_list=option_list); 25 | opt = parse_args(opt_parser); 26 | 27 | if (is.null(opt$input)){ 28 | print_help(opt_parser)} 29 | 30 | infile = opt$input 31 | author_name = opt$author 32 | e_mail = opt$mail 33 | Genus = opt$genus 34 | Species = opt$species 35 | Taxid = opt$taxid 36 | 37 | library(tidyverse) 38 | library(AnnotationForge) 39 | emapper <- read.delim(infile) %>% 40 | # mutate(Description = if_else(Description != "-", Description, PFAMs)) %% 41 | dplyr::select(GID = query, Gene_Symbol = Preferred_name, 42 | GO = GOs, KO = KEGG_ko, Pathway = KEGG_Pathway, 43 | OG = eggNOG_OGs, Gene_Name = Description, pfam = PFAMs) 44 | 45 | gene_info <- dplyr::select(emapper,GID,Gene_Name) %>% 46 | dplyr::filter(!is.na(Gene_Name)) 47 | 48 | gene2go <- dplyr::select(emapper,GID,GO) %>% 49 | separate_rows(GO, sep = ",", convert = F) %>% 50 | filter(GO!="NA",!is.na(GO)) %>% 51 | mutate(EVIDENCE = 'A') 52 | 53 | gene2ko<- dplyr::select(emapper,GID,KO) %>% 54 | separate_rows(KO, sep = ",", convert = F) %>% 55 | dplyr::filter(KO!="NA",!is.na(KO)) 56 | 57 | gene2pathway<- dplyr::select(emapper,GID,Pathway) %>% 58 | separate_rows(Pathway, sep = ",", convert = F) %>% 59 | dplyr::filter(!is.na(Pathway)) 60 | 61 | gene2symbol<- dplyr::select(emapper,GID,Gene_Symbol) %>% 62 | dplyr::filter(!is.na(Gene_Symbol)) 63 | 64 | AnnotationForge::makeOrgPackage(gene_info=gene_info, 65 | go=gene2go, 66 | ko=gene2ko, 67 | pathway=gene2pathway, 68 | symbol=gene2symbol, 69 | maintainer=e_mail, 70 | author=author_name, 71 | version="0.1", 72 | outputDir=".", 73 | tax_id=Taxid, 74 | genus=Genus, 75 | species=Species, 76 | goTable = "go") 77 | -------------------------------------------------------------------------------- /deal_gff/agp2gff/agp2gff.py: -------------------------------------------------------------------------------- 1 | import sys 2 | from collections import defaultdict 3 | 4 | def agp(agp_file): 5 | agp_dict = defaultdict(list) 6 | with open(agp_file) as f: 7 | for line_raw in f: 8 | line = line_raw.strip().split() 9 | if line_raw.startswith("#") or line[4] == "N": 10 | continue 11 | hic_scaffold,hic_start,hic_end,*ignore,scaffold,sca_start,sca_end,strand=line 12 | agp_dict[scaffold].append([[int(sca_start),int(sca_end)],hic_scaffold,int(hic_start),strand]) 13 | return agp_dict 14 | 15 | def agp2gff(gff_file,agp_dict): 16 | out_new_gff = open("new_gff.txt","w") 17 | break_gene = open("break_gene.txt","w") 18 | not_in_cprops = open("not_in_cprops.txt","w") 19 | print('##gff-version 3', file=out_new_gff) 20 | with open(gff_file) as f: 21 | flag_gene = 1 22 | for line_raw in f: 23 | if line_raw.startswith('#'): 24 | continue 25 | line = line_raw.strip().split('\t') 26 | scaffold = line[0] 27 | gene_start,gene_end = (int(line[3]),int(line[4])) 28 | gene_strand = line[6] 29 | gene_type = 
line[2].lower()
30 | if scaffold in agp_dict:
31 | flag = 0
32 | for l in agp_dict[scaffold]:
33 | ScaInAgp_start = l[0][0]
34 | ScaInAgp_end = l[0][1]
35 | if gene_type == "mrna":
36 | flag_gene = 1
37 | if gene_start >= ScaInAgp_start and gene_end <= ScaInAgp_end and flag_gene == 1:
38 | _,hic_scaffold,hic_start,hic_strand = l
39 | line[0] = hic_scaffold
40 | line[6] = "+" if gene_strand == hic_strand else "-"
41 | # if the strand in the agp matches the strand in the gff, the new gff record is "+"; otherwise "-"
42 | if hic_strand == "-":
43 | gene_true_start = hic_start + ScaInAgp_end - gene_end
44 | gene_true_end = gene_true_start + gene_end - gene_start
45 | else:
46 | gene_true_start = hic_start + gene_start - ScaInAgp_start
47 | gene_true_end = gene_true_start + gene_end - gene_start
48 | # if the agp strand is "-", the gene's start/end on the HiC scaffold are reverse-complemented: hic start plus (scaffold end minus gene end)
49 | line[3] = str(gene_true_start)
50 | line[4] = str(gene_true_end)
51 | print("\t".join(line),file=out_new_gff)
52 | flag = 1
53 | if flag == 0:
54 | if gene_type == "mrna":
55 | flag_gene = 0
56 | print(scaffold,gene_start,gene_end,"is not in ",[l[0] for l in agp_dict[scaffold]],file=break_gene)
57 | # print("Error:","gene is not in",scaffold,str(gene_start),str(gene_end))
58 | else:
59 | print(scaffold,"is not in cprops",file=not_in_cprops)
60 | # print("Error:",scaffold,"is not in agp file")
61 |
62 | if not sys.argv[1:]:
63 | sys.stderr.write('Usage: {} agp gff\n'.format(__file__))
64 | sys.exit()
65 | agp_dict=agp(sys.argv[1])
66 | agp2gff(sys.argv[2],agp_dict)
67 |
-------------------------------------------------------------------------------- /deal_fasta/rename/rename.fa.py: --------------------------------------------------------------------------------
1 | # Usage
2 | import argparse
3 | from argparse import RawTextHelpFormatter
4 | import csv
5 | from Bio import SeqIO
6 | from Bio.SeqRecord import SeqRecord
7 | from io import StringIO
8 | import os
9 | import sys
10 | import re
11 |
12 | # Functions
13 | # Log a message to stderr
14 | def msg(*args, **kwargs):
15 | print(*args, file=sys.stderr, **kwargs)
16 |
17 | # Log an error to stderr and quit with non-zero error code
18 | def err(*args, **kwargs):
19 | msg(*args, **kwargs)
20 | sys.exit(1)
21 |
22 | # Check file exists
23 | def check_file(f):
24 | return os.path.isfile(f)
25 |
26 | # Check if file is in FASTA format
27 | def check_fasta(f):
28 | if not os.path.isfile(f) or os.path.getsize(f) < 1:
29 | return False
30 | with open(f, 'r') as fasta:
31 | if fasta.readline()[0] != '>': # Check if header starts with ">"
32 | return False
33 | for line in fasta:
34 | line = line.strip()
35 | if not line or line[0] == '>':
36 | continue
37 | # if bool(re.search('[^ACTGactgNn?\-]', line)): # Check if there are non-nucleotide characters in sequence
38 | # return False
39 | return True
40 |
41 | def tab2dict(tab, sep):
42 | dict = {}
43 | with open(tab, mode='r') as file:
44 | table = csv.reader(file, delimiter=sep)
45 | for row in table:
46 | dict[row[0]] = row[1]
47 | return dict
48 |
49 | parser = argparse.ArgumentParser(
50 | formatter_class=RawTextHelpFormatter,
51 | description='Rename headers/sequence IDs in multi-FASTA file\n',
52 | usage='\n %(prog)s --ids new_names.txt FASTA > new.fasta')
53 | parser.add_argument('fasta', metavar='FASTA', nargs=1, help='original FASTA file')
54 | parser.add_argument('--ids', metavar='FILE', required=True, nargs=1, help='specify tab-separated file with [oldnames] [newnames]')
55 | parser.add_argument('--out', metavar='FILE', nargs=1, help='specify output file (default = stdout)')
56 | parser.add_argument('--version', action='version', version='%(prog)s v0.1')
57 | args = parser.parse_args()
58 |
59 | # Check input/output files
60 | if not check_file(args.fasta[0]):
61 | err('ERROR: Cannot find "{}". Check file exists in the specified directory.'.format(args.fasta[0]))
62 | if not check_fasta(args.fasta[0]):
63 | err('ERROR: Check "{}" is in FASTA format.'.format(args.fasta[0]))
64 | if not check_file(args.ids[0]):
65 | err('ERROR: Cannot find "{}". Check file exists in the specified directory.'.format(args.ids[0]))
66 | if args.out:
67 | if check_file(args.out[0]):
68 | err('ERROR: "{}" already exists.'.format(args.out[0]))
69 |
70 | # Rename sequence IDs
71 | newseqs = []
72 | new_names = tab2dict(args.ids[0], '\t')
73 | for record in SeqIO.parse(args.fasta[0], 'fasta'):
74 | newid = new_names[record.id]
75 | newseqs.append(SeqRecord(record.seq, id=newid, description=''))
76 |
77 | # Write renamed sequences to file or print to stdout
78 | if args.out:
79 | msg('Renamed sequences saved to "{}" ... '.format(args.out[0]))
80 | SeqIO.write(newseqs, args.out[0], 'fasta')
81 | else:
82 | seqFILE = StringIO()
83 | SeqIO.write(newseqs, seqFILE, 'fasta')
84 | output = seqFILE.getvalue().rstrip()
85 | print(output)
86 |
87 | sys.exit(0)
88 |
-------------------------------------------------------------------------------- /deal_fasta/agp2fa/ragtag_agp2fa.py: --------------------------------------------------------------------------------
1 | #!/01_software/conda/envs/ragtag/bin/python3.6
2 |
3 | """
4 | MIT License
5 |
6 | Copyright (c) 2021 Michael Alonge
7 |
8 | Permission is hereby granted, free of charge, to any person obtaining a copy
9 | of this software and associated documentation files (the "Software"), to deal
10 | in the Software without restriction, including without limitation the rights
11 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
12 | copies of the Software, and to permit persons to whom the Software is
13 | furnished to do so, subject to the following conditions:
14 |
15 | The above copyright notice and this permission notice shall be included in all
16 | copies or substantial portions of the Software.
17 |
18 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
19 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
20 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
21 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
22 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
23 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
24 | SOFTWARE. 
25 | """ 26 | 27 | import sys 28 | import argparse 29 | 30 | import pysam 31 | 32 | from ragtag_utilities.utilities import reverse_complement 33 | from ragtag_utilities.AGPFile import AGPFile 34 | 35 | 36 | def main(): 37 | parser = argparse.ArgumentParser(description="Build sequences in FASTA format from an AGP v2.1 file.", usage="ragtag.py agp2fa ") 38 | parser.add_argument("agp", metavar="", nargs='?', default="", type=str, help="AGP v2.1 file") 39 | parser.add_argument("components", metavar="", nargs='?', default="", type=str, help="component FASTA file (can be uncompressed or bgzipped)") 40 | 41 | args = parser.parse_args() 42 | if not args.agp or not args.components: 43 | parser.print_help() 44 | sys.exit() 45 | 46 | agp_file = args.agp 47 | components_file = args.components 48 | 49 | fai = pysam.FastaFile(components_file) 50 | agp = AGPFile(agp_file, mode="r") 51 | 52 | # Iterate over the lines of the AGP file 53 | prev_obj = None 54 | is_first = True 55 | for agp_line in agp.iterate_lines(): 56 | if agp_line.obj != prev_obj: 57 | if is_first: 58 | print(">" + agp_line.obj) 59 | is_first = False 60 | else: 61 | print("\n>" + agp_line.obj) 62 | 63 | prev_obj = agp_line.obj 64 | 65 | if agp_line.is_gap: 66 | sys.stdout.write("N"*agp_line.gap_len) 67 | else: 68 | if agp_line.orientation != "-": 69 | sys.stdout.write(str(fai.fetch(agp_line.comp, agp_line.comp_beg-1, agp_line.comp_end))) 70 | # sys.stdout.write(reverse_complement(fai.fetch(agp_line.comp, agp_line.comp_beg-1, agp_line.comp_end))) 71 | else: 72 | sys.stdout.write(reverse_complement(str(fai.fetch(agp_line.comp, agp_line.comp_beg-1, agp_line.comp_end)))) 73 | # sys.stdout.write(fai.fetch(agp_line.comp, agp_line.comp_beg-1, agp_line.comp_end)) 74 | 75 | # End the FASTA file with a newline 76 | sys.stdout.write("\n") 77 | fai.close() 78 | 79 | 80 | if __name__ == "__main__": 81 | main() 82 | -------------------------------------------------------------------------------- /transcriptome/Enrich/eggnog-2.1.9.sh: -------------------------------------------------------------------------------- 1 | ## GO and KEGG annotation using diamond by eggnog-mapper 2 | ### 1. make XXX.wego to GO-enrich 3 | ### 2. make XXX.KO to KEGG-enrich 4 | ### 3. make gene symbol annotation file 5 | ### Attention!!! 
5 | ### Attention!!! qsub vf>8G
6 |
7 | # source /dellfsqd2/ST_OCEAN/USER/lishuo1/11_env/bashrc-17-1.txt
8 | export PATH="/01_software/eggnog-mapper-2.1.9/eggnogmapper/bin:$PATH"
9 |
10 | query=$PWD/H_moli.bgi.gff.pep
11 | cpu=10
12 | out_name=eggnog-result
13 | output_dir=$PWD
14 | temp_dir=$PWD
15 | database=/01_software/eggnog-mapper-2.1.4-main_spec/data/
16 | main_script=/01_software/eggnog-mapper-2.1.9/emapper.py
17 | software=diamond ## diamond,mmseqs,hmmer
18 | evalue=0.001
19 | sensmode=ultra-sensitive ## for diamond: fast,mid-sensitive,sensitive,more-sensitive,very-sensitive,ultra-sensitive
20 |
21 | /01_software/miniconda3/bin/python3 $main_script --cpu $cpu --data_dir $database -o $out_name --output_dir $output_dir --temp_dir $temp_dir --override -m $software -i $query --tax_scope auto --target_orthologs all --go_evidence all --pfam_realign none --report_orthologs --decorate_gff yes --evalue ${evalue} --scratch_dir $PWD --sensmode ${sensmode}
22 |
23 | grep -v -e "#" -e "query" *.emapper.annotations | awk -F "\t" '{print $1"\t"$10}' | sed 's/"//g;s/-//g;s/,/\t/g' > anno.wego
24 | grep -v -e "#" *.emapper.annotations | awk -F "\t" '{print $1"\t"$9"\t"$NF"\t"$8}' | sed 's/"//g' > simple.anno
25 | grep -v -e "#" -e "query" *.emapper.annotations | awk -F "\t" '{print $1"\t"$10}' | awk '$2!~/-/' | sed 's/"//g;s/,/;/g;s/\t/,/g' | /lishuo1/00_tools/csvtk unfold -H -f 2 -s ";" | sed 's/,/\t/g' > anno.unfold.wego
26 | grep -v -e "#" -e "query" *.emapper.annotations | awk -F "\t" '{print $1"\t"$12}' | awk '$2!~/-/' | sed 's/ko://1;s/,ko:/;/g;s/\t/,/g' | /lishuo1/00_tools/csvtk unfold -H -f 2 -s ";" | sed 's/,/\t/g' > anno.unfold.KO
27 | grep -v -e "#" -e "query" *.emapper.annotations | awk -F "\t" '{print $1"\t"$12}' | awk '$2!~/-/' | sed 's/ko://1;s/,ko:/,/g' > anno.fold.KO
28 | /01_software/miniconda3/bin/python /01_soft/kofam_scan-1.3.0/kofamscan_plus.py -K /01_soft/kofam_scan-1.3.0/ko00001.keg -i anno.unfold.KO -o kegg.all.xls
29 | awk -F "\t" '{print $1"\t"$4"\t"$5}' kegg.all.xls > gene2pathway.txt
30 | cut -f 1,3 kegg.all.xls | sed '1d;s/; /;/g' | awk -F "\\\[EC" '{print $1}' | awk '!a[$0]++' | /lishuo1/00_tools/csvtk -t fold -H -f 1 -v 2 -s " ||| " | awk -F "\t" 'NR==FNR{a[$1]=$2}NR!=FNR{print $0"\t"a[$1]}' - simple.anno | /lishuo1/00_tools/csvtk -t add-header -n ID,Symbol,Domain,Description,Symbol_KEGG > simple.anno.new
31 | rm simple.anno
32 |
33 | ### Output notes: eggnog-mapper produces three files
34 | ### [project_name].emapper.hmm_hits: records every significant eggNOG Orthologous Group (OG) hit for each query sequence; a "-" means no candidate OG was found for that sequence
35 | ### [project_name].emapper.seed_orthologs: records the best OG for each query, i.e. the top-scoring hit in [project_name].emapper.hmm_hits; finer-grained orthology relationships are then extracted from eggNOG
36 | ### [project_name].emapper.annotations: the final annotation result. Most of what you need can be extracted from it with a small script; it has 13 columns, which record the following:
37 | ### query_name: the query gene name or other ID
38 | ### seed_eggNOG_ortholog: best protein match in eggNOG
39 | ### seed_ortholog_evalue: e-value of the best match
40 | ### seed_ortholog_score: bit-score of the best match
41 | ### predicted_gene_name: predicted gene name, i.e. a meaningful symbol such as AP2 rather than an accession such as AT2G17950
42 | ### GO_term: inferred GO terms, not necessarily up to date
43 | ### KEGG_KO: inferred KEGG KO terms, not necessarily up to date
44 | ### BiGG_Reactions: predicted BiGG metabolic reactions
45 | ### Annotation_tax_scope: taxonomic scope applied when annotating this sequence
46 | ### Matching_OGs: matching eggNOG Orthologous Groups
47 | ### best_OG|evalue|score: best matching OG (HMM mode only)
48 | ### COG functional categories: COG functional categories inferred from the best matching OG
49 | ### eggNOG_HMM_model_annotation: eggNOG functional description inferred from the best matching OG
50 |
-------------------------------------------------------------------------------- /transcriptome/Enrich/enrich.r: --------------------------------------------------------------------------------
1 | library(clusterProfiler)
2 | library(org.Trubripes.eg.db)
3 | library(ggplot2)
4 | library(enrichplot)
5 | library(stringr)
6 | args <- commandArgs(T)
7 | dir.create(args[1])
8 | data <- read.table(args[2],header=F)
9 | setwd(args[1])
10 | genes <- as.character(data$V1)
11 | ego <- enrichGO(gene = genes, # list of entrez gene id
12 | OrgDb = org.Trubripes.eg.db, # background: the org package of the species being analysed
13 | keyType = 'GID',
14 | ont = "ALL", # "BP", "MF", "CC" or "ALL": the three GO sub-ontologies.
15 | pAdjustMethod = "BH", # multiple-testing correction: "holm", "hochberg", "hommel", "bonferroni", "BY", "fdr"
16 | pvalueCutoff = 0.05, # enrichment p-value cutoff; default 0.05, use 0.01 to be stricter
17 | qvalueCutoff = 0.2) # enrichment q-value cutoff; default 0.2, use 0.05 to be stricter
18 | go.res <- data.frame(ego)
19 |
20 | goBP <- subset(go.res,subset = (ONTOLOGY == "BP"))[1:15,]
21 | goCC <- subset(go.res,subset = (ONTOLOGY == "CC"))[1:10,]
22 | goMF <- subset(go.res,subset = (ONTOLOGY == "MF"))[1:10,]
23 | go.df <- rbind(goBP,goCC,goMF)
24 |
25 | # keep the plotted GO terms in the same order as the input
26 | go.df$Description <- factor(go.df$Description,levels = rev(go.df$Description))
27 | go_bar <- ggplot(data = go.df, aes(x = Description, y = -log10(pvalue),fill = ONTOLOGY)) +
28 | geom_bar(stat = "identity",width = 0.9) + coord_flip() + theme_bw() +
29 | scale_x_discrete(labels = function(x) str_wrap(x,width = 50)) +
30 | labs(x = "GO terms",y = "-log10(pvalue)",title = "Barplot of Enriched GO Terms") +
31 | theme(axis.title = element_text(size = 13),axis.text = element_text(size = 11),plot.title = element_text(size = 14,hjust = 0.5,face = "bold"),legend.title = element_text(size = 13),legend.text = element_text(size = 11),plot.margin = unit(c(0.5,0.5,0.5,0.5),"cm"))
32 |
33 | go_bar2 <- ggplot(data = go.df, aes(x = Description, y = Count,fill = ONTOLOGY)) +
34 | geom_bar(stat = "identity",width = 0.9)+
35 | coord_flip()+theme_bw()+ # flip the axes and drop the grey background
36 | scale_x_discrete(labels = function(x) str_wrap(x,width = 50))+ # wrap over-long term names
37 | labs(x = "GO terms",y = "Gene number",title = "Barplot of Enriched GO Terms")+ # axis titles and plot title
38 | theme(axis.title = element_text(size = 13), # axis title size
39 | axis.text = element_text(size = 11), # axis label size
40 | plot.title = element_text(size = 14,hjust = 0.5,face = "bold"), # title settings
41 | legend.title = element_text(size = 13), # legend title size
42 | legend.text = element_text(size = 11), # legend label size
43 | plot.margin = unit(c(0.5,0.5,0.5,0.5),"cm")) # plot margins
44 |
45 | pdf("GO_Barplot.pdf",width = 10,height = 10)
46 | go_bar
47 | go_bar2
48 | dev.off()
49 | write.table(go.df,"allGO_gene.xls",sep="\t",quote=F)
50 |
51 | pdf("GO_treeplot.pdf",width=15,height=10)
52 | edox2 <- pairwise_termsim(ego) ###top30BP
53 | treeplot(edox2)
54 | dev.off()
55 | write.table(as.data.frame(edox2)[1:30,],"top30BP_gene.xls",sep="\t",quote=F)
56 |
57 | kegg_anno <- read.table(args[3], sep="\t", header=T)
58 | kegg2gene <- kegg_anno[, c(2, 1)]
59 | kegg2name <- kegg_anno[, c(2, 3)]
60 | kegg <- enricher(genes, TERM2GENE = kegg2gene, TERM2NAME = kegg2name, pAdjustMethod = "BH", pvalueCutoff = 0.05, qvalueCutoff = 0.2)
61 |
62 | pdf("KEGG_Barplot.pdf")
63 | barplot(kegg, showCategory=20, title="Enrichment_KEGG")
64 | dev.off()
65 | write.table(kegg[1:30,],"top30KEGG_gene.xls",sep="\t",quote=F)
66 | pdf("KEGG_treeplot.pdf",width=15,height=10)
67 | edox2 <- pairwise_termsim(kegg) ###top30KEGG
68 | treeplot(edox2)
69 | dev.off()
70 |
-------------------------------------------------------------------------------- /python/msa_4d/readme.md: --------------------------------------------------------------------------------
1 | ## Main features
2 | - Supports three mainstream multiple sequence alignment tools: PRANK, MUSCLE and MAFFT
3 | - Protein-guided, codon-aware DNA alignment
4 | - Automatic extraction of 4D sites (fourfold degenerate sites)
5 | - Multiple strategies for duplicate species: alignment quality, longest sequence, first sequence, etc.
6 | - Multiple strategies for missing data: gap filling, species exclusion, gene exclusion
7 | - Automatic translation of CDS sequences into protein sequences
8 | - Optional trimming of alignments with TrimAl
9 | - Parallel processing for efficiency
10 | - Detailed logging and statistics output
11 |
12 | ## Full parameter list
13 | - Required parameters:
14 | --input_dir DIR directory containing the CDS files (default: ".")
15 |
16 | --output_dir DIR output directory (default: "./output")
17 |
18 | --aligner {prank,muscle,mafft} alignment tool to use (default: "prank")
19 |
20 | --prank_path PATH absolute path to the PRANK executable (required when prank is selected)
21 |
22 | --muscle_path PATH absolute path to the MUSCLE executable (required when muscle is selected)
23 |
24 | --mafft_path PATH absolute path to the MAFFT executable (required when mafft is selected)
25 |
26 | - Common options:
27 | --supergene_output FILE supergene output file name (default: "supergene_4d.fasta")
28 |
29 | --threads N number of parallel threads (default: 4)
30 |
31 | --no_codon_aware disable codon-aware alignment (enabled by default)
32 |
33 | --duplicate_strategy {longest,first,rename,alignment_quality} strategy for handling duplicate species (default: alignment_quality)
34 |
35 | --skip_existing skip processing if the alignment file already exists
36 |
37 | --min_coverage_pct N minimum percentage of genes a species must be present in (default: 50.0%)
38 |
39 | --log_level {DEBUG,INFO,WARNING,ERROR}
40 | set the logging level (default: INFO)
41 |
42 | - TrimAl options:
43 | --use_trimal trim the protein alignments with TrimAl
44 |
45 | --trimal_path PATH absolute path to the TrimAl executable
46 |
47 | --trimal_automated use TrimAl's automated trimming method (default: True)
48 |
49 | --gap_threshold N TrimAl minimum gap threshold
50 |
51 | --consistency_threshold N TrimAl consistency threshold
52 |
53 | --conservation_threshold N TrimAl conservation threshold
54 |
55 | --trim_supergene apply TrimAl to the final protein supergene
56 |
57 | - Advanced options:
58 | --f N PRANK insertion opening probability (default: 0.2)
59 |
60 | --gaprate N PRANK gap opening rate
61 |
62 | --gapext N PRANK gap extension probability
63 |
64 | --use_logs use logarithmic computation in PRANK (for large datasets)
65 |
66 | --penalize_terminal_gaps
67 | penalize terminal gaps normally in PRANK
68 |
69 | --clean_temp clean up temporary files after processing (default: True)
70 |
71 | --create_protein_msa create multiple sequence alignments of the protein sequences
72 |
73 | ## Main result files
74 | 4d_sites/supergene_4d_*.fasta: 4D-site supergene, for phylogenetic analysis
75 |
76 | full_cds/supergene_full_*.fasta: supergene of the full CDS sequences
77 |
78 | proteins/supergene_protein_*.fasta: supergene of the translated protein sequences
79 |
80 | stats/species_coverage_matrix_*.tsv: species/gene coverage matrix showing each species' presence in every gene
81 |
82 | ## Output
83 | output_dir/
84 | ├── 4d_sites/ # 4D-site sequences
85 | │ ├── supergene_4d_gaps.fasta # supergene built with the gap-filling strategy
86 | │ ├── supergene_4d_exclude_species.fasta # supergene built with the species-exclusion strategy
87 | │ └── supergene_4d_exclude_genes.fasta # supergene built with the gene-exclusion strategy
88 | ├── alignments/ # per-gene alignment results
89 | │ ├── gene1.best.fas
90 | │ ├── gene2.best.fas
91 | │ └── ... 
92 | ├── full_cds/ # supergenes of the full CDS sequences
93 | │ ├── supergene_full_gaps.fasta
94 | │ ├── supergene_full_exclude_species.fasta
95 | │ └── supergene_full_exclude_genes.fasta
96 | ├── proteins/ # translated protein sequences
97 | │ ├── supergene_protein_gaps.fasta
98 | │ ├── supergene_protein_exclude_species.fasta
99 | │ └── supergene_protein_exclude_genes.fasta
100 | ├── protein_msa/ # protein MSA results (with --create_protein_msa; this option is deprecated, the results are already in proteins/)
101 | │ ├── gene1_protein_msa.fasta
102 | │ ├── gene2_protein_msa.fasta
103 | │ ├── supergene_protein_msa_gaps.fasta
104 | │ └── ...
105 | ├── stats/ # statistics
106 | │ ├── species_coverage_matrix_gaps.tsv
107 | │ ├── species_coverage_matrix_exclude_species.tsv
108 | │ └── species_coverage_matrix_exclude_genes.tsv
109 | └── temp/ # temporary files (may be deleted after processing)
110 |
-------------------------------------------------------------------------------- /picture/GC_depth/depth_gc.r: --------------------------------------------------------------------------------
1 | library(ggplot2)
2 | library(grid)
3 |
4 | # read the input file
5 | depth_gc <- read.delim('Depth_GC.txt')
6 | #seq start end Depth GC
7 | #chr1 0 2000 4.6190000 0.491000
8 | #chr1 2000 4000 15.8795004 0.502000
9 | #chr1 4000 6000 3.4749999 0.501500
10 | #chr1 6000 8000 3.5050001 0.501500
11 |
12 | # median GC content (as a percentage)
13 | depth_gc$GC <- 100 * depth_gc$GC
14 | GC_median <- round(median(depth_gc$GC), 2)
15 |
16 | # median sequencing depth
17 | depth_median <- round(median(depth_gc$Depth), 2)
18 |
19 | # to avoid extreme depth values caused by NGS duplication, drop windows deeper than 3x the median depth
20 | depth_gc <- subset(depth_gc, Depth <= 3 * depth_median)
21 |
22 | # depth vs GC scatter-density plot
23 | depth_GC <- ggplot(depth_gc, aes(GC, Depth)) +
24 | geom_point(color = 'gray', alpha = 0.6, pch = 19, size = 0.5) +
25 | # geom_vline(xintercept = GC_median, color = 'red', lty = 2, lwd = 0.5) +
26 | # geom_hline(yintercept = depth_median, color = 'red', lty = 2, lwd = 0.5) +
27 | stat_density_2d(aes(fill = ..density.., alpha = ..density..), geom = 'tile', contour = FALSE, n = 500) +
28 | scale_fill_gradientn(colors = c('transparent', 'gray', 'blue', 'red')) +
29 | theme(panel.grid.major = element_line(color = 'gray', linetype = 2, size = 0.25), panel.background = element_rect(color = 'black', fill = 'transparent')) +
30 | labs(x = paste('GC % (Median :', GC_median, '%)'), y = paste('Depth (Median :', depth_median, 'X)')) +
31 | theme(axis.text = element_text(size = 8), axis.title = element_text(size = 12)) +
32 | theme(legend.position = 'none')
33 |
34 | # depth frequency histogram
35 | depth_hist <- ggplot(depth_gc, aes(Depth)) +
36 | geom_histogram(binwidth = (max(depth_gc$Depth) - min(depth_gc$Depth))/100, fill = 'gray', color = 'gray40', size = 0.1) +
37 | geom_rug(color = 'gray', alpha = 0.6) +
38 | theme(panel.grid.major = element_line(color = 'gray', linetype = 2, size = 0.25), panel.background = element_rect(color = 'black', fill = 'transparent')) +
39 | theme(axis.line = element_line(color = 'black', size = 0.3), axis.text = element_text(size = 8), axis.title = element_text(size = 12)) +
40 | labs(x = '', y = 'Numbers') +
41 | coord_flip()
42 | # geom_vline(xintercept = depth_median, color = 'red', lty = 2, lwd = 0.5)
43 |
44 | # GC content frequency histogram
45 | GC_hist <- ggplot(depth_gc, aes(GC)) +
46 | geom_histogram(binwidth = (max(depth_gc$GC) - min(depth_gc$GC))/100, fill = 'gray', color = 'gray40', size = 0.1) +
47 | geom_rug(color = 'gray', alpha = 0.6) +
48 | theme(panel.grid.major = element_line(color = 'gray', linetype = 2, size = 0.25), panel.background = element_rect(color = 'black', fill = 'transparent')) +
49 | theme(axis.line = element_line(color = 'black', size = 0.3), axis.text = element_text(size = 10), axis.title = element_text(size = 12)) +
50 | labs(x = '', y = 'Numbers')
51 | # geom_vline(xintercept = GC_median, color = 'red', lty = 2, lwd = 0.5)
52 |
53 | # combine the panels and write output
54 | #pdf('GC_Depth.pdf', width = 8, height = 8)
55 | # grid.newpage()
56 | # pushViewport(viewport(layout = grid.layout(3, 3)))
57 | # print(depth_GC, vp = viewport(layout.pos.row = 2:3, layout.pos.col = 1:2))
58 | # print(GC_hist, vp = viewport(layout.pos.row = 1, layout.pos.col = 1:2))
59 | # print(depth_hist, vp = viewport(layout.pos.row = 2:3, layout.pos.col = 3))
60 | #dev.off()
61 |
62 | png('GC_Depth.png', width = 4000, height = 4000, res = 600, units = 'px')
63 | grid.newpage()
64 | pushViewport(viewport(layout = grid.layout(3, 3)))
65 | print(depth_GC, vp = viewport(layout.pos.row = 2:3, layout.pos.col = 1:2))
66 | print(GC_hist, vp = viewport(layout.pos.row = 1, layout.pos.col = 1:2))
67 | print(depth_hist, vp = viewport(layout.pos.row = 2:3, layout.pos.col = 3))
68 | dev.off()
69 |
-------------------------------------------------------------------------------- /genome/Anno_EGAPx/deal_egapx.sh: --------------------------------------------------------------------------------
1 | #!/bin/bash
2 | set +o posix
3 |
4 | species=C_nasu
5 | egapx_gff=complete.genomic.gff
6 | pep=complete.proteins.faa
7 | cds=complete.cds.fna
8 |
9 | ## deal feature
10 | [ -d stat_feature ] || mkdir stat_feature
11 | grep "pseudo=true" ${egapx_gff} | awk '$3=="pseudogene"' | cut -f 1,4,5,7,9 | sed 's/;/\t/g' | sed 's/ID=gene-//1;s/Name=//1;s/description=//1' | cut -f 1-5,7 > stat_feature/pseudogene.txt
12 | grep "gene_biotype=lncRNA" ${egapx_gff} | cut -f 1,4,5,7,9 | sed 's/;/\t/g' | cut -f 1-5 | sed 's/ID=gene-//g' > stat_feature/lncRNA.txt
13 | grep "gene_biotype=V_segment" ${egapx_gff} | cut -f 1,4,5,7,9 | sed 's/;/\t/g' | sed 's/ID=gene-//1;s/Name=//1;s/description=//1' | cut -f 1-5,7 > stat_feature/V_segment.txt
14 | grep "gene_biotype=C_region" ${egapx_gff} | cut -f 1,4,5,7,9 | sed 's/;/\t/g' | sed 's/ID=gene-//1;s/Name=//1;s/description=//1' | cut -f 1-5,7 > stat_feature/C_region.txt
15 | grep misc_RNA ${egapx_gff} | awk '$3=="transcript"' | cut -f 1,4,5,7,9 | sed 's/;/\t/g' | sed 's/ID=//1' > stat_feature/misc.txt
16 | cat stat_feature/*txt | cut -f 5 | awk '!a[$0]++' | sed '/^\s*$/d' | grep -v -f - ${egapx_gff} | awk '$3=="gene"' | cut -f 1,4,5,7,9 | sed 's/;/\t/g' | awk -F "\t" '{if ($7~/description/) print $1"\t"$2"\t"$3"\t"$4"\t"$5"\t"$6"\t"$7; else if ($8~/description/) print $1"\t"$2"\t"$3"\t"$4"\t"$5"\t"$7"\t"$8; else print $1"\t"$2"\t"$3"\t"$4"\t"$5"\t"$5"\tuncharacterized protein" }' | sed 's/ID=gene-//g;s/Name=//g;s/description=//g' > stat_feature/protein.txt
17 |
18 | ## get longest-isoform gff file (UTRs retained)
19 | cut -f 5 stat_feature/protein.txt | grep -f - ${egapx_gff} | grep -v -e "pseudo=true" -e "gene_biotype=lncRNA" -e "gene_biotype=V_segment" -e "gene_biotype=C_region" -e "gbkey=ncRNA" -e "misc_RNA" > pep.gff
20 | /usr/bin/singularity run --bind $PWD:$PWD /dellfsqd2/ST_OCEAN/USER/lishuo11/01_soft/singularity_all/Agata.sif agat_sp_keep_longest_isoform.pl -gff pep.gff -o ${species}.UTR.gff
21 | rm pep.gff pep.agat.log
22 |
23 | ## get simple gff cds pep
24 | toBGI.py ${species}.UTR.gff > bgi.gff
25 | sed 's/|/\t/g' bgi.gff | awk '{print $NF"\t"$0}' | sed 's/-R/-P/1' | awk '{if ($4=="mRNA") print $2"\t"$3"\t"$4"\t"$5"\t"$6"\t"$7"\t"$8"\t"$9"\tID="$1; else print $2"\t"$3"\t"$4"\t"$5"\t"$6"\t"$7"\t"$8"\t"$9"\tParent="$1}' > bgi.rename.gff 
26 | fix_mRNA_coordinate.pl bgi.rename.gff bgi.fix.gff
27 | fix_phase.py bgi.fix.gff
28 | nr_gff.pl --direction F mrna_bgi.fix.gff
29 | mv mrna_bgi.fix.gff.nr.gff ${species}.bgi.gff
30 | awk '{print $1}' ${pep} | sed 's/|/\t/g' | awk '{if ($1~/>/) print ">"$NF; else print $0}' > ${pep}.deal
31 | seqkit seq -w 0 ${cds} | sed 's/protein_id=/\t/g' | awk -F "\t" '{if ($1~/>/) print ">"$2; else print $0}' | sed 's/\]/\t/g' | awk '{print $1}' > ${cds}.deal
32 | awk '$3=="mRNA"' ${species}.bgi.gff | cut -f 9 | sed 's/ID=//g;s/;//g' | seqtk subseq ${pep}.deal - | awk '{print $1}' > ${species}.bgi.gff.pep
33 | awk '$3=="mRNA"' ${species}.bgi.gff | cut -f 9 | sed 's/ID=//g;s/;//g' | seqtk subseq ${cds}.deal - | awk '{print $1}' > ${species}.bgi.gff.cds
34 | rm bgi.gff bgi.fix.gff mrna_bgi.fix.gff mrna_bgi.fix.gff.cluster mrna_bgi.fix.gff.uncluster ${cds}.deal ${pep}.deal bgi.rename.gff
35 |
36 | ## get anno from egapx
37 | grep ">" ${pep} | sed 's/|/\t/g' | cut -f 3- | sed 's/ /\t/1' > stat_feature/egapx.anno.txt
38 | diamond makedb --in ${pep} -d complete.proteins
39 | diamond blastp --db complete.proteins --query ${species}.bgi.gff.pep --out result.txt --outfmt 6 qseqid qstart qend sseqid sstart send qlen slen length pident evalue --more-sensitive --max-target-seqs 5 --evalue 1e-3 --id 50 --tmpdir ./ --threads 10
40 | awk '$10=="100"' result.txt | awk '!a[$1]++{print $0}' | awk '{print $4"\t"$1}' | sed 's/|/\t/g' | awk '{print $(NF-1)"\t"$NF}' | awk 'NR==FNR{a[$1]=$0}NR!=FNR{print $0"\t"a[$1]}' stat_feature/egapx.anno.txt - | cut -f 2,4 > egapx.anno.tsv
41 | rm result.txt complete.proteins.dmnd
42 |
-------------------------------------------------------------------------------- /deal_fasta/fa2phy/fa2phy.v2.py: --------------------------------------------------------------------------------
1 | #!/usr/bin/env python3
2 |
3 | """
4 | FASTA to PHYLIP format conversion tool - performance-optimised version
5 | Handles large sequence alignment files, Python 3 version
6 | """
7 |
8 | import sys
9 | import os
10 | import argparse
11 | from datetime import datetime
12 |
13 |
14 | # brief usage message
15 | USAGE = """
16 | FASTA to PHYLIP format conversion tool
17 |
18 | Usage: fa2phy.py [input FASTA] [output PHYLIP]
19 |
20 | Example: fa2phy.py input.fasta output.phy
21 | """
22 |
23 | def msg(*args, **kwargs):
24 | """Print a message to stderr."""
25 | print(*args, file=sys.stderr, **kwargs)
26 |
27 | def err(*args, **kwargs):
28 | """Print an error to stderr and exit."""
29 | msg("ERROR:", *args, **kwargs)
30 | sys.exit(1)
31 |
32 | def parse_fasta_efficient(filename):
33 | """
34 | Parse a FASTA file efficiently,
35 | iterating over the file to avoid loading it all into memory
36 | """
37 | sequence_dict = {}
38 | sequence_list = [] # preserve sequence order
39 | current_id = None
40 |
41 | try:
42 | with open(filename, 'r') as fh:
43 | seq_parts = [] # collect sequence fragments
44 |
45 | for line in fh:
46 | line = line.rstrip()
47 | if not line: # skip blank lines
48 | continue
49 |
50 | if line[0] == '>':
51 | # finish the previous sequence
52 | if current_id:
53 | sequence_dict[current_id] = ''.join(seq_parts)
54 |
55 | # extract the new sequence ID
56 | header = line[1:].strip()
57 | current_id = header.split()[0]
58 | sequence_list.append(current_id)
59 | seq_parts = [] # reset the fragment list
60 | else:
61 | seq_parts.append(line)
62 |
63 | # handle the last sequence
64 | if current_id and seq_parts:
65 | sequence_dict[current_id] = ''.join(seq_parts)
66 |
67 | return sequence_dict, sequence_list
68 |
69 | except Exception as e:
70 | err(f"Error while parsing the FASTA file: {e}")
71 |
72 | def write_phylip(sequence_dict, sequence_list, outfile):
73 | """
74 | Write the PHYLIP file efficiently
75 | """
76 | # check the alignment length
77 | alignment_length = 0
78 | for gene in sequence_dict:
79 | if alignment_length == 0:
80 | alignment_length = len(sequence_dict[gene]) 
81 | elif len(sequence_dict[gene]) != alignment_length:
82 | err(f"Alignment length error: the length of {gene} ({len(sequence_dict[gene])}) differs from the other sequences ({alignment_length})")
83 |
84 | # find the longest sequence ID
85 | if sequence_list:
86 | longest_id_len = max(len(id) for id in sequence_list)
87 | else:
88 | err("No valid sequences found")
89 |
90 | # write the PHYLIP file
91 | try:
92 | with open(outfile, "w") as phyfile:
93 | # write the sequence count and alignment length
94 | phyfile.write(f"{len(sequence_dict)} {alignment_length}\n")
95 |
96 | # write the sequences
97 | for gene in sequence_list:
98 | phyfile.write(f"{gene.ljust(longest_id_len)} {sequence_dict[gene]}\n")
99 |
100 | msg(f"PHYLIP file written: {outfile}")
101 | msg(f" Number of sequences: {len(sequence_dict)}")
102 | msg(f" Alignment length: {alignment_length}")
103 |
104 | except Exception as e:
105 | err(f"Error while writing the PHYLIP file: {e}")
106 |
107 | def main():
108 | """Main entry point."""
109 | # handle command-line arguments
110 | if len(sys.argv) != 3:
111 | print(USAGE)
112 | sys.exit(0)
113 |
114 | fasta_file = sys.argv[1]
115 | phylip_file = sys.argv[2]
116 |
117 | # validate the input file
118 | if not os.path.isfile(fasta_file):
119 | err(f"Input file not found: {fasta_file}")
120 |
121 | if os.path.exists(phylip_file):
122 | msg(f"Warning: output file {phylip_file} already exists and will be overwritten")
123 |
124 | # parse the FASTA file
125 | sequence_dict, sequence_list = parse_fasta_efficient(fasta_file)
126 |
127 | # write the PHYLIP file
128 | write_phylip(sequence_dict, sequence_list, phylip_file)
129 |
130 | return 0
131 |
132 | if __name__ == "__main__":
133 | sys.exit(main())
134 |
-------------------------------------------------------------------------------- /Comparative_genomics/Domain_predict/rpsblast.sh: --------------------------------------------------------------------------------
1 | pep=all.deal.pep
2 | cpu=38
3 | evalue=0.01
4 | database_dir=/06_database/CDD_db
5 |
6 | ### for i in {CdD,Tigr,Prk,Pfam,Kog,Cog,Cdd_NCBI,ncbi.cdd}; do grep -e "pep=" -e "cpu=" -e "evalue=" -e "database_dir=" -e ${i} rpsblast.sh > rpsblast.${i}.sh; done
7 |
8 | rpsblast -query ${pep} -outfmt 6 -evalue ${evalue} -db ${database_dir}/Cdd -out CdD_${pep}.txt -num_threads ${cpu}
9 | rpsblast -query ${pep} -outfmt 6 -evalue ${evalue} -db ${database_dir}/Tigr -out Tigr_${pep}.txt -num_threads ${cpu}
10 | rpsblast -query ${pep} -outfmt 6 -evalue ${evalue} -db ${database_dir}/Prk -out Prk_${pep}.txt -num_threads ${cpu}
11 | rpsblast -query ${pep} -outfmt 6 -evalue ${evalue} -db ${database_dir}/Pfam -out Pfam_${pep}.txt -num_threads ${cpu}
12 | rpsblast -query ${pep} -outfmt 6 -evalue ${evalue} -db ${database_dir}/Kog -out Kog_${pep}.txt -num_threads ${cpu}
13 | rpsblast -query ${pep} -outfmt 6 -evalue ${evalue} -db ${database_dir}/Cog -out Cog_${pep}.txt -num_threads ${cpu}
14 | rpsblast -query ${pep} -outfmt 6 -evalue ${evalue} -db ${database_dir}/Cdd_NCBI -out Cdd_NCBI_${pep}.txt -num_threads ${cpu}
15 | rpsblast -query ${pep} -outfmt 6 -evalue ${evalue} -db ${database_dir}/ncbi.cdd -out ncbi.cdd_${pep}.txt -num_threads ${cpu}
16 |
17 | cat CdD_${pep}.txt | awk '{if ($7<$8) print $1"\t"$7"\t"$8"\t"$0; else print $1"\t"$8"\t"$7"\t"$0}' | sort -k 1,1 -k 2n,2 | bedtools cluster -i - | sort -k 16n,16 -k 15nr,15 | awk '!a[$16]++{print $0}' | csvtk cut -t -f 5,1,2,3 | sed 's/CDD://g' | awk 'NR==FNR{a[$1]=$0}NR!=FNR{print $0"\t"a[$1]}' ${database_dir}/cddid_all.tbl - | cut -f 2-4,6- > Cdd.dom
18 | cat Tigr*txt | awk '{if ($7<$8) print $1"\t"$7"\t"$8"\t"$0; else print $1"\t"$8"\t"$7"\t"$0}' | sort -k 1,1 -k 2n,2 | bedtools cluster -i - | sort -k 16n,16 -k 15nr,15 | awk '!a[$16]++{print $0}' | csvtk cut -t -f 5,1,2,3 | sed 's/CDD://g' | awk 'NR==FNR{a[$1]=$0}NR!=FNR{print $0"\t"a[$1]}' 
${database_dir}/cddid_all.tbl - | cut -f 2-4,6- > Tigr.dom 19 | cat Prk*txt | awk '{if ($7<$8) print $1"\t"$7"\t"$8"\t"$0; else print $1"\t"$8"\t"$7"\t"$0}' | sort -k 1,1 -k 2n,2 | bedtools cluster -i - | sort -k 16n,16 -k 15nr,15 | awk '!a[$16]++{print $0}' | csvtk cut -t -f 5,1,2,3 | sed 's/CDD://g' | awk 'NR==FNR{a[$1]=$0}NR!=FNR{print $0"\t"a[$1]}' ${database_dir}/cddid_all.tbl - | cut -f 2-4,6- > Prk.dom 20 | cat Pfam*txt | awk '{if ($7<$8) print $1"\t"$7"\t"$8"\t"$0; else print $1"\t"$8"\t"$7"\t"$0}' | sort -k 1,1 -k 2n,2 | bedtools cluster -i - | sort -k 16n,16 -k 15nr,15 | awk '!a[$16]++{print $0}' | csvtk cut -t -f 5,1,2,3 | sed 's/CDD://g' | awk 'NR==FNR{a[$1]=$0}NR!=FNR{print $0"\t"a[$1]}' ${database_dir}/cddid_all.tbl - | cut -f 2-4,6- > Pfam.dom 21 | cat Kog*txt | awk '{if ($7<$8) print $1"\t"$7"\t"$8"\t"$0; else print $1"\t"$8"\t"$7"\t"$0}' | sort -k 1,1 -k 2n,2 | bedtools cluster -i - | sort -k 16n,16 -k 15nr,15 | awk '!a[$16]++{print $0}' | csvtk cut -t -f 5,1,2,3 | sed 's/CDD://g' | awk 'NR==FNR{a[$1]=$0}NR!=FNR{print $0"\t"a[$1]}' ${database_dir}/cddid_all.tbl - | cut -f 2-4,6- > Kog.dom 22 | cat Cog*txt | awk '{if ($7<$8) print $1"\t"$7"\t"$8"\t"$0; else print $1"\t"$8"\t"$7"\t"$0}' | sort -k 1,1 -k 2n,2 | bedtools cluster -i - | sort -k 16n,16 -k 15nr,15 | awk '!a[$16]++{print $0}' | csvtk cut -t -f 5,1,2,3 | sed 's/CDD://g' | awk 'NR==FNR{a[$1]=$0}NR!=FNR{print $0"\t"a[$1]}' ${database_dir}/cddid_all.tbl - | cut -f 2-4,6- > Cog.dom 23 | cat Cdd_NCBI*txt | awk '{if ($7<$8) print $1"\t"$7"\t"$8"\t"$0; else print $1"\t"$8"\t"$7"\t"$0}' | sort -k 1,1 -k 2n,2 | bedtools cluster -i - | sort -k 16n,16 -k 15nr,15 | awk '!a[$16]++{print $0}' | csvtk cut -t -f 5,1,2,3 | sed 's/CDD://g' | awk 'NR==FNR{a[$1]=$0}NR!=FNR{print $0"\t"a[$1]}' ${database_dir}/cddid_all.tbl - | cut -f 2-4,6- > Cdd_NCBI.dom 24 | cat ncbi.cdd*txt | awk '{if ($7<$8) print $1"\t"$7"\t"$8"\t"$0; else print $1"\t"$8"\t"$7"\t"$0}' | sort -k 1,1 -k 2n,2 | bedtools cluster -i - | sort -k 16n,16 -k 15nr,15 | awk '!a[$16]++{print $0}' | csvtk cut -t -f 5,1,2,3 | sed 's/CDD://g' | awk 'NR==FNR{a[$1]=$0}NR!=FNR{print $0"\t"a[$1]}' ${database_dir}/cddid_all.tbl - | cut -f 2-4,6- > ncbi.cdd.dom 25 | --------------------------------------------------------------------------------
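The one-liners above all perform the same reduction per database: orient each hit's query coordinates (columns 7/8 of outfmt 6), let bedtools cluster group overlapping hits on the same protein (the cluster id becomes column 16 of the 3+12-column rows), keep only the top bit-score hit (column 15) per cluster, and join the CDD accession to its description in cddid_all.tbl. Below is a rough pure-Python sketch of the cluster-and-keep-best step; the input file name is hypothetical, and bedtools' overlap semantics are only approximated by merging sorted, overlapping intervals.

import csv
from itertools import groupby

def best_per_cluster(rows):
    """rows: rpsblast outfmt-6 hit rows (lists of strings) for one query."""
    spans = []
    for r in rows:
        qs, qe = sorted((int(r[6]), int(r[7])))   # orient qstart/qend (columns 7/8)
        spans.append((qs, qe, float(r[11]), r))   # column 12 is the bit score
    spans.sort(key=lambda t: t[:2])               # like sort -k 1,1 -k 2n,2
    clusters, cur_end = [], None
    for qs, qe, score, r in spans:
        if cur_end is None or qs > cur_end:       # no overlap -> start a new cluster
            clusters.append([])
            cur_end = qe
        else:
            cur_end = max(cur_end, qe)
        clusters[-1].append((score, r))
    # keep the highest-bitscore hit per cluster (the awk '!a[$16]++' after sorting)
    return [max(c, key=lambda t: t[0])[1] for c in clusters]

with open("CdD_all.deal.pep.txt") as fh:          # hypothetical rpsblast -outfmt 6 output
    hits = sorted(csv.reader(fh, delimiter="\t"), key=lambda r: r[0])
    for _, rows in groupby(hits, key=lambda r: r[0]):
        for best in best_per_cluster(list(rows)):
            print("\t".join(best))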