├── .gitignore ├── ColLink.pl ├── GBS ├── ext_rd_withAPEKI.pl └── sep_rd_by_keyFile.pl ├── LICENSE ├── MyPM ├── ConfigSunhh.pm ├── LogInforSunhh.pm ├── ReadInAlnSunhh.pm ├── ReadInSeqSunhh.pm ├── SNP_tbl.pm ├── SNP_tbl.pm_bk ├── SeqAlnSunhh.pm ├── fastaSunhh.pm ├── fileSunhh.pm ├── fromBraker.pm ├── gffSunhh.pm ├── mathSunhh.pm ├── mcsSunhh.pm ├── plotSunhh.pm └── wm97Sunhh.pm ├── README.md ├── annot_tools ├── PASA │ ├── Launch_PASA_pipeline.pl.edit │ └── build_comprehensive_transcriptome.dbi ├── add_tag_to_gffID.pl ├── ahrd │ ├── annot.cfg │ ├── cmd_list │ ├── generate_ahrd_yml.sh │ ├── generate_ahrd_yml_fromCDS.sh │ ├── pipe_for_functional_annotation.pl │ └── trim_orphan_right_brack.pl ├── annot_mapEst2genome.sh ├── annot_mapProt2genome.sh ├── annot_mapRNA2genome.sh ├── augustus.accuracy_calculator.pl ├── b2g_graph_software │ └── fmt_b2g_enrich.pl ├── blast_xml_parse.py ├── clean_pasa_med_files.pl ├── cmd_list_run_b2g ├── cnvt_infernal_tbl.pl ├── cnvt_maker2aug_gff3.pl ├── cnvt_spaln2makerAln_prot_gff3.pl ├── cnvt_uniprot_dat2fa.pl ├── copy_species.pl ├── createAugustusJoblist.pl ├── evm_tools │ ├── chk_pcTE_ahrd │ ├── cmd_list │ ├── cmd_list_finalize_gene_ID │ ├── cmd_list_old │ ├── cnvt_maker2evmProtGff3.pl │ ├── evm_weight.txt │ ├── filter_cds2Bad_bn6.pl │ ├── fit_evm_pred.pl │ ├── get_longCDS_in_gff.pl │ ├── good_desc │ ├── list.gnPref_mkPref_gnFa_gff │ ├── list.toAnnot │ ├── param.cfg │ ├── param_list │ ├── pipe_revise_byEVM.pl │ ├── run_evm_21TJS6_wiEVMpipe.sh │ ├── run_evm_21TJS6_woEVMpipe.sh │ └── run_evm_wiEVMpipe.sh ├── example_pipes │ ├── bad_prot_IDs │ ├── cmd_list_protAln_generate │ ├── cmd_list_protAln_single_1 │ ├── cmd_list_protAln_single_1add │ ├── cmd_list_transAln_generate │ └── cmd_list_transAln_single_1 ├── find_complete_prot_byBlastp.pl ├── fitGff_aug2maker.pl ├── fix_1bpLoc_by_zff2Gb.pl ├── genemark │ └── get_intron_hints_fromGff.pl ├── get_gff_byScfID.pl ├── get_maker_result.sh ├── get_sameCDSGff.pl ├── gff3_preds2models ├── gff3_select 
├── go │ ├── cnvt_GOobo_to_detail_list.pl │ ├── generate_topGO_gene2GO_list.pl │ └── mk_GAF2.0.pl ├── good_model_from_gff3.pl ├── intron2exex.pl ├── iprscan │ ├── cmd_list │ ├── cnt_TEIPRacc.pl │ ├── cnvt_iprJson2tbl.pl │ ├── converter_iprV4.pl │ ├── in.ipr.tsv │ ├── list_IPRacc.pl │ └── potential_TE_IPRacc ├── join_b2g_annot.pl ├── keep_nonRedundant_list.pl ├── kegg │ ├── 1_extract_KeggMapRes.pl │ ├── 2_join_mapIDs.pl │ ├── cnvt_keggPWYReconstruction_to_tab.pl │ ├── list1.keggMap.txt │ ├── list1.keggMap.txt.tbl.comb │ └── record ├── liftoff_tools │ ├── blk2bed.pl │ ├── blk2gff.pl │ ├── chk_only_pan.pl │ ├── cnt_R2Q_liftoff_info.pl │ ├── cnvt_genePAV_to_grpPAV.pl │ ├── cnvt_genemap_to_QlocBlk.pl │ ├── cnvt_gff_to_cdsBed.pl │ ├── filter_R2Q_liftoff_tbl.pl │ ├── fit_gff_4igv.pl │ ├── fmt_gff_trim2CDS.pl │ ├── fmt_grp_by_spec.pl │ ├── grp2single.pl │ ├── info_bedtools_intersect.pl │ ├── info_liftoffGff.pl │ ├── prepare_gff3_to_blk.pl │ ├── prepare_input.pl │ ├── psl_to_geneLoc.pl │ ├── remove_ovl_loc.pl │ ├── ret_good_model_from_liftoff_gff3.pl │ ├── retrieve_QlocSeq.pl │ ├── retrieve_QlocSeq_fromBlk.pl │ ├── retrieve_liftoff_pairs.pl │ ├── rm_head_partial_frame_inGff.pl │ ├── run_liftoff_FromTo.pl │ ├── run_liftoff_FromTo_pan.pl │ ├── run_liftoff_FromTo_pan_para.pl │ ├── simple_group_pairs.pl │ └── slct_best_psl.pl ├── maker │ ├── add_ipr.sh │ ├── ret_ipr_nonovl_genes.pl │ ├── ret_makerGff_fromAbGff.pl │ ├── ret_maker_abinit_gff3.pl │ └── rm_maker_fasta.pl ├── merge_blast_xml.pl ├── mkCmd_blast2Nr.sh ├── pasa │ ├── alignAssembly.config │ └── pair_ovlp.pl ├── pasa_gff_to_alnGff.pl ├── pipe_get_complete_models.pl ├── predictByAug_rna2genome.pl ├── protein │ ├── cnvt_spalnGff_to_infoTab.pl │ └── trimProt4spaln.pl ├── rename_by_GffJnLoc.pl ├── repAnno_tools │ ├── LTR_dist_est.mao │ ├── ProtExcluder1.1 │ │ ├── GCcontent.pl │ │ ├── Installer.pl │ │ ├── ProtExcluder.npl │ │ ├── ProtExcluder.pl │ │ ├── blastformatProt.pl │ │ ├── countaanu.pl │ │ ├── fasta-reformat.pl │ │ ├── 
getanycolumnuni.pl │ │ ├── matchtract.pl │ │ ├── mergequeryBF.pl │ │ ├── mergeunmatchedregion.pl │ │ ├── mspesl-sfetch.npl │ │ ├── mspesl-sfetch.pl │ │ ├── rmlistedseq.pl │ │ ├── rmlowcomfromBF.pl │ │ ├── rmlowcomplexitymathc.pl │ │ ├── rmshortseq_noN.pl │ │ └── unmatchedregionBF.pl │ ├── add_repClass.pl │ ├── build_Examplar_byFa.pl │ ├── build_Examplar_byFa.pl_bak │ ├── ch_gff_to_tab.pl │ ├── ch_seqID.pl │ ├── cmd_list_forFinalRepDB │ ├── cnt_ltr_dist.pl │ ├── detect_centromere │ │ ├── cnt_CLfreqInChr.pl │ │ └── get_candidate_cent_from_whole.pl │ ├── filter_RepMsk_out.pl │ ├── filter_flank.pl │ ├── filter_gff.pl │ ├── filter_tab_byPBSPPT.pl │ ├── fit_rmOut_to_makerGff.pl │ ├── get_LTR_wi_Termi.pl │ ├── lis_masked_RepMsk_out.pl │ ├── mk_outID.pl │ ├── muscle3.8.31_i86linux64 │ ├── name_from_tab.pl │ ├── path.conf │ ├── pipe_ltr85.pl │ ├── pipe_ltr99.pl │ ├── pipe_trim85.pl │ ├── pipe_trim99.pl │ ├── rmOutToGFF3_with_TEclass.pl │ ├── rm_geneFrag.pl │ ├── run_MITE.sh │ ├── run_repClass.pl │ ├── run_repClass_ltr.sh │ ├── run_repClass_mite.sh │ ├── run_repeatmasker.sh │ ├── run_rm_GF.sh │ ├── run_rpmd.sh │ └── seqID_to_scaf.pl ├── replace_blast_asn_db.pl ├── rmRedunt_inputNucl.pl ├── rmRedunt_inputProt.pl ├── rmRepGff_withBadTarget.pl ├── rmShrtExon_spaln_prot2genom.pl ├── rm_ExactDup_gene_model.pl ├── rm_ovlap_gene_model.pl ├── run_maker.sh ├── run_spaln_prot2genom.pl ├── satisfied_prot.sh ├── set_stopCodonFreq.pl ├── simplify_gff3.pl ├── slct_maker_gff3.pl ├── snap_good_wrn_by_valid.pl ├── tRNA │ ├── cnvt_trnascan_to_table.pl │ ├── cnvt_trnascanse_to_boxStat.pl │ ├── filter_trnascanSE_result.pl │ └── stat_trnaFreq.pl ├── train_augustus_goodTrain.sh ├── train_augustus_goodTrain_v2.sh ├── train_snap1.sh ├── trim_orphan_right_brack.pl └── zff2augustus_gbk.pl ├── assemble_tools ├── LAI │ ├── cmd_list │ ├── pepper_genome.fa.chrID │ ├── pepper_genome.fa.out.LAI_unlock.gz │ ├── pepper_genome.fa.out.LAI_unlock_distr_byChr.pdf │ ├── pepper_genome.fa.out.LAI_wiDefault │ ├── 
pepper_genome.fa.tbl │ ├── plot_LAI_id85.R │ ├── scrn.para_LAI.gz │ └── scrn.prepare_genometools ├── add_1bp_ctg_toAGP_jcviPlot.pl ├── add_dep_to_chopInf.pl ├── add_tag_to_fsa.pl ├── bionano │ ├── rename_xmap.pl │ └── sv_related_xmap.pl ├── brk_fas_by_FRBadWind_for15kb.pl ├── busco │ ├── cmd_list │ ├── geneCopyN_busco_full_table.pl │ ├── rm_busco_intermediate_files.pl │ └── summary_busco_full_table.pl ├── calc_ident_from_sam2pairwise.pl ├── classify_tools │ ├── classify_region_byBn6.pl │ ├── cmd_list │ ├── cnt_In_bp.pl │ ├── get_Ex_region.pl │ ├── line_bn6_query.pl │ ├── recog_organelle_rDNA_from_classJn.pl │ ├── run_seg_blastn.pl │ └── temporary_prepare_rmCont.pl ├── clip_scaf_end.pl ├── cmd_batch_for_mugsy ├── cnvt_loc_fromAGP_toAGP_forLoci.pl ├── cnvt_loci_97_v1_to_vRILs.pl ├── cnvt_quast_unaligned_info_to_tbl.pl ├── ctgBn6_to_scfCov.pl ├── deduplicate_ncpu.pl ├── draw_aln_from_bam.pl ├── estimate_gap_size.pl ├── extract_ctg_from_scaf.pl ├── fill_SingleNgap.pl ├── filter_dropGap.pl ├── format_maf_forMugsy.pl ├── funcs_for_compare_genome.r ├── get_frag_cov.pl ├── get_paired_maf.pl ├── get_rep_loc_fromPileup.pl ├── good_link_fromMaf.pl ├── grp_maf_byLinkage.pl ├── hifi_hic │ ├── cnvt_gfa2fa.pl │ ├── cnvt_num2tigID.pl │ ├── convert_3ddna_assembly_to_agp.pl │ └── get_HiCanu_ctg.pl ├── high_tandem_repeat │ └── count_kmer_distr.pl ├── hist_plot_ins.R ├── join_link.pl ├── kmer │ ├── cmd_list │ ├── cnvt_qualFa_to_wind_avgTbl.pl │ ├── extract_genomescope_summary.pl │ ├── get_kmer_by_seq.pl │ ├── get_kmer_by_seq_summary.pl │ └── get_seq_by_kmer.pl ├── lift_bed_jcviPlot.pl ├── link_scf2chr.pl ├── link_seq_by_agp.pl ├── list_run_last2scaf.pl ├── loc_repeat.pl ├── maf2fasta.pl ├── map_ctg_to_scf │ ├── cmd_list │ ├── simple_fill.pl │ └── stat_ragtag_agp.pl ├── mk_bed_from_agp.pl ├── mk_hapmap_from_SNPtbl.pl ├── mk_wind_for_INScnt.pl ├── order_scf_1.pl ├── pacBio_tools │ └── infor_PBjelly_assembly_chunk_err.pl ├── plot_kmer_prop_along_chr.r ├── resize_N_in_agp.pl ├── 
rmRed_byIdentCov_InCtg.pl ├── rm_redundant_loci.pl ├── run_last_to_scaffold.sh ├── run_mugsy_MP.pl ├── scf2LG_to_AGP.pl └── slct_pe.pl ├── bp0_2_bp6.pl ├── calc_est_in_psl.pl ├── cmd_ctrl ├── kill_by_ppid.pl ├── log_func.sh ├── rm_list.pl ├── run_cmd_in_batch.sh ├── split_scrn_time.pl ├── wait_for_pid.pl └── wrap_sh.pl ├── cnvrt_embl_to_fasta.pl ├── deal_augustus.pl ├── deal_bnx.pl ├── deal_fasta.pl ├── deal_fastq.pl ├── deal_iprscan.pl ├── deal_table.pl ├── drop_dup_both_end.pl ├── drop_dup_single.pl ├── enrich ├── README.md ├── cmd_list ├── example_data │ ├── KEGG_PWY.txt │ ├── P1Genom_V1p2.ipr_background │ ├── combined_KO_pwy.txt.tbl.ma.bg │ ├── example-GOen-testLg10.svg │ ├── example-GOen.svg │ ├── example-GOen.tsv │ ├── goslim_plant.obo.20181129 │ ├── goslim_plant.obo.20181129.tab │ ├── in_geneID-IPRen-testLg10.svg │ ├── in_geneID-IPRen.svg │ ├── in_geneID-IPRen.tsv │ ├── in_geneID.list │ ├── in_whole_genome_gene-ghostKID.tsv │ ├── in_whole_genome_gene.annot │ ├── in_whole_genome_gene.annot-GOinEnrich │ ├── keggPWYByKO_KOenrich_bg │ ├── keggPWYByKO_enrich_bg │ ├── pub-go.obo.tab │ └── synFam-ipr-GOinEnrich └── scripts │ ├── GOenrich_topGO.r │ ├── cnvt_GOobo_to_tab.pl │ ├── cnvt_GOobo_to_tab.r │ ├── enrich_IPR.pl │ ├── enrich_keggPWY.pl │ ├── enrichment_mine_fit.pl │ ├── extend_GOannot_for_GOenrich.pl │ ├── extend_IPRannot_for_IPRenrich.pl │ ├── hs_enrich.pl │ ├── hs_enrich_old.pl │ ├── mk_bg_fromKeggPWY.pl │ ├── prepare_enrich.pl │ ├── run_GOenrich.r │ ├── run_IPRenrich.r │ ├── run_KEGGenrich.r │ ├── split_jnGO.pl │ ├── stat_goslim.pl │ └── test_enrichGO.pl ├── evolution_tools ├── SV_detection │ ├── add_geneAHRD.pl │ ├── add_rdCov2vcf.pl │ ├── align_2seq_by_query_segments.pl │ ├── assemblytics_scripts │ │ ├── Assemblytics │ │ ├── Assemblytics_ori │ │ ├── Assemblytics_within_alignment.py │ │ └── Assemblytics_within_alignment.py_ori │ ├── callsv_AW_NDF.sh │ ├── cmd_list_example_to_detect_sv │ ├── cnvt_anchors_to_tbl.pl │ ├── cnvt_ndfGff2vcf_snps.pl │ ├── 
cnvt_ndfGff2vcf_struct.pl │ ├── cnvt_ndfGff2vcf_struct_v1.pl │ ├── cnvt_vcf_sub2ins.pl │ ├── cnvt_vcf_sub2insdel.pl │ ├── filter_maf_by_tab.pl │ ├── filter_rdCov_vcf.pl │ ├── find_nonOvlCDS.pl │ ├── fix_sam_cigarID.pl │ ├── fmt_paf.pl │ ├── get_qryLoc_by_refLoc_inBam.py │ ├── get_qry_ref_shared_var_nucdiff.pl │ ├── get_shrt_or_ident_mafTab.pl │ ├── get_sv_affected_genes.pl │ ├── get_sv_inVCF.pl │ ├── gff2bed.pl │ ├── if_needIDfix.pl │ ├── join_maf_blk.pl │ ├── join_samAln.pl │ ├── make_fakeCDS_fromPAF.pl │ ├── mmp2Aln_anchorsInMafTab.pl │ ├── nucdiff_modification │ │ ├── class_nucmer.py_revised │ │ ├── cmd_list │ │ ├── initial_preparation.py_revised │ │ └── nucdiff.py_revised │ ├── remove_gap_var.pl │ ├── restore_sam_position.pl │ ├── rmNvar_inVCF.pl │ ├── rm_0span_maf.pl │ ├── rmdup_fromNormVcf.pl │ ├── run_mm2nucdiff.sh │ ├── run_mm2paftool.sh │ ├── run_ndf.sh │ ├── sam2delta.py │ ├── select_var.pl │ ├── summary_svs.pl │ └── view_anchors.pl ├── compare_assemblies │ ├── byMUMmer │ │ ├── cmd_list │ │ ├── stat1_mergeCov.pl │ │ ├── stat2_chk_gapCover.pl │ │ └── stat3_addGeneTag.pl │ ├── cnt_ovl_fromBeds.pl │ ├── cnvt_ploidy_inVCF.pl │ └── mcscanTab_to_dupTxt.pl ├── copy_number_var │ ├── README.md │ ├── compare_gene_expansion.r │ └── get_CLV_expansion.pl ├── cvt_mscGff_scf2chr.pl ├── draw_syn_dotplot.pl ├── expansion_tools │ ├── 01.clean_nwk.pl │ ├── 01.prepare_cafe_tab.pl │ ├── 01.prepare_ortho_to_tbl.pl │ ├── 02.cafe_to_grp.pl │ ├── 03.replace_geneID_in_orthomcl.pl │ ├── 04.get_expansion_tab.pl │ ├── 05.add_desciption_to_OGcsv.pl │ └── jn_gene_byIPR.pl ├── follow_sibelia.pl ├── mummer_tools │ ├── join_coords.pl │ └── mummerplot ├── ortho_tools │ ├── 01.ortho_list_from_orthoOut.pl │ ├── 02.list_run_muscle.pl │ ├── cdsAln2bppAln.pl │ ├── cnvt_fa2nex.pl │ ├── combine_tree_for_treeAnnot.pl │ ├── filter_bp6_byTopScore.pl │ ├── fmt_data_for_orthoAln.pl │ ├── join_cdsFmt_faAln.pl │ ├── mk_sep_blastp_shell.sh │ ├── ortho_cnt_c1_cmn.pl │ ├── replace_all_blast_file.sh │ ├── 
run_ete3_genetree.pl │ ├── run_positive_selection.pl │ ├── sep_alnFas.pl │ └── trim_faAln.pl ├── plot_color_tree.r ├── plot_label_color_tree.r ├── plot_syn.pl ├── plot_syn.pl_bak ├── plot_syn_bk.pl ├── prepare_SynChro.pl ├── rbh_byBp6.pl ├── rbh_inBlock.pl ├── structure │ ├── 00.run_Structure.pl │ ├── 03.rm_Nmiss_maf.pl │ ├── cnvt_clumppOut_to_tab.pl │ ├── collect_rand_result.pl │ ├── extraparams │ ├── get_LnPD.pl │ ├── get_structure_input.pl │ ├── get_time_from_structScrn.pl │ ├── mainparams │ ├── mv_result_files.pl │ ├── new_mainparams.pl │ ├── order_ClumppIndFileOut_byIndID.pl │ ├── order_structureIndv_byIndID.pl │ ├── prepare_structure_input.pl │ ├── rand_small_position.pl │ ├── result_report.pl │ ├── shrt_col0.pl │ ├── structure │ └── structure_Temple │ │ └── extraparams └── vcf_tab │ ├── add_ref_as_indv_in_vcfTab.pl │ ├── cnt_allele_withPop.pl │ ├── cols2vcfTab.pl │ ├── cvt_snp_to_itayNeed.pl │ ├── extract_dp_per_indv.pl │ ├── slct_sites_fromVCF.pl │ └── tab2vcf.pl ├── extract_fq_by_list.pl ├── file_type_based ├── Proc_Reads │ ├── adaptors.fa │ ├── chk_INS_byChlo.sh │ ├── cleanPE_byTrimmo.sh │ ├── cleanSE_byTrimmo.sh │ ├── illumina_adapters.fa │ ├── polyAT_adp.fa │ ├── run_bowtie.pl │ ├── run_bowtie2.pl │ ├── run_bwaAln.pl │ ├── run_bwaAln.sh │ ├── run_fastqC.pl │ ├── run_ndupB.sh │ ├── run_rmRRNA.pl │ ├── run_rmRRNA.sh │ ├── run_tophat2.pl │ ├── run_trimmoPE.pl │ ├── run_trimmoSE.pl │ ├── run_trimmomatic.pl │ ├── srch_barcode.pl │ ├── trimmomatic │ │ ├── AUTHORS.jbzip2 │ │ ├── FastqRecord.java │ │ ├── LICENCE.jbzip2 │ │ ├── META-INF │ │ │ └── MANIFEST.MF │ │ ├── SlidingWindowTrimmer.java │ │ ├── SlidingWindowTrimmer.java.version1.bk │ │ ├── org │ │ │ ├── itadaki │ │ │ │ └── bzip2 │ │ │ │ │ ├── BZip2BlockCompressor.class │ │ │ │ │ ├── BZip2BlockDecompressor.class │ │ │ │ │ ├── BZip2Constants.class │ │ │ │ │ ├── BZip2DivSufSort$PartitionResult.class │ │ │ │ │ ├── BZip2DivSufSort$StackEntry.class │ │ │ │ │ ├── BZip2DivSufSort$TRBudget.class │ │ │ │ │ ├── 
BZip2DivSufSort.class │ │ │ │ │ ├── BZip2HuffmanStageDecoder.class │ │ │ │ │ ├── BZip2HuffmanStageEncoder.class │ │ │ │ │ ├── BZip2InputStream.class │ │ │ │ │ ├── BZip2OutputStream.class │ │ │ │ │ ├── BitInputStream.class │ │ │ │ │ ├── BitOutputStream.class │ │ │ │ │ ├── CRC32.class │ │ │ │ │ ├── HuffmanAllocator.class │ │ │ │ │ └── MoveToFront.class │ │ │ └── usadellab │ │ │ │ └── trimmomatic │ │ │ │ ├── Pairomatic.class │ │ │ │ ├── TrimStats.class │ │ │ │ ├── Trimmomatic.class │ │ │ │ ├── TrimmomaticPE.class │ │ │ │ ├── TrimmomaticSE.class │ │ │ │ ├── fasta │ │ │ │ ├── FastaParser.class │ │ │ │ ├── FastaRecord.class │ │ │ │ └── FastaSerializer.class │ │ │ │ ├── fastq │ │ │ │ ├── FastqParser.class │ │ │ │ ├── FastqRecord.class │ │ │ │ ├── FastqRecord.java │ │ │ │ └── FastqSerializer.class │ │ │ │ ├── threading │ │ │ │ ├── BlockOfRecords.class │ │ │ │ ├── BlockOfWork.class │ │ │ │ ├── ParserWorker.class │ │ │ │ ├── SerializerWorker.class │ │ │ │ ├── TrimLogRecord.class │ │ │ │ ├── TrimLogWorker.class │ │ │ │ └── TrimStatsWorker.class │ │ │ │ ├── trim │ │ │ │ ├── AbstractSingleRecordTrimmer.class │ │ │ │ ├── AvgQualTrimmer.class │ │ │ │ ├── BarcodeSplitter.class │ │ │ │ ├── CropTrimmer.class │ │ │ │ ├── HeadCropTrimmer.class │ │ │ │ ├── IlluminaClippingTrimmer$1.class │ │ │ │ ├── IlluminaClippingTrimmer$IlluminaClippingSeq.class │ │ │ │ ├── IlluminaClippingTrimmer$IlluminaLongClippingSeq.class │ │ │ │ ├── IlluminaClippingTrimmer$IlluminaMediumClippingSeq.class │ │ │ │ ├── IlluminaClippingTrimmer$IlluminaPrefixPair.class │ │ │ │ ├── IlluminaClippingTrimmer$IlluminaShortClippingSeq.class │ │ │ │ ├── IlluminaClippingTrimmer.class │ │ │ │ ├── LeadingTrimmer.class │ │ │ │ ├── MaximumInformationTrimmer.class │ │ │ │ ├── MinLenTrimmer.class │ │ │ │ ├── SlidingWindowTrimmer.class │ │ │ │ ├── SlidingWindowTrimmer.java │ │ │ │ ├── ToPhred33Trimmer.class │ │ │ │ ├── ToPhred64Trimmer.class │ │ │ │ ├── TrailingTrimmer.class │ │ │ │ ├── Trimmer.class │ │ │ │ └── 
TrimmerFactory.class │ │ │ │ └── util │ │ │ │ ├── ConcatGZIPInputStream$1.class │ │ │ │ ├── ConcatGZIPInputStream$GZIPHelperInputStream.class │ │ │ │ ├── ConcatGZIPInputStream.class │ │ │ │ └── PositionTrackingInputStream.class │ │ └── trimmomatic.jar │ ├── using_subfunc.R │ ├── using_subfunc.R.bk │ ├── using_subfunc.R.cmd │ ├── using_subfunc.R.cmd.R │ ├── using_subfunc.R.rm_polyAT.R │ └── using_subfunc.R.rm_polyAT_useRight.R └── Proc_Sam │ ├── get_required_sam.pl │ └── trim_rdEnd_inSam.pl ├── follow_mcscan.pl ├── gm_tools ├── abh_to_qtlCsv.pl ├── cnt_allele_withPop.pl ├── cnvt_abhJn_to_abh.pl ├── cnvt_snp_to_itayNeed.pl ├── cnvt_snp_to_itayNeed_cnt.pl ├── cvt_snp_to_itayNeed.pl ├── rnaseq_snp_tools.pl └── tomato_loc_V2p4_to_V2p5.pl ├── log_tools ├── fastqC_data_summary.pl ├── filt_bwa_log.pl ├── form_cleanQ20_tbl.pl └── infor_ndupB.pl ├── maker_pm_edit ├── PhatHit_utils.pm_edit ├── PhatHit_utils.pm_raw ├── auto_annotator.pm_edit ├── auto_annotator.pm_raw ├── shadow_AED.pm_edit └── shadow_AED.pm_raw ├── pcr_tools ├── cmd_list ├── generate_kaspLGC_template.pl ├── get_priming_loc_bnV2_2_24.pl ├── retrieve_template_forSNP.pl ├── run_primer3_general.pl ├── run_primer_forSNP.pl └── site.lis ├── plotting ├── README.md ├── example_data │ ├── 65K_DEL-3class │ ├── FleshBrix_19YQ_1 │ ├── FleshBrix_19YQ_2 │ ├── FleshBrix_22HN_1 │ ├── FleshBrix_22HN_2 │ └── in.table-for_grouped_barplot_with_SD └── scripts │ ├── plot_barplot.r │ ├── plot_barplot_wiSD_twoGroups.r │ ├── plot_boxplot.r │ ├── plot_upset.r │ └── plot_venn.r ├── project └── watermelon_pan_phaseI │ ├── cmd_list │ ├── cnt_CDS_dup.pl │ ├── cnt_TEIPRacc.pl │ ├── cnt_gene_PAV_byDepCov.pl │ ├── cnt_gene_PAV_byDepCov_fromISbed.pl │ ├── cnvt_ext2nov_to_agp.pl │ ├── comb_gene_PAV_byDepCov.pl │ ├── convert_PAVtab_to_rmOlapTab.pl │ ├── dvd_grp_bySyn.pl │ ├── extract_N50.pl │ ├── extract_map_ratio_from_PEfstat_comb.pl │ ├── extract_novelOriFlankSeq.pl │ ├── list_IPRacc.pl │ ├── map_accDedup_to_asm.pl │ ├── map_dedup_to_novel.pl 
│ ├── pipe_for_functional_annotation.pl │ ├── ret_maker_abinit_gff3.pl │ ├── ret_maker_abinit_gff3_simpleCDS.pl │ ├── rm_Qloc_only_groups.pl │ ├── rm_Qloc_ovl2pred.pl │ ├── rm_overlap_OGs.pl │ ├── slct_OG_gene_pairs.pl │ ├── summary_covBp_byRdDep.pl │ ├── summary_gene_PAV_byDepCov.pl │ └── trim_gff_to_novel_ctg_region.pl ├── relate_loc_1to1 ├── add_loc_fromNewP.pl ├── cmd_list ├── relate_pos.pl ├── scfP_to_chrP.pl └── sumP_in_bed_wiCDS.pl ├── reseq_tools ├── C_exe │ ├── maskClose_in_1col │ ├── maskClose_in_1col.c │ ├── rmSameSite │ ├── rmSameSite.c │ └── self_functions.c ├── LD_ana │ ├── bin_LD_cnt.pl │ └── pipe_cnt_LD.pl ├── README.md ├── SNP_effect.pl ├── SNP_effect_edit.pl ├── SNP_in_region.pl ├── add_SNP1col_to_basic.pl ├── basic_snp_infor_bySite.pl ├── basic_snp_infor_bySite_inVcfTab.pl ├── bsa │ ├── README.md │ ├── example_data │ │ └── template-QTLseqr_result.xlsx │ └── scripts │ │ ├── ana_bsa_Gprime.pl │ │ ├── cnvt_var2tab.pl │ │ ├── filter_vcfTab_forBsaParent.pl │ │ ├── get_pval_for_Gprime.R │ │ ├── plot_pipeResult.R │ │ ├── plot_pipeResult_minCnt.R │ │ ├── run_QTLseqr.r │ │ ├── slct_sites_by_windows.pl │ │ └── slct_sites_forBsa.pl ├── class_SNPeffect_tbl.pl ├── cnt_NHH_byIndv.pl ├── cnt_Nmiss_ratio_heteNotN.pl ├── cnt_diff_inTbl.pl ├── cnt_genotype_in_1col.pl ├── cnt_genotype_inpileup.pl ├── cnt_homo_hete_ratio.pl ├── cnt_maf_ratio.pl ├── cnt_mapRdN_inBam.pl ├── cnt_pileup_depC.pl ├── cnvt_agp2chain.pl ├── cnvt_tools │ ├── cols2LD.pl │ ├── cols2fas.pl │ ├── cols2fstat.pl │ ├── cols2meg.pl │ ├── cols2pca.pl │ ├── cols2ped.pl │ ├── cols2phy.pl │ ├── cols2tab.pl │ ├── cols2vcf.pl │ ├── fas2meg.pl │ ├── fas2phyml.pl │ ├── qopt2shrtStructure.pl │ ├── tab2cols.pl │ ├── tab2gl.pl │ ├── tbl2LD.pl │ ├── tbl2fas.pl │ ├── tbl2meg.pl │ └── tbl2phy.pl ├── cols2ped.pl ├── cols2vcf.pl ├── detect_mix │ ├── class_grpCnt_in_mjTab.pl │ ├── class_mjAL.pl │ └── class_snpTab_by_mjAL.pl ├── draw_SNP_dist.pl ├── example_data │ ├── list.long_deletions │ ├── list.sample_bam │ ├── 
out_geno-mat.tab │ └── out_geno-melt.tab ├── extract_pileup.pl ├── extract_sites_by_list.pl ├── extract_top1_vcfPI_wind.R ├── filter_tools │ └── cnt_depth │ │ └── sumDepBySite.pl ├── for_yb63 │ └── spec_inGrp.pl ├── fst │ ├── cnvt_tbl2fstat.pl │ ├── cols2fstat.pl │ ├── extract_top1_fst_site.R │ ├── extract_top1_vcfFst_wind.R │ ├── get_stat.pl │ ├── join_fst_siteChrPos.pl │ ├── pipe_get_fst.pl │ ├── run_hierfstat.pl │ └── snpTbl_sepByWind.pl ├── gatk │ ├── CatVariants.pl │ ├── conf_pipe_gatk │ ├── est_depth_inBam.pl │ ├── gatk_dvd_step8_combineGVCF_interval.pl │ ├── gatk_dvd_step9_gvcf2var.pl │ ├── gatk_dvd_step9_gvcf2var_fromGVCF.pl │ ├── get_pass_vcf.pl │ ├── pipe_gatk.pl │ ├── pipe_gatk_conf │ ├── pipe_gatk_inFqList.pl │ ├── pipe_gatk_singleAlnBam.pl │ ├── pipe_maoSNP.pl │ ├── pipe_sbSNP.pl │ ├── pref_lmyPM │ └── revert_alnBam_to_uBam.pl ├── get_set2_varOnlyHete.pl ├── get_set3_varWiIndel.pl ├── lcnt_to_represent_allele.pl ├── mao_exe │ ├── combine2PileFiles │ ├── reSeqPrintRefChr │ ├── reSeqPrintSample.indel.fast │ ├── reSeqPrintSample.indel.fast.strAssign │ ├── reSeqPrintSample.indel.fast.strAssign.moreHeter │ ├── rmRedunSam2 │ └── rmRedunSam3 ├── maskClose_in_1col.pl ├── mask_vcf_geno_byGQ.pl ├── mask_weiredSNP.pl ├── mk_wind_from_noNlis.pl ├── pca │ ├── cmd_list_set01 │ ├── cnvt_pcaEvec_to_tbl.pl │ ├── cnvt_snp2pca.pl │ └── plot_EVs.R ├── phylo_tools │ ├── generate_dataset.pl │ ├── infer_NJ_nucleotide_bt500.mao │ └── replace_ID_in_nwk.pl ├── rand_site_wiWind.pl ├── rdNum_in_bam.pl ├── rename_plink_map.pl ├── rm_Nmiss_sites.pl ├── rm_adjacent_sites.pl ├── rm_same_site.pl ├── rm_same_site_hete2N.pl ├── scripts │ ├── cntRdMismatch_inSam.pl │ ├── cntRd_spanJunctionSite_inBam.pl │ ├── cnvt_melt_to_matrix.pl │ ├── cnvt_msaFa_to_sam.pl │ └── genotype_longDEL_byListBam.pl ├── slct_sweep │ ├── get_mean.pl │ ├── merge_wind.pl │ ├── merge_wind_pos.pl │ ├── ret_annot_by_loc.pl │ ├── rm_overlap_wind.pl │ ├── rod_from_PIavg.pl │ ├── slct_sweep_wind.pl │ └── 
wind_in_region_list.pl ├── slim_SNP_sites.pl ├── snpTbl_stats.pl ├── snpeff_data │ ├── extract_coding.pl │ ├── simple_sv_class │ └── simplify_tbl1.pl ├── tassel │ └── cnvt_col_to_TasselTaxaList.pl ├── vcf_simplify_addRef.pl └── xpclr │ ├── chk_nonsyn.pl │ ├── cluster_xpclrscore.pl │ ├── cluster_xpclrscore.pl_ori │ ├── cumPos.pl │ ├── get_uniq_cM.pl │ ├── pipe_xpclr_forWM97.pl │ ├── pipe_xpclr_fromSNPtbl.pl │ ├── pipe_xpclr_fromSNPtbl_cpu.pl │ ├── plot_manhattan │ ├── chrLen │ ├── chrLen_cum │ └── plot_xpclr_cum.R │ ├── prepare_xpclr_input_wiGmP.pl │ ├── sep_run_xpclr.pl │ ├── set_GMpos_to_SNP.pl │ └── xpclr_wind_cmd_wiGmP.pl ├── rnaseq_tools ├── DEG_byList.pl ├── DEGbyEdgeR_exactTest.pl ├── DEGtool_withSizeFactor.pl ├── FET_getCnt.pl ├── R_cmd_list ├── add_sizefactor.pl ├── cntRdInGene_wiHTSeq.pl ├── cnt_rd_in_bam.pl ├── cnt_uniqMap_in_bam.pl ├── cnvt_cnt_to_TPM.pl ├── coexp │ ├── add_abs_toKME.pl │ ├── cmd_list │ ├── dist_of_twoKME.pl │ ├── down_phenoAssoc_binary.r │ ├── down_phenoAssoc_pearson.r │ ├── heatmap_by_mod_dist.r │ ├── input │ │ ├── dat1_pheno │ │ └── dat1_rpkmMean.gz │ ├── redo_pheno_ass.r │ ├── run_wgcna_signed.r │ └── run_wgcna_unsigned.r ├── combine_samCnt.pl ├── combine_samCnt_by_Pref.pl ├── combine_samCnt_by_Pref_antiSense.pl ├── compare_SensAnti.pl ├── draw_SNP.pl ├── extract_samAln_by_fq.pl ├── fix_SRAfqID.pl ├── fix_excelV.pl ├── fromOthers │ ├── record │ └── run_TMM_scale_matrix.pl ├── get_DESeqNormCnt.pl ├── get_MPCnt.pl ├── get_meanTPM.pl ├── graft │ ├── find_transmit_step1.pl │ ├── find_transmit_step2.pl │ ├── find_transmit_step3.pl │ ├── find_transmit_stepX1.pl │ ├── find_transmit_stepX2.pl │ ├── get_alnBam_by_src2tgt_rdList.pl │ └── simple_pipe_find_graft_rd_SE.pl ├── join_samCnt.pl ├── map_to_genome │ ├── README.md │ ├── cnvt_featureCounts_to_tpm.r │ ├── cnvt_gene2group_DEGlabel.pl │ ├── cnvt_gene2group_val.pl │ ├── combine_DEGs.pl │ ├── fix_NHnum.pl │ ├── label_DEGs.pl │ ├── runHisat2_with2pass.pl │ └── run_deseq2_tpm.r ├── 
map_to_transcriptome │ ├── README.md │ ├── cnvt_synOGgrp_to_trans2gene.pl │ ├── get_salmon_gene_quant_batch.r │ └── run_deseq2_salmon.r ├── plot_expr_heatmap.r ├── plot_heatmap_by_geneList.pl ├── rmRRNA_in_fqFiles.pl ├── sep_reads_by_toRef.pl └── summary_ht2_log.pl ├── rscript_examples └── ggplot_manhattan.r ├── run_clean_mp_all.R ├── run_cmd_in_batch.pl ├── run_reapr.sh ├── sam_filter.pl ├── sam_flag_chk.pl ├── sample_scripts ├── PG1_GC.cfg ├── PG1_ctg.cfg ├── PG1_scf.cfg ├── check_pm_version.pl ├── cmd_list_trinity_denovo ├── run_jf2.sh ├── run_soapd2_R02.sh ├── run_trinity_guided.sh ├── sumV_inWind.pl └── svg2png.pl ├── save_mate_fq.pl ├── save_single_fq.pl ├── self_interest ├── list_all_dir.pl └── price_by_subway.pl ├── sepRun_cmd.pl ├── site_search ├── cmd_list ├── parse_fimoTSV.pl ├── setup_keySite.pl └── yyt_motif.meme ├── software_fix ├── anchorwave │ └── fix_awMAF.pl └── last │ └── v869 │ ├── last-dotplot │ ├── last-map-probs │ ├── last-postmask │ ├── last-train │ ├── maf-convert │ ├── maf-join │ └── maf-swap ├── solQ2phredQ.pl └── temp ├── README.md ├── abh_to_rate.pl ├── cds2prot.pl ├── chk_gff3_avg_cds.pl ├── cmd_list_busco ├── cmd_list_cegma ├── cmd_list_tassel4 ├── cmd_list_tassel5 ├── cnvt_pairwise_to_tab.pl ├── deal_gff3.pl ├── detect_syn_dots.pl ├── forRonan ├── addID_to_loci.pl ├── depC_cutoff_by_dep_stat.pl ├── filter_sam.pl ├── map_region_by_bn6.pl ├── merge_tbls.pl └── samDep_to_loci.pl ├── get_cds_from_gff3.pl ├── good_pos.pl ├── ncbi_esearch.pl ├── plot_boxplot_wi_points.r ├── reformat_tabHit.pl ├── replace_unicode.pl ├── rm_gff_byLis.pl ├── scripts └── cnvt_pdf_to_tiff.pl ├── simple_gff3_to_gtf.pl ├── simple_sort_gff3.pl ├── slct_gff_byLis.pl ├── temp_fix_gff3 ├── Grif_1614.fix.gff3.gz ├── Grif_1614.gff3.gz ├── cmd_list └── fix_gff.pl └── temp_process_ONT ├── cdna_classifier_report.pdf ├── cdna_classifier_report.tsv ├── cmd_list ├── correction_pipeline.sh ├── raw_rdN.tbl ├── test_SIRV ├── cdna_classifier_report.pdf ├── 
cdna_classifier_report.tsv ├── cmd_list ├── correction_pipeline.sh ├── scrn.SIRV_test.gz └── scrn.corr ├── test_small └── cmd_list └── tools └── correction_pipeline.sh /.gitignore: -------------------------------------------------------------------------------- 1 | blib/ 2 | .build/ 3 | _build/ 4 | cover_db/ 5 | inc/ 6 | Build 7 | !Build/ 8 | Build.bat 9 | .last_cover_stats 10 | Makefile 11 | Makefile.old 12 | MANIFEST.bak 13 | META.yml 14 | MYMETA.yml 15 | nytprof.out 16 | pm_to_blib 17 | -------------------------------------------------------------------------------- /GBS/ext_rd_withAPEKI.pl: -------------------------------------------------------------------------------- 1 | #!/usr/bin/perl 2 | use strict; 3 | use warnings; 4 | use LogInforSunhh; 5 | 6 | !@ARGV and die "perl $0 barcode input.fastq\n\nI use CAGC/CTGC as the required restricted site.\nPlease note that I will ignore read if meeting Ns within the first 64bp.\n\n"; 7 | 8 | my $barC = shift; 9 | my $addCut1 = "${barC}CAGC"; 10 | my $addCut2 = "${barC}CTGC"; 11 | my $ll = length($barC); 12 | 13 | my $n = 0; 14 | while (my $id = <>) { 15 | $n ++; 16 | $n % 1e6 == 1 and &tsmsg("[Msg] $n reads treated.\n"); 17 | my $seq=<>; 18 | <>; 19 | my $qual = <>; 20 | if ($seq =~ m/^(?:$addCut1|$addCut2)/o) { 21 | my $sub_seq = substr($seq, 0, 64+$ll); 22 | $sub_seq =~ m/N/ and next; 23 | print STDOUT "$id$seq+\n$qual"; 24 | } 25 | } 26 | 27 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | Frequently-used-tools-for-data-processing 2 | ========================================= 3 | 4 | Tool set for processing fasta/fastq/table formatted data. Usually they are perl scripts. 
5 | -------------------------------------------------------------------------------- /annot_tools/add_tag_to_gffID.pl: -------------------------------------------------------------------------------- 1 | #!/usr/bin/perl 2 | use strict; 3 | use warnings; 4 | 5 | !@ARGV and die "perl $0 tag in.fmt.gff3\n"; 6 | 7 | my $tag = shift; 8 | 9 | while (<>) { 10 | chomp; 11 | if (m/^(\s*$|#)/) { 12 | print "$_\n"; 13 | next; 14 | } 15 | my @ta = split(/\t/, $_); 16 | if ($ta[2] =~ m!^(protein_match)$!i) { 17 | $ta[8] =~ s!^ID=!ID=$tag!; 18 | } elsif ($ta[2] =~ m!^(match_part)$!i) { 19 | $ta[8] =~ s!(^|\s|;)Parent=!$1Parent=$tag!; 20 | } elsif ($ta[2] =~ m!^dispersed_repeat$!i) { 21 | $ta[8] =~ s!^ID=!ID=$tag!; 22 | } else { 23 | die "$_\n"; 24 | } 25 | print join("\t", @ta)."\n"; 26 | } 27 | 28 | -------------------------------------------------------------------------------- /annot_tools/ahrd/cmd_list: -------------------------------------------------------------------------------- 1 | perl pipe_for_functional_annotation.pl -onlyAHRD annot.cfg 2 | mkdir -p 04.AHRD/v1/ 3 | cd 04.AHRD/ 4 | bash ../generate_ahrd_yml.sh ../evmMerged.p.fa ./v1/ahrd_output_v1.csv > ahrd_in_v1.yml 5 | java -Xmx10G -jar /data/Sunhh/src/annotation/ahrd/AHRD/dist/ahrd.jar ahrd_in_v1.yml 6 | perl ../trim_orphan_right_brack.pl ./v1/ahrd_output_v1.csv > ./v1/ahrd_output_v1.csv_trim 7 | deal_table.pl -column 0-5 ./v1/ahrd_output_v1.csv_trim > ./v1/ahrd_output_v1.final.csv 8 | deal_table.pl -column 0,3 ./v1/ahrd_output_v1.final.csv > ./v1/ahrd_output_v1.final.csv.2col 9 | 10 | cat ./v1/ahrd_output_v1.final.csv.2col | deal_table.pl -col_repCount 1 | tail -n +2 | deal_table.pl -col_sort 0 | deal_table.pl -reverse > ./v1/ahrd.rcnt 11 | cat ./v1/ahrd.rcnt | perl -e 'while (<>) { chomp; m!retro|reverse|transpos|gag\b|polyprotein!i and print "$_\n";}' | less -S 12 | 13 | # False positives - functions that are not TE-related but match the regular expression pattern: 14 | # AT3G25590: Micronuclear linker histone 
polyprotein-like protein 15 | 16 | -------------------------------------------------------------------------------- /annot_tools/augustus.accuracy_calculator.pl: -------------------------------------------------------------------------------- 1 | #!/usr/bin/perl 2 | use strict; 3 | use warnings; 4 | 5 | !@ARGV and die "perl $0 augusts_test1.stdout augusts_test2.stdout > all_accuracy.tbl\n"; 6 | 7 | for my $fn (@ARGV) { 8 | my $v1 = &accuracy_calculator($fn); 9 | print STDOUT "$fn\t$v1\n"; 10 | } 11 | 12 | # calculate the result of testing AUGUSTUS on genbank files in a single number 13 | # Copied from braker.pl 14 | sub accuracy_calculator{ 15 | my $aug_out=shift; 16 | my ($nu_sen, $nu_sp, $ex_sen, $ex_sp, $gen_sen, $gen_sp); 17 | open(AUGOUT, "$aug_out") or die ("Could not open $aug_out!\n"); 18 | while(<AUGOUT>){ 19 | if(/^nucleotide level\s*\|\s*(\S+)\s*\|\s*(\S+)/){ 20 | $nu_sen=$1; 21 | $nu_sp=$2; 22 | } 23 | if(/^exon level\s*\|.*\|.*\|.*\|.*\|.*\|\s*(\S+)\s*\|\s*(\S+)/){ 24 | $ex_sen=$1; 25 | $ex_sp=$2; 26 | } 27 | if(/^gene level\s*\|.*\|.*\|.*\|.*\|.*\|\s*(\S+)\s*\|\s*(\S+)/){ 28 | $gen_sen=$1; 29 | $gen_sp=$2; 30 | } 31 | } 32 | my $target=(3*$nu_sen+2*$nu_sp+4*$ex_sen+3*$ex_sp+2*$gen_sen+1*$gen_sp)/15; 33 | return $target; 34 | } 35 | 36 | -------------------------------------------------------------------------------- /annot_tools/b2g_graph_software/fmt_b2g_enrich.pl: -------------------------------------------------------------------------------- 1 | #!/usr/bin/perl 2 | use strict; 3 | use warnings; 4 | 5 | -t and !@ARGV and die "perl $0 3.Locule.Up.full.txt > 3.Locule.Up.full.txt.tab\n"; 6 | 7 | while (<>) { 8 | chomp; 9 | my @ta=split(/\t/, $_); 10 | if ($ta[0] eq 'Tags') { 11 | $ta[10] eq 'TestSet Sequences' or die "Bad columns\n"; 12 | print join("\t", @ta[1..5], 'Total DEGs', 'DEGs annotated', 'Genes annotated', 'Total genes', "Genes")."\n"; 13 | next; 14 | } 15 | my $ttl_geneN = $ta[6]+$ta[7]+$ta[8]+$ta[9]; 16 | my $ttl_degN = $ta[6]+$ta[8]; 17 | $ta[10] 
#!/usr/bin/perl
# Filter a tab-delimited hit table: for every distinct value of column 1, print
# the first line whose column 3 reaches $min_ident and whose span
# (column 8 - column 7 + 1) reaches $min_percCov percent of column 13.
# NOTE(review): columns 7/8/13 look like alignment start/end and full length
# of the column-1 sequence -- confirm against the command that writes the .bn6.
use strict;
use warnings;

-t and !@ARGV and die "perl $0 evmMerged.c.fa.toBad.bn6 > evmMerged.c.fa.toBad.bn6.toRM\n";

my $min_ident   = 97;
my $min_percCov = 95;

my %already_printed;   # column-1 IDs that have produced an output line

while (my $line = <>) {
  chomp $line;
  my @col = split(/\t/, $line);

  next if defined $already_printed{ $col[0] };
  next unless $col[2] >= $min_ident;

  my $span = $col[7] - $col[6] + 1;
  next unless $span >= $col[12] * $min_percCov / 100;

  $already_printed{ $col[0] } = 1;
  print STDOUT "$line\n";
}
-------------------------------------------------------------------------------- 1 | C31 C38 0.97 0.95 0.97 2 | C31 C39 0.97 0.95 0.97 3 | C38 C37 0.97 0.95 0.97 4 | C38 C39 0.97 0.95 0.97 5 | C39 C37 0.97 0.95 0.97 6 | C39 C38 0.97 0.95 0.97 7 | -------------------------------------------------------------------------------- /annot_tools/example_pipes/bad_prot_IDs: -------------------------------------------------------------------------------- 1 | CmaCh14G019040.1 2 | CmoCh01G021080.1 3 | CmUC00G223800.1 4 | AT1G44191.1 5 | sp|P12978|EBNA2_EBVB9 6 | -------------------------------------------------------------------------------- /annot_tools/example_pipes/cmd_list_protAln_single_1add: -------------------------------------------------------------------------------- 1 | # Re-align on SWIFT server. 2 | cd /data/Sunhh/src/align/spaln/spaln2.1.4.linux64/seqdb; ln -s /data/Sunhh/wmhifi/analysis/gene_prediction/db/in_genome/22CEXU3.chr.fa ./22CEXU3.mfa; ./makeidx.pl -inp 22CEXU3.mfa; cd -; 3 | 4 | deal_fasta.pl CmoshV1.p.fa -nres CmoCh01G021080.1 > CmoshV1.p_fixed.fa 5 | ls CmoshV1.p_fixed.fa > lis_prot_r2 6 | perl /home/Sunhh/tools/github/NGS_data_processing/annot_tools/run_spaln_prot2genom.pl -aln_type prot2genome -db 22CEXU3 -inFaLis lis_prot_r2 -para_spaln " -t1 -M4 -Q7 -O0 -LS " -cpuN 100 -cnvt2maker -pl_cnvt2maker /home/Sunhh/tools/github/NGS_data_processing/annot_tools/cnvt_spaln2makerAln_prot_gff3.pl 7 | 8 | mv CmoshV1.p_fixed.fa.spaln.gff3 CmoshV1.p.fa.spaln.gff3 9 | perl /home/Sunhh/tools/github/NGS_data_processing/annot_tools/rmShrtExon_spaln_prot2genom.pl CmoshV1.p.fa.spaln.gff3 > CmoshV1.p.fa.s1.gff3 10 | perl /home/Sunhh/tools/github/NGS_data_processing/annot_tools/cnvt_spaln2makerAln_prot_gff3.pl -protKLfile protein_kl -trimOverflow -scafKLfile scaffold_kl CmoshV1.p.fa.s1.gff3 -outFile CmoshV1.p.fa.s2.gff3 11 | perl /home/Sunhh/tools/github/NGS_data_processing/annot_tools/pasa_gff_to_alnGff.pl -notPasa -addTag "3.9:prot:" CmoshV1.p.fa.s2.gff3 > 
### Collect the results of one MAKER round into ../result/r<roundN>/.
### Usage: bash get_maker_result.sh [roundN] [master_datastore_index.log]
### Both arguments are optional and default to the values used previously.

### Basic functions.
# exe_cmd <command string>
#   Log the command with a time stamp, run it through eval, then log either
#   completion or the non-zero exit status. Returns the command's exit status.
function exe_cmd {
	echo "[$(date)][CMD] $1"
	eval "$1"
	local status=$?
	if [ "$status" -ne 0 ]; then
		echo "[$(date)][Err] Command failed with exit status $status." >&2
		return "$status"
	fi
	echo "[$(date)][Rec] Done."
}

# tsmsg <text>: print <text> prefixed with a time stamp.
function tsmsg {
	echo "[$(date)]$1"
}

roundN=${1:-4}

path_gffMerge=gff3_merge
path_fasMerge=fasta_merge

db_idx=${2:-PG1All_v2_Scf.unmsk_master_datastore_index.log}

# -p: do not fail when the directory already exists (re-run) or when
# ../result itself is missing. Nothing below can work without this directory.
exe_cmd "mkdir -p ../result/r$roundN" || exit 1
exe_cmd "$path_gffMerge -d $db_idx -g -o ../result/r${roundN}/r${roundN}_maker.gff3"
exe_cmd "$path_gffMerge -d $db_idx -n -o ../result/r${roundN}/r${roundN}_all.gff3"
exe_cmd "$path_fasMerge -d $db_idx -o ../result/r${roundN}/r${roundN}"
#!/usr/bin/perl
# Collapse a tab-delimited table to one line per column-1 ID:
#   ID <tab> sorted column-12 values joined by ";;" <tab> matching column-13 values joined by ";;"
# When the same (ID, column-12) pair occurs more than once, the last column-13
# value wins.
use strict;
use warnings;

-t and !@ARGV and die "perl $0 ipr_all6_tsv.TEprot.IPRacc > ipr_all6_tsv.TEprot.IPRacc.line\n";

my %val_of;   # {column-1 ID}{column-12 value} => column-13 value
while (my $row = <>) {
  chomp $row;
  my ($id, $acc, $val) = (split(/\t/, $row))[0, 11, 12];
  $val_of{$id}{$acc} = $val;
}

foreach my $id (sort keys %val_of) {
  my $per_id = $val_of{$id};
  my @accs   = sort keys %$per_id;
  my @vals   = map { $per_id->{$_} } @accs;
  print join("\t", $id, join(";;", @accs), join(";;", @vals))."\n";
}
#!/usr/bin/perl
# Merge a 4-column table (ID, description, map ID, map description) into one
# line per ID: ID <tab> description <tab> "mapID__(mapDesc)" entries joined by ";;".
# Dies with the ID when one ID carries more than one distinct description.
use strict;
use warnings;

my (%desc_of, %maps_of, %first_line);
while (my $row = <>) {
  chomp $row;
  my ($id, $desc, $map_id, $map_desc) = (split(/\t/, $row))[0 .. 3];
  $desc_of{$id}{$desc}++;
  $maps_of{$id}{"${map_id}__($map_desc)"}++;
  # Line number of the first occurrence; recorded but not used for the output.
  $first_line{$id} //= $.;
}

foreach my $id (sort keys %desc_of) {
  my @descs = keys %{ $desc_of{$id} };
  scalar(@descs) == 1 or die "$id\n";
  my @maps = sort keys %{ $maps_of{$id} };
  print STDOUT join("\t", $id, $descs[0], join(";;", @maps))."\n";
}
KO annotation : 2 | Use BlastKOALA (Version 2.1, https://www.kegg.jp/blastkoala/) to annotate KO definitions for genes (~5k genes each time/batch). 3 | Enter taxonomy group of your genome : Using taxonomy ID : 3653 (Citrullus genus) 4 | Enter KEGG GENES database file to be searched : genus_eukaryotes 5 | Enter your email address : sunhonghe_1984@163.com (Only one job can be run at a time with the same email address). 6 | 7 | Download KO_definition (details: "View"=>Download) from the resultant web-link. The output table's header is 'Gene_ID \t KO \t Definition \t Score \t Second-KO \t Second-Score'. 8 | Save "Reconstruct Pathway" webpage for pathway table construction. 9 | 10 | Reconstruct pathway : 11 | Use "KEGG Mapper – Reconstruct Pathway" tool : https://www.kegg.jp/kegg/tool/map_pathway.html 12 | Different 'KEGG Mapper' versions may provide slightly different pathway mapping results, so I recommend reconstructing the maps at the same time for each analysis, and recording the 'KEGG Mapper' version used. 13 | Input genelist.txt file : 14 | format example : 15 | geneID \t KO_ID 16 | geneID \t KO_ID 17 | ... 18 | Click 'Show all objects' to expand all list; 19 | 20 | 21 | -------------------------------------------------------------------------------- /annot_tools/liftoff_tools/blk2bed.pl: -------------------------------------------------------------------------------- 1 | #!/usr/bin/perl 2 | use strict; 3 | use warnings; 4 | 5 | -t and !@ARGV and die "perl $0 output/CX.cds.blk.CL > output/CX.cds.blk.CL.bed\n"; 6 | 7 | while (<>) { 8 | chomp; 9 | my @ta=split(/\t/, $_); 10 | my $i=0; 11 | for my $tb (split(/;/, $ta[3])) { 12 | $tb =~ m!^(\d+)\,(\d+)$! 
#!/usr/bin/perl
# Convert a block table to GFF3. Each input line is
#   ID <tab> sequence ID <tab> strand <tab> "start,end;start,end;..."
# and produces one gene line (ID "<ID>-G"), one mRNA line (ID "<ID>") spanning
# the smallest start to the largest end, followed by one CDS line per block.
use strict;
use warnings;

-t and !@ARGV and die "perl $0 output/CX.cds.blk.CL > output/CX.cds.blk.CL.gff3\n";

while (my $line = <>) {
  chomp $line;
  my ($mrna_id, $seq_id, $strand, $blk_txt) = split(/\t/, $line);

  my ($span_start, $span_end, @cds_lines);
  for my $blk (split(/;/, $blk_txt)) {
    $blk =~ m!^(\d+)\,(\d+)$! or die "$blk\n";
    my ($blk_start, $blk_end) = ($1, $2);
    push @cds_lines, join("\t", $seq_id, "blk", "CDS", $blk_start, $blk_end, ".", $strand, ".", "Parent=$mrna_id");
    if (!defined $span_start or $span_start > $blk_start) { $span_start = $blk_start; }
    if (!defined $span_end   or $span_end   < $blk_end)   { $span_end   = $blk_end;   }
  }

  print join("\t", $seq_id, "blk", "gene", $span_start, $span_end, ".", $strand, ".", "ID=$mrna_id-G")."\n";
  print join("\t", $seq_id, "blk", "mRNA", $span_start, $span_end, ".", $strand, ".", "ID=$mrna_id;Parent=$mrna_id-G")."\n";
  print "$_\n" for @cds_lines;
}
and next; 16 | $ifpan[$i-7] ++; 17 | } 18 | } 19 | for (my $i=0; $i<4; $i++) { 20 | $ifpan[$i] //= 0; 21 | } 22 | print join("\t", @ta[0..6], @ifpan)."\n"; 23 | } 24 | -------------------------------------------------------------------------------- /annot_tools/liftoff_tools/cnvt_gff_to_cdsBed.pl: -------------------------------------------------------------------------------- 1 | #!/usr/bin/perl 2 | use strict; 3 | use warnings; 4 | 5 | my $htxt = < $_.CDS.bed 17 | 18 | my $opref = shift; 19 | 20 | open OP,'>',"$opref.p.CDS.bed" or die; 21 | open OM,'>',"$opref.m.CDS.bed" or die; 22 | my %h; 23 | while (<>) { 24 | m!^\s*(#|$)! and next; 25 | chomp; 26 | my @ta=split(/\t/, $_); 27 | $ta[2] =~ m!^CDS$!i or next; 28 | $ta[8] =~ m!Parent=([^\s;]+)! or die "$_\n"; 29 | $h{$1} ++; 30 | if ($ta[6] eq '+') { 31 | print OP join("\t", $ta[0], $ta[3]-1, $ta[4], "${1}_HS$h{$1}", $1)."\n"; 32 | } elsif ($ta[6] eq '-') { 33 | print OM join("\t", $ta[0], $ta[3]-1, $ta[4], "${1}_HS$h{$1}", $1)."\n"; 34 | } else { 35 | die "[Err] Bad str line: $_\n"; 36 | } 37 | } 38 | 39 | -------------------------------------------------------------------------------- /annot_tools/liftoff_tools/filter_R2Q_liftoff_tbl.pl: -------------------------------------------------------------------------------- 1 | #!/usr/bin/perl 2 | use strict; 3 | use warnings; 4 | 5 | my @a1; 6 | while (<>) { 7 | chomp; 8 | my @ta=split(/\t/, $_); 9 | $ta[0] eq 'trans_R_ID' and do { print "$_\n"; next; }; 10 | # (1) Require coverage > 0.9; 11 | $ta[1] >= 0.9 or next; 12 | # (2) Require at least 100 bp or 50% of shorter CDS overlapping. or no overlapping to any gene. 13 | $ta[9] eq '.' 
#!/usr/bin/perl
# [3/25/2022] The column number may be different!!!
# [9/28/2022] In some bedtools version, unmapped is presented as '-1' instead of '.'.
# [2/13/2025] Allow empty file in .m. bedtools intersections.
#
# Sum the values of column $cN_ovlLen per (column $cN_mID_1, column $cN_mID_2)
# pair and per column-$cN_mID_1 ID, then print the pairs sorted by their sum
# (largest first). Column indexes are 0-based.
use strict;
use warnings;

-t and !@ARGV and die "perl $0 bedtools_intersect_wao.out > bedtools_intersect_wao.out.tbl\n";

my $cN_mID_1  = 4;
my $cN_mID_2  = 9;
my $cN_ovlLen = 10;

my %h;              # {ID_1}{ID_2} => summed value of the $cN_ovlLen column
my %trans_cdsLen;   # {ID_1}       => the same column summed over all lines of ID_1
while (my $line = <>) {
  chomp $line;
  my @col = split(/\t/, $line);
  # A line that is two columns short gets two '.' placeholders inserted in
  # front of its last column.
  if (scalar(@col) == $cN_ovlLen - 1) {
    my $last = pop @col;
    push @col, '.', '.', $last;
  }
  my ($id1, $id2, $ovl) = @col[$cN_mID_1, $cN_mID_2, $cN_ovlLen];
  $id2 = '.' if $id2 eq '-1';
  $h{$id1}{$id2}      += $ovl;
  $trans_cdsLen{$id1} += $ovl;
}

my @o1;   # [mID_1, mID_2, overlap_size]
for my $id1 (keys %h) {
  my @partners = keys %{ $h{$id1} };
  for my $id2 (@partners) {
    # The '.' placeholder is only reported when it is the sole partner.
    next if $id2 eq "." and scalar(@partners) > 1;
    push @o1, [$id1, $id2, $h{$id1}{$id2}];
  }
}

print STDOUT join("\t", qw/trans_mrnaID tgt_mrnaID trans_cdsLen trans_ovlLen/)."\n";
for my $rec (sort { $b->[2] <=> $a->[2] } @o1) {
  my ($id1, $id2, $sum) = @$rec;
  print STDOUT join("\t", $id1, $id2, $trans_cdsLen{$id1}, $sum)."\n";
}
#!/usr/bin/perl
# For every input line, keep column 1, then rewrite the members found from
# column 3 onwards: members shaped like "seqID:start-end:strand" are treated
# as loci, everything else as plain IDs. Loci are visited from the longest
# span to the shortest, and a locus is dropped when it overlaps an already
# kept locus on the same sequence and strand. Output:
#   column 1 <tab> number of kept members <tab> plain IDs... <tab> kept loci...
use strict;
use warnings;

!@ARGV and die "perl $0 comb.grp2 > comb.grp2.novl_loc\n";

while (my $line = <>) {
  chomp $line;
  my @ta = split(/\t/, $line);

  my (@plain_ids, @loci);
  for my $member (@ta[2..$#ta]) {
    if (my ($seq, $start, $end, $strand) = $member =~ m!^(\S+):(\d+)\-(\d+):([+-])$!) {
      push @loci, { seq => $seq, start => $start, end => $end, strand => $strand, span => $end - $start + 1 };
    } else {
      push @plain_ids, $member;
    }
  }

  my @kept;
  for my $cand (sort { $b->{span} <=> $a->{span} } @loci) {
    my $overlaps = 0;
    for my $prev (@kept) {
      next if $cand->{seq}    ne $prev->{seq};
      next if $cand->{strand} ne $prev->{strand};
      next if $cand->{start} > $prev->{end};
      next if $cand->{end}   < $prev->{start};
      $overlaps = 1;
      last;
    }
    push @kept, $cand unless $overlaps;
  }

  my @kept_txt = map { "$_->{seq}:$_->{start}-$_->{end}:$_->{strand}" } @kept;
  print STDOUT join("\t", $ta[0], scalar(@plain_ids)+scalar(@kept), @plain_ids, @kept_txt)."\n";
}
### Print (do not run) the blastp command that searches a query FASTA against
### the database configured in dbFa.
### Usage: bash mkCmd_blast2Nr.sh query.fasta [cpuN]

### Basic functions.
# exe_cmd <command string>: log the command, run it through eval, log completion.
function exe_cmd {
	echo "[$(date)][CMD] $1"
	# Quoted so the command string is not word-split or glob-expanded before eval sees it.
	eval "$1"
	echo "[$(date)][Rec] Done."
}

# tsmsg <text>: print <text> prefixed with a time stamp.
function tsmsg {
	echo "[$(date)]$1"
}

dbFa='/share/nas2/xigua/sunhonghe/database/db_blast/ncbi/nr'
dbTag='toNr'

# Without a query file the printed command would carry empty -query/-out values.
if [ $# -lt 1 ]; then
	echo "Usage: bash $0 query.fasta [cpuN]" >&2
	exit 1
fi

qryFa=$1
cpuN=${2:-10}

echo "blastp -outfmt 11 -db $dbFa -evalue 1e-3 -num_alignments 20 -seg yes -num_threads $cpuN -query $qryFa -out $qryFa.$dbTag.asn.1"

# blastp -outfmt 5 -db nr -evalue 1e-3 -num_alignments 20 -seg yes -num_threads 20 -query wcgP_cutted/wcgP_00033.fasta -out wcgP_cutted/wcgP_00033.fasta.xml
#!/usr/bin/perl
# Read a 4-column table (ID, group key, start, end) and, within every group,
# print "ID_i <tab> ID_j" for each record j that follows record i in
# (start, end) order and overlaps it. A record without any such later partner
# is printed as "ID_i <tab>" with an empty second column.
use strict;
use warnings;

my %loc;   # {group key} => [ [ID, start, end], ... ]
while (my $line = <>) {
	chomp $line;
	my ($id, $grp, $start, $end) = (split(/\t/, $line))[0 .. 3];
	push @{ $loc{$grp} }, [$id, $start, $end];
}

for my $grp (sort keys %loc) {
	my @sorted = sort { $a->[1] <=> $b->[1] || $a->[2] <=> $b->[2] } @{ $loc{$grp} };
	for my $i (0 .. $#sorted) {
		my $cur = $sorted[$i];
		my $has = 0;
		for my $j ($i + 1 .. $#sorted) {
			my $nxt = $sorted[$j];
			# Records are ordered by start: once one starts behind the end
			# of the current record, none of the following can overlap it.
			last if $cur->[2] < $nxt->[1];
			unless ($cur->[2] >= $nxt->[1] and $cur->[1] <= $nxt->[2]) {
				die "@{$cur}\n@{$nxt}\n";
			}
			print "$cur->[0]\t$nxt->[0]\n";
			$has = 1;
		}
		print "$cur->[0]\t\n" if $has == 0;
	}
}
#! /usr/bin/perl
use strict;
use warnings;

my $usage = "Installer.pl -m hmmerDIR -p ProtexcluderDIR\n";

# to install Protexcluder
#
# Every *.npl file in the current directory is copied to a *.pl file of the
# same base name, with each "_hmmer_" replaced by the -m value and each
# "_prexc_" replaced by the -p value. The new file is made executable (0755).

use Getopt::Std;

my %opt;
getopts("m:p:", \%opt) or die $usage;

my $hmmerDIR = defined $opt{m} ? $opt{m} : "";

my $prexcDIR = defined $opt{p} ? $opt{p} : "";

my @Raw_Files = glob "*.npl";
foreach my $NPL (@Raw_Files) {
	# Only the trailing extension is rewritten.
	(my $PL = $NPL) =~ s/\.npl$/.pl/;
	open(my $rf, '<', $NPL) || die "$!\n";
	open(my $pl, '>', $PL)  || die "$!\n";
	while (my $Line = <$rf>) {
		chomp $Line;

		$Line =~ s/_hmmer_/$hmmerDIR/g;
		$Line =~ s/_prexc_/$prexcDIR/g;

		print {$pl} "$Line\n";
	}
	close($rf);
	close($pl) || die "Cannot finish writing $PL: $!\n";

	# Built-in chmod: no shell is involved, so odd file names are safe.
	chmod(0755, $PL) || warn "Cannot chmod $PL: $!\n";
}

print "Install finished!\n";
print "If you input the wrong path, you can do it again with corrected paths\n";
print "--------------------------- Have a nice day! -------------------------\n\n\n";
#! /usr/bin/perl -w
use strict;

my $usage = "getanycolumnuni.pl file column wanted\n";

# to get a list from a given column in a text file
# redundancy is excluded if the same items are next to each other
#
# The column number is 1-based; lines are split on whitespace. A line that is
# too short to hold the column is handled as an empty item.

if (@ARGV < 2) {die "$usage";}

my ($file, $column) = @ARGV;
# Column 0 would silently address the last field (index -1).
($column =~ /^\d+$/ and $column >= 1)
    or die "The column must be a positive integer, got '$column'\n$usage";

open(my $msp, '<', $file) || die "Can not open the input MSP file $file\n$usage";

my $i = $column - 1;
my $lquery = "";

while (<$msp>) {
    my @line = split;
    my $item = defined $line[$i] ? $line[$i] : "";
    if ($item ne $lquery) {
	printf "%s\n", $item;
    }
    $lquery = $item;
}
close $msp;
#! /usr/bin/perl
use strict;
use warnings;

my $usage = "mergequeryBF.pl BF maximum gap to merge\n";

# to merge matched region in query if they are within given distance
#
# The input is sorted by column 6, then numerically by column 3, into
# "<BF>s". Consecutive lines with the same column-6 name whose column-3 start
# lies no more than <maximum gap> behind the current region end are merged.
# One line is printed per merged region:
#   name, region start, region end, column 5 and column 10 of the region's last line.

if (@ARGV < 2) {die "$usage";}

my ($bf_file, $max_gap) = @ARGV;
my $sorted_file = "${bf_file}s";

# List form: the file names never pass through a shell.
system('sort', '-k', '6,6', '-k', '3,3n', '-o', $sorted_file, $bf_file) == 0
    or die "Can not sort $bf_file into $sorted_file\n$usage";

open(my $msp, '<', $sorted_file) || die $usage;

my ($lTE, $start, $end, $llen, $lsubj);
while (<$msp>) {

    if (/^\s*\d+\s+\d+\s+(\d+)\s+(\d+)\s+(\d+)\s+(\S+)\s+\d+\s+\d+\s+\d+\s+(\S+)\s*/) {
	my ($m_start, $m_end, $m_len, $m_te, $m_subj) = ($1, $2, $3, $4, $5);
	if (!defined $lTE) {
	    # First record: open a region, there is nothing to print yet.
	    $start = $m_start;
	    $end = $m_end;
	}
	elsif ($m_te ne $lTE or ($m_start - $end) > $max_gap) {
	    printf "%-30s %06d %06d %06d %s\n", $lTE, $start,$end,$llen,$lsubj;
	    $start = $m_start;
	    $end = $m_end;
	}
	elsif ($m_end > $end) {
	    $end = $m_end;
	}
	$llen = $m_len;
	$lTE = $m_te;
	$lsubj = $m_subj;
    }
}

close $msp;

# Flush the last open region; nothing is printed when no line matched.
if (defined $lTE) {
    printf "%-30s %06d %06d %06d %s\n", $lTE, $start,$end,$llen,$lsubj;
}
#! /usr/bin/perl -w
use strict;

my $usage = "mergeunmatchedregion.pl seqfile\n";

# to merge multiple pieces from a single sequence into one piece
#
# Header lines of the form ">name<sep>from-to rest" are reduced to
# ">name\t rest" and printed only when the name differs from the previous
# header's name; all other lines are printed unchanged.

if (@ARGV < 1) {die $usage;}

open(my $db, '<', $ARGV[0]) || die "Can not open the seqfile $ARGV[0]\n$usage";

my $lTE = "";
while (<$db>) {
    if (/^>(\S+)\D\d+-\d+\s*(.*)$/){
	my ($name, $rest) = ($1, $2);
	if ($name ne $lTE) {
	    printf ">%s\t %s\n", $name, $rest;
	}
	$lTE = $name;
    } else {print;}
}

close $db;
{print $_, "failure\n\n"; $failure++;} 33 | } 34 | close MSP; 35 | -f $ARGV[3] or `echo "" > $ARGV[3]`; 36 | 37 | if ($failure) { 38 | print "Total failed case: ", $failure; 39 | } 40 | -------------------------------------------------------------------------------- /annot_tools/repAnno_tools/ProtExcluder1.1/rmlistedseq.pl: -------------------------------------------------------------------------------- 1 | #! /usr/bin/perl -w 2 | 3 | $usage = "rmlistedseq.pl namelist fasta file \n"; 4 | 5 | # This is for removing a subset of sequences from a seqfile 6 | 7 | if (@ARGV < 2) {die $usage;} 8 | 9 | open(RM, "$ARGV[0]") || die $usage; 10 | 11 | $ct = 0; 12 | 13 | while (<RM>) { 14 | if (/^>*(\S+)\s*/) { 15 | $seq{$ct} = $1; 16 | $ct ++; 17 | } 18 | } 19 | close RM; 20 | 21 | open(FA, "$ARGV[1]") || die $usage; 22 | 23 | while (<FA>) { 24 | if (/^>(\S+)\s*/) { 25 | if (&comparison) { 26 | $take = 0; 27 | } 28 | else { 29 | $take = 1; 30 | } 31 | } 32 | if ($take) { 33 | print; 34 | } 35 | } 36 | 37 | close FA; 38 | 39 | 40 | 41 | 42 | 43 | sub comparison { 44 | foreach $key (keys %seq){ 45 | if ($1 eq $seq{$key}){ 46 | return 1; 47 | } 48 | } 49 | 50 | } 51 | 52 | 53 | 54 | -------------------------------------------------------------------------------- /annot_tools/repAnno_tools/ProtExcluder1.1/rmlowcomfromBF.pl: -------------------------------------------------------------------------------- 1 | #!
/usr/bin/perl 2 | use strict; 3 | use warnings; 4 | 5 | my $usage = "rmlowcomfromMSP.pl mtca BFfile\n"; 6 | 7 | if (@ARGV < 2) {die $usage;} 8 | 9 | # This is for finding the items in file two but not in file one 10 | 11 | open(RM, "$ARGV[0]") || die "Cannot open $ARGV[0]"; 12 | 13 | my $ct = 0; 14 | my ($lpr, $lseq) = ('', ''); 15 | my (%pr, %seq); 16 | while (<RM>) { 17 | if (/^>(\S+)\s+(\S+)\s*/) { 18 | unless ($1 eq $lpr && $2 eq $lseq) { 19 | $pr{$ct} = $1; 20 | $seq{$ct} = $2; 21 | $ct ++; 22 | } 23 | $lpr = $1; 24 | $lseq = $2; 25 | } 26 | } 27 | 28 | open(MSP, "$ARGV[1]") || die "Cannot open $ARGV[1]"; 29 | while (<MSP>) { 30 | if (/^\d+\s+\d+\s+\d+\s+\d+\s+\d+\s+(\S+)\s+\d+\s+\d+\s+\d+\s+(\S+)\s*/) { 31 | if (&comparison) { 32 | print; 33 | } 34 | } 35 | } 36 | close MSP; 37 | 38 | sub comparison { 39 | foreach my $key (keys %pr){ 40 | if ($1 eq $seq{$key} && $2 eq $pr{$key} ) { return 1; } 41 | } 42 | } 43 | 44 | 45 | -------------------------------------------------------------------------------- /annot_tools/repAnno_tools/ProtExcluder1.1/rmlowcomplexitymathc.pl: -------------------------------------------------------------------------------- 1 | #! /usr/bin/perl 2 | 3 | $usage = "gcalongenes.pl gc3 file aa number minimum percent\n"; 4 | 5 | 6 | if (@ARGV < 3) {die $usage;} 7 | 8 | 9 | open(MSP, "$ARGV[0]") || die $usage; 10 | 11 | while (<MSP>){ 12 | @line = split; 13 | $total = 0; 14 | $i = 3; 15 | $j = $i + $ARGV[1]; 16 | while ($i < $j) { 17 | $total = $total + $line[$i]; 18 | $i ++; 19 | } 20 | if ($line[2]) { 21 | $rate = $total*100/$line[2]; 22 | } 23 | if ($rate < $ARGV[2]) { 24 | print; 25 | } 26 | 27 | } 28 | 29 | close MSP; 30 | 31 | 32 | -------------------------------------------------------------------------------- /annot_tools/repAnno_tools/ProtExcluder1.1/rmshortseq_noN.pl: -------------------------------------------------------------------------------- 1 | #!
/usr/bin/perl 2 | 3 | $usage = "rmshortseq.pl stfile fastafile minmusize\n"; 4 | 5 | # to delete sequences shorter than the given size 6 | 7 | if (@ARGV < 3) {die "$usage";} 8 | 9 | open(LENGTH, "$ARGV[0]") || die $usage; 10 | 11 | while (<LENGTH>){ 12 | if (/^\s*(\S+)\s+(\d+)\s+(\d+)\s+(\d+)\s+(\d+)\s/) { 13 | $nonN = $2 + $3 + $4 + $5; 14 | if ($nonN >= $ARGV[2]) { 15 | $name = $1; 16 | $len{$name} = $nonN; } 17 | } 18 | } 19 | close LENGTH; 20 | 21 | open(FASTA, "$ARGV[1]") || die $usage; 22 | while (<FASTA>) { 23 | 24 | if (/^>(\S+)s*/) { 25 | 26 | if (&comparison){$take = 1;} 27 | else {$take = 0;} 28 | } 29 | if ($take){ 30 | print; 31 | } 32 | } 33 | close FASTA; 34 | 35 | sub comparison { 36 | foreach $key (keys %len){ 37 | if ($key eq $1) 38 | { return 1;} 39 | } 40 | } 41 | 42 | -------------------------------------------------------------------------------- /annot_tools/repAnno_tools/ProtExcluder1.1/unmatchedregionBF.pl: -------------------------------------------------------------------------------- 1 | #!
/usr/bin/perl 2 | 3 | $usage = "unmatchedregionBF.pl BFm50 bpflanking\n"; 4 | 5 | # to extract the unmatched portion of the sequence 6 | 7 | if (@ARGV < 2) {die "$usage";} 8 | 9 | open(BF, "$ARGV[0]") || die $usage; 10 | 11 | while (<BF>) { 12 | 13 | if (/^(\S+)\s+(\d+)\s+(\d+)\s+(\d+)\s+\S+\s*/) { 14 | $head = $lend + $ARGV[1]; 15 | if ($1 ne $lTE) { 16 | if ($head < $llen) { 17 | printf "1000 85 1 200 query %06d %06d %s\n", $head, $llen,$lTE; 18 | } 19 | if ($2 > $ARGV[1]) { 20 | printf "1000 85 1 200 query 000001 %06d %s\n", ($2-$ARGV[1]),$1; 21 | } 22 | } 23 | elsif ($head < ($2 - $ARGV[1])) { 24 | printf "1000 85 1 200 query %06d %06d %s\n", $head, ($2-$ARGV[1]),$1; 25 | } 26 | $lTE = $1; 27 | $lend = $3; 28 | $llen = $4; 29 | } 30 | } 31 | 32 | close BF; 33 | 34 | $head = $lend + $ARGV[1]; 35 | if ($head < $llen) { 36 | printf "1000 85 1 200 query %06d %06d %s\n", $head, $llen,$lTE; 37 | } 38 | -------------------------------------------------------------------------------- /annot_tools/repAnno_tools/add_repClass.pl: -------------------------------------------------------------------------------- 1 | #!/usr/bin/perl 2 | use strict; 3 | use warnings; 4 | use LogInforSunhh; 5 | 6 | !@ARGV and die "perl $0 RepClass in.fa\n"; 7 | 8 | my $tag = shift; 9 | &tsmsg("[Rec] Add [$tag] to repeat name.\n"); 10 | while (<>) { 11 | if (m/^\s*>/) { 12 | s/^>(\S+)/>$1#$tag/ or &stopErr("[Err] $_"); 13 | } 14 | print STDOUT $_; 15 | } 16 | -------------------------------------------------------------------------------- /annot_tools/repAnno_tools/cmd_list_forFinalRepDB: -------------------------------------------------------------------------------- 1 | cat allLTR.lib.noProtFinal.classified use_MITE.lib.noProtFinal.classified ModelerID.lib.noProtFinal Modelerunknown.lib.noProtFinal | deal_fasta.pl -frag_head -frag 0-0 -frag_width 100 | deal_fasta.pl -chopKey ':\d+\-\d+$' | perl -e ' while (<>) { s!\t!
!g; print; } ' > allRepeats_v1.lib 2 | cat allRepeats_v1.lib | deal_fasta.pl -nres 'Unknown$' > KnownRepeats_v1.lib 3 | 4 | -------------------------------------------------------------------------------- /annot_tools/repAnno_tools/detect_centromere/get_candidate_cent_from_whole.pl: -------------------------------------------------------------------------------- 1 | #!/usr/bin/perl 2 | use strict; 3 | use warnings; 4 | 5 | !@ARGV and die "perl $0 num_chr min_monomer_len whole/after_clusterin_summary_whole.txt.cntChr > whole/after_clusterin_summary_whole.txt.cntChr.slct\n"; 6 | 7 | my $nChr = shift; 8 | my $min_monomer_len = shift; 9 | 10 | while (<>) { 11 | chomp; 12 | my @ta=split(/\t/, $_); 13 | $ta[0] eq "CL_ID" and next; 14 | $ta[4] == $nChr or next; 15 | $ta[2] >= $min_monomer_len or next; 16 | my @a1 = split(/;/, $ta[5]); 17 | my @a2 = split(/;/, $ta[6]); 18 | for (my $i=0; $i<@a1; $i++) { 19 | my @a3 = split(/\<\*\>/, $a2[$i]); 20 | print STDOUT join("\t", $a2[$i], "$a1[$i]__$ta[0]", $a1[$i], $ta[0], $a3[2], $a3[3])."\n"; 21 | } 22 | } 23 | 24 | -------------------------------------------------------------------------------- /annot_tools/repAnno_tools/filter_RepMsk_out.pl: -------------------------------------------------------------------------------- 1 | #!/usr/bin/perl 2 | use strict; 3 | use warnings; 4 | use LogInforSunhh; 5 | 6 | !@ARGV and die "perl $0 in_RepMsk.out\n"; 7 | 8 | while (<>) { 9 | unless ( m/^\s*\d+/ ) { 10 | # print; 11 | next; 12 | } 13 | chomp; 14 | s/^\s+//; s/\s+$//; 15 | my @ta = split(/\s+/, $_); 16 | my $name1=$ta[4]; 17 | my $name2=$ta[9]; 18 | $name1 =~ m/^RR\d+_(seq\d+)_(\d+)_(\d+)_INN_(\S+)$/ or die "name1=$name1\n"; 19 | my @nn1 = ($1,$2,$3,$4); 20 | $name2 =~ m/^([^\s:]+):(\d+)\-(\d+):([FR])$/ or die "$name2\n"; 21 | my @nn2 = ($1,$2,$3,$4); 22 | my $a = 0; 23 | if ( $nn1[3] eq $nn2[0] ) { 24 | $nn1[1]-1 == $nn2[2] and $a = 1; 25 | $nn1[2]+1 == $nn2[1] and $a = 1; 26 | } 27 | $a == 1 or print STDOUT "$_\n"; 28 | } 29 | 
-------------------------------------------------------------------------------- /annot_tools/repAnno_tools/mk_outID.pl: -------------------------------------------------------------------------------- 1 | #!/usr/bin/perl 2 | use strict; 3 | use warnings; 4 | # seqID LTR1_S LTR1_E LTR2_S LTR2_E Inner_S Inner_E PBS_S PBS_E Strand scfID 5 | # seq10 785209 785369 788589 788749 785370 788588 785373 785384 + S400016_pilon 6 | 7 | while (<>) { 8 | chomp; 9 | my @ta = split(/\t/, $_); 10 | print STDOUT join("\t", $_, "$ta[0]_$ta[1]_$ta[4]_$ta[10]")."\n"; 11 | } 12 | -------------------------------------------------------------------------------- /annot_tools/repAnno_tools/muscle3.8.31_i86linux64: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Sunhh/NGS_data_processing/7c202e9c150708c32c2d2a852bb7925adceb70f7/annot_tools/repAnno_tools/muscle3.8.31_i86linux64 -------------------------------------------------------------------------------- /annot_tools/repAnno_tools/name_from_tab.pl: -------------------------------------------------------------------------------- 1 | #!/usr/bin/perl 2 | use strict; 3 | use warnings; 4 | use LogInforSunhh; 5 | 6 | -t and !@ARGV and die "perl $0 dgt.tab\n"; 7 | 8 | # eleID eleS eleE Str seqID LTR1_S LTR1_E LTR2_S LTR2_E Inner_S Inner_E PBS_S PBS_E PPT_S PPT_E scfID 9 | # RR1 98334 105001 ? seq10 98339 99198 104137 104996 99199 104136 -1 -1 -1 -1 S400016_pilon 10 | # RR2 785204 788754 + seq10 785209 785369 788589 788749 785370 788588 785373 785384 -1 -1 S400016_pilon 11 | 12 | while (<>) { 13 | chomp; 14 | my @ta = split(/\t/, $_); 15 | if ( $ta[0] eq 'eleID' ) { 16 | # print STDOUT "$_\n"; 17 | next; 18 | } 19 | my ($inner_s, $inner_e, $pbs_s, $pbs_e, $ppt_s, $ppt_e) = @ta[9,10, 11,12, 13,14]; 20 | my ($eleID, $seqID, $scfID) = @ta[0,4,15]; 21 | my $tk1 = "${eleID}_${seqID}_$ta[1]_$ta[2]_ELE_$scfID"; # Element region. 
(with TSD) 22 | my $tk2 = "${eleID}_${seqID}_$ta[5]_$ta[8]_LTR_$scfID"; # LTR region. (without TSD) 23 | my $tk3 = "${eleID}_${seqID}_$ta[9]_$ta[10]_INN_$scfID"; # Internal region. (without ltr region) 24 | print STDOUT "$tk1\t${eleID}_\t${eleID}\n"; 25 | print STDOUT "$tk2\t${eleID}_\t${eleID}\n"; 26 | print STDOUT "$tk3\t${eleID}_\t${eleID}\n"; 27 | } 28 | 29 | -------------------------------------------------------------------------------- /annot_tools/repAnno_tools/path.conf: -------------------------------------------------------------------------------- 1 | dir2 /home/Sunhh/tools/github/NGS_data_processing 2 | pl_deal_fasta __dir2__/deal_fasta.pl 3 | pl_deal_table __dir2__/deal_table.pl 4 | 5 | dir1 /home/Sunhh/tools/github/NGS_data_processing/repAnno_tools 6 | pl_ch_gff_to_tab __dir1__/ch_gff_to_tab.pl 7 | pl_ch_seqID __dir1__/ch_seqID.pl 8 | pl_filter_tab_byPBSPPT __dir1__/filter_tab_byPBSPPT.pl 9 | pl_name_from_tab __dir1__/name_from_tab.pl 10 | pl_filter_flank __dir1__/filter_flank.pl 11 | pl_filter_RepMsk_out __dir1__/filter_RepMsk_out.pl 12 | pl_build_Examplar_byFa __dir1__/build_Examplar_byFa.pl 13 | pl_lis_masked_RepMsk_out __dir1__/lis_masked_RepMsk_out.pl 14 | pl_get_LTR_wi_Termi __dir1__/get_LTR_wi_Termi.pl 15 | 16 | exe_RepeatMasker /data/Sunhh/src/Annot/repeatmasker/RepeatMasker/RepeatMasker 17 | exe_gt /data/Sunhh/src/Annot/genometools/gt-1.5.3-complete/bin/gt 18 | exe_makeblastdb /usr/local/bin/makeblastdb 19 | exe_blastn /usr/local/bin/blastn 20 | exe_mv /usr/bin/mv 21 | 22 | eu_tRNA /data/Sunhh/P1_repeat/db/eukaryotic-tRNAs.fa 23 | ref_dbLTR_ltr99 LTR99_named.lib 24 | ref_dbLTR_trim99 TRIM99_named.lib 25 | refFa P1Genom_Gt5h.scf.fa 26 | refIdx P1GenomeGt5hScf 27 | -------------------------------------------------------------------------------- /annot_tools/repAnno_tools/rmOutToGFF3_with_TEclass.pl: -------------------------------------------------------------------------------- 1 | #!/usr/bin/perl 2 | use strict; 3 | use warnings; 4 | 5 | if 
(@ARGV < 1) { 6 | die "Usage: $0 \n"; 7 | } 8 | 9 | my $file = $ARGV[0]; 10 | open my $in, '<', $file or die "Cannot open $file: $!\n"; 11 | 12 | while (<$in>) { 13 | next if /^#/; 14 | next unless /^\s*\d+/; 15 | 16 | my @fields = split ' '; 17 | my ($score, $div, $del, $ins, $query, $q_start, $q_end, $q_left, $strand, 18 | $repeat, $class, $r_start, $r_end, $r_left, $id) = @fields; 19 | 20 | if ($strand eq "C") { 21 | $strand = "-"; 22 | ($r_start, $r_end) = ($r_left, $r_end); 23 | } else { 24 | $strand = "+"; 25 | } 26 | my $sim = sprintf("%0.1f", 100-$div); 27 | 28 | # my $attributes = "ID=$id;Target=$repeat $r_start $r_end;Class=$class"; 29 | my $attributes = "Target=$repeat $r_start $r_end;Classification=$class;Identity=$sim"; 30 | my $source = "RepeatMasker"; 31 | my $type = "dispersed_repeat"; 32 | 33 | print join("\t", $query, $source, $type, $q_start, $q_end, $score, $strand, ".", $attributes), "\n"; 34 | } 35 | 36 | close $in; 37 | -------------------------------------------------------------------------------- /annot_tools/repAnno_tools/run_MITE.sh: -------------------------------------------------------------------------------- 1 | ### Basic functions. 2 | function exe_cmd { 3 | echo "[$(date)][CMD] $1" 4 | eval "$1" 5 | echo "[$(date)][Rec] Done." 6 | } 7 | 8 | function tsmsg { 9 | echo "[$(date)]$1" 10 | } 11 | 12 | pl_dealFa=$HOME/tools/github/NGS_data_processing/deal_fasta.pl 13 | pl_addClass=$HOME/tools/github/NGS_data_processing/repAnno_tools/add_repClass.pl 14 | 15 | pl_MITE=$HOME/src/Annotation/MITE_Hunter/MITE_Hunter_blast216/MITE_Hunter_manager.pl 16 | 17 | refFa='P1All.scf.fa' 18 | outG='P1AllGt5hScf' 19 | cpuN=10 20 | grpN=10 21 | 22 | tsmsg "Start." 23 | 24 | # exe_cmd "mkdir running/ Step8/" 25 | 26 | [ -d "running" ] || exe_cmd "mkdir running" 27 | [ -d "Step8" ] || exe_cmd "mkdir Step8" 28 | 29 | cd running 30 | ln -s ../$refFa . 
31 | 32 | exe_cmd "perl $pl_MITE -c $cpuN -n $grpN -S 12345678 -i $refFa -g $outG" 33 | exe_cmd "cp -p *_Step8*.fa ../Step8/" 34 | exe_cmd "cat ${outG}_Step8*.fa > ../MITE_raw.lib" 35 | exe_cmd "perl $pl_dealFa ../MITE_raw.lib -frag_head -frag_width 80 -frag 0-0 | perl $pl_dealFa -chopKey ':\\d+-\\d+\$' | perl $pl_addClass MITE > ../MITE_named.lib" 36 | 37 | cd ../ 38 | 39 | tsmsg "All done." 40 | -------------------------------------------------------------------------------- /annot_tools/repAnno_tools/run_repClass_ltr.sh: -------------------------------------------------------------------------------- 1 | ### Basic functions. 2 | function exe_cmd { 3 | echo "[$(date)][CMD] $1" 4 | eval $1 5 | echo "[$(date)][Rec] Done." 6 | } 7 | 8 | function tsmsg { 9 | echo "[$(date)]$1" 10 | } 11 | 12 | exe_cmd "/share/app/Annotation/repeatmodeler/RepeatModeler/RepeatClassifier -consensi allLTR_rmGen_chop.fa -engine ncbi" 13 | -------------------------------------------------------------------------------- /annot_tools/repAnno_tools/run_repClass_mite.sh: -------------------------------------------------------------------------------- 1 | ### Basic functions. 2 | function exe_cmd { 3 | echo "[$(date)][CMD] $1" 4 | eval $1 5 | echo "[$(date)][Rec] Done." 6 | } 7 | 8 | function tsmsg { 9 | echo "[$(date)]$1" 10 | } 11 | 12 | exe_cmd "/share/app/Annotation/repeatmodeler/RepeatModeler/RepeatClassifier -consensi MITE_rmGen_chop.fa -engine ncbi" 13 | -------------------------------------------------------------------------------- /annot_tools/repAnno_tools/run_repeatmasker.sh: -------------------------------------------------------------------------------- 1 | ### Basic functions. 2 | function exe_cmd { 3 | echo "[$(date)][CMD] $1" 4 | eval $1 5 | echo "[$(date)][Rec] Done." 
6 | } 7 | 8 | function tsmsg { 9 | echo "[$(date)]$1" 10 | } 11 | 12 | exe_rpmk="/data/Sunhh/src/annotation/repeatmasker/RepeatMasker/RepeatMasker" 13 | pl_buildSum="/data/Sunhh/src/annotation/repeatmasker/RepeatMasker/util/buildSummary.pl" 14 | 15 | cpuN=40 16 | repLib="D202306.TElib.fa" 17 | seqfile="hap1.fa" 18 | tsv_genom="$seqfile.tsv" 19 | 20 | tsmsg "[Rec] All start." 21 | 22 | deal_fasta.pl -baseCount $seqfile | awk 'NR > 1 {print $1"\t"$8-$6}' > $tsv_genom 23 | 24 | exe_cmd "$exe_rpmk -s -x -lib $repLib $seqfile -nolow -norna -no_is -pa $cpuN -a 1>$seqfile.stdout_RepMsk 2>$seqfile.stderr_RepMsk" 25 | exe_cmd "perl $pl_buildSum -useAbsoluteGenomeSize -genome $tsv_genom $seqfile.out > $seqfile.out.NuclGenom.summary" 26 | 27 | tsmsg "[Rec] All done." 28 | 29 | -------------------------------------------------------------------------------- /annot_tools/repAnno_tools/run_rm_GF.sh: -------------------------------------------------------------------------------- 1 | ### Basic functions. 2 | function exe_cmd { 3 | echo "[$(date)][CMD] $1" 4 | eval "$1" 5 | echo "[$(date)][Rec] Done." 6 | } 7 | 8 | function tsmsg { 9 | echo "[$(date)]$1" 10 | } 11 | 12 | pl_GF="$HOME/tools/github/NGS_data_processing/repAnno_tools/rm_geneFrag.pl" 13 | pl_PE="/data/Sunhh/P1_annot/01.P1_repeat/05.rmGeneFrag/tools/ProtExcluder1.1/ProtExcluder.pl" 14 | pl_dealFa="$HOME//tools/github/NGS_data_processing/deal_fasta.pl" 15 | 16 | dbProt='uniprot_sprot_plants_rmTransProt.fa' 17 | dbProt='/data/Sunhh/database/db_fasta/uniprot/20140917/uniprot_sprot_plants_rmTransProt.fa' 18 | 19 | inLibLis="inLibLis" 20 | cpuN=30 21 | 22 | tsmsg "[Rec] Start" 23 | 24 | exe_cmd "perl $pl_GF -evalue 1e-2 -rawLibLis $inLibLis -cpuN $cpuN -dbProt $dbProt -pl_ProtExcluder $pl_PE -pl_dealFa $pl_dealFa" 25 | 26 | tsmsg "[Rec] All done." 
27 | 28 | -------------------------------------------------------------------------------- /annot_tools/repAnno_tools/run_rpmd.sh: -------------------------------------------------------------------------------- 1 | ### Basic functions. 2 | function exe_cmd { 3 | echo "[$(date)][CMD] $1" 4 | eval "$1" 5 | echo "[$(date)][Rec] Done." 6 | } 7 | 8 | function tsmsg { 9 | echo "[$(date)]$1" 10 | } 11 | 12 | 13 | dir_rpmd='/data/Sunhh/src/Annot/repeatmodeler/RepeatModeler' 14 | exe_rpmd="$dir_rpmd/RepeatModeler" 15 | exe_rpmdBD="$dir_rpmd/BuildDatabase" 16 | exe_rpmk='/data/Sunhh/src/Annot/repeatmasker/RepeatMasker/RepeatMasker' 17 | 18 | pl_dealFa='/home/Sunhh/tools/github/NGS_data_processing/deal_fasta.pl' 19 | 20 | refFa='P3Genom_Gt5h.scf.fa' 21 | repDb='all_LTR_MITE.lib' 22 | 23 | exe_cmd "$exe_rpmk -lib $repDb $refFa -x -nolow -norna -no_is -pa 40 -a 1>stdout.RepMsk 2>stderr.RepMsk" 24 | exe_cmd "perl $pl_dealFa -listSite '[ATGCNatgcn]+' $refFa.masked > $refFa.um_list" 25 | exe_cmd "perl $pl_dealFa $refFa -drawByList -drawList $refFa.um_list -drawLcol 0,2,3 > $refFa.um" 26 | 27 | exe_cmd "$exe_rpmdBD -name um -engine ncbi $refFa.um" 28 | exe_cmd "$exe_rpmd -database um -pa 40 " 29 | 30 | -------------------------------------------------------------------------------- /annot_tools/repAnno_tools/seqID_to_scaf.pl: -------------------------------------------------------------------------------- 1 | #!/usr/bin/perl 2 | use strict; 3 | use warnings; 4 | 5 | my %h; 6 | print STDERR join("\t", qw/seqID scfID/)."\n"; 7 | while (<>) { 8 | if ( m/^>/ ) { 9 | m/^>(\S+) .*\(dbseq\-nr (\d+)\) \[(\d+),(\d+)\]$/ or die "$_\n"; 10 | my $seqID = "seq$2"; 11 | my $scfID = "$1"; 12 | my ($eleS, $eleE) = ($3,$4); 13 | $_ = ">${seqID}_${eleS}_${eleE}_$scfID\n"; 14 | if ( defined $h{$seqID} ) { 15 | $h{$seqID} eq $scfID or die "$h{$seqID} eq $scfID\n$_\n"; 16 | } else { 17 | print STDERR join("\t", $seqID, $scfID)."\n"; 18 | $h{$seqID} = $scfID; 19 | } 20 | } 21 | print; 22 | } 23 | 
-------------------------------------------------------------------------------- /annot_tools/replace_blast_asn_db.pl: -------------------------------------------------------------------------------- 1 | #!/usr/bin/perl 2 | # 20190717 : Replace database file in .asn.1 files to use blast_formatter. 3 | use strict; 4 | use warnings; 5 | 6 | !@ARGV and die "perl $0 new_db_path input_raw.asn > new.asn\n"; 7 | 8 | my $new_db = shift; 9 | 10 | while (<>) { 11 | if ( m!^\s+subject database \"! ) { 12 | s!^(\s+subject database \").+\"!$1${new_db}"!o or die "Failed at line: $_\n"; 13 | } 14 | print ; 15 | } 16 | -------------------------------------------------------------------------------- /annot_tools/run_maker.sh: -------------------------------------------------------------------------------- 1 | ### Basic functions. 2 | function exe_cmd { 3 | echo "[$(date)][CMD] $1" 4 | eval $1 5 | echo "[$(date)][Rec] Done." 6 | } 7 | 8 | function tsmsg { 9 | echo "[$(date)]$1" 10 | } 11 | 12 | cpuN=50 13 | ctl_bopts="maker_bopts.ctl" 14 | ctl_exe="maker_exe.ctl" 15 | 16 | roundN=2 17 | ctl_opts="maker_opts.r${roundN}.ctl" 18 | exe_cmd "mpiexec -n $cpuN maker $ctl_opts $ctl_bopts $ctl_exe 1>maker.${roundN}-log 2>maker.${roundN}-err" 19 | 20 | #roundN=3 21 | #ctl_opts="maker_opts.r${roundN}.ctl" 22 | #exe_cmd "mpiexec -n $cpuN maker $ctl_opts $ctl_bopts $ctl_exe 1>maker.${roundN}-log 2>maker.${roundN}-err" 23 | 24 | # Get result. 
25 | dir_maker='PG1All_v2_Scf.unmsk.maker.output' 26 | db_idx='PG1All_v2_Scf.unmsk_master_datastore_index.log' 27 | path_gffMerge=gff3_merge 28 | path_fasMerge=fasta_merge 29 | 30 | cd $dir_maker/ 31 | exe_cmd "mkdir ../result/r$roundN" 32 | exe_cmd "$path_gffMerge -d $db_idx -g -o ../result/r${roundN}/r${roundN}_maker.gff3" 33 | exe_cmd "$path_gffMerge -d $db_idx -n -o ../result/r${roundN}/r${roundN}_all.gff3" 34 | exe_cmd "$path_fasMerge -d $db_idx -o ../result/r${roundN}/r${roundN}" 35 | cd ../ 36 | 37 | # ipr_update_gff 38 | # https://groups.google.com/forum/#!msg/maker-devel/VaoXWlGHOjs/kbh0YDl1b5gJ 39 | 40 | # GlimmerHMM 41 | # http://ccb.jhu.edu/software/glimmerhmm/man.shtml#spec_org 42 | 43 | -------------------------------------------------------------------------------- /annot_tools/satisfied_prot.sh: -------------------------------------------------------------------------------- 1 | ### Basic functions. 2 | function exe_cmd { 3 | echo "[$(date)][CMD] $1" 4 | eval "$1" 5 | echo "[$(date)][Rec] Done." 
6 | } 7 | 8 | function tsmsg { 9 | echo "[$(date)]$1" 10 | } 11 | 12 | pl_bpTrans=$HOME/tools/github/NGS_data_processing/bp0_2_bp6.pl 13 | pl_slctGff=$HOME/tools/github/NGS_data_processing/annot_tools/slct_maker_gff3.pl 14 | 15 | dbPath=prot_db/uniprot_sprot_plants.fas 16 | dbTag="Sprot" 17 | cpuN=50 18 | 19 | maxDist2Edge=9 20 | 21 | rN=2 22 | 23 | inProtFa="r${rN}_maker.prot.fa" 24 | inGff="r${rN}_maker.gff3" 25 | outGff="r${rN}_maker_good.gff3" 26 | 27 | exe_cmd "blastp -evalue 1e-10 -query $inProtFa -db $dbPath -num_threads $cpuN -out ${inProtFa}.to${dbTag}.bp0" 28 | exe_cmd "perl $pl_bpTrans -in ${inProtFa}.to${dbTag}.bp0 -out ${inProtFa}.to${dbTag}.bp6" 29 | exe_cmd "awk ' \$7 <= $maxDist2Edge+1 && \$9 <= $maxDist2Edge+1 && \$10 >= \$14-$maxDist2Edge && \$8 >= \$13-$maxDist2Edge && \$3 >= 60 ' ${inProtFa}.to${dbTag}.bp6 > ${inProtFa}.to${dbTag}.bp6.good" 30 | exe_cmd "perl $pl_slctGff ${inProtFa}.to${dbTag}.bp6.good $inGff > $outGff" 31 | 32 | 33 | -------------------------------------------------------------------------------- /annot_tools/simplify_gff3.pl: -------------------------------------------------------------------------------- 1 | #!/usr/bin/perl 2 | use strict; 3 | use warnings; 4 | 5 | -t and !@ARGV and die "perl $0 in.gff3 > simple_IDs.gff3\n"; 6 | 7 | while (<>) { 8 | m!^\s*(#|$)! and do { print; next; }; 9 | chomp; 10 | my @ta=split(/\t/, $_); 11 | if ($ta[2] =~ m!^gene$!i) { 12 | $ta[8] =~ m!(?:^|\s|;)ID=([^\s;]+)! or die "$_\n"; 13 | $ta[8] = "ID=$1"; 14 | } elsif ($ta[2] =~ m!^mRNA$!i) { 15 | my ($mid, $gid); 16 | $ta[8] =~ m!(?:^|\s|;)ID=([^\s;]+)! 
or die "$_\n"; 17 | $mid = $1; 18 | $ta[8] =~ m!(?:^|\s|;)Parent=([^\s;]+)!i and $gid = $1; 19 | if (defined $gid) { 20 | $ta[8] = "ID=$mid;Parent=$gid"; 21 | } else { 22 | $ta[8] = "ID=$mid"; 23 | } 24 | } elsif ($ta[2] =~ m!^CDS|exon$!i) { 25 | $ta[8] =~ m!(?:^|\s|;)Parent=([^\s;]+)!i or die "$_\n"; 26 | $ta[8] = "Parent=$1"; 27 | } else { 28 | die "[Err] Unknown feature [$ta[2]]\n"; 29 | } 30 | print join("\t", @ta)."\n"; 31 | } 32 | #21QDX551_Chr02 EVM gene 77439 81626 . + . ID=21QDX551C02G000010;Name=EVM%20prediction%2021QDX551_Chr02.1 33 | #21QDX551_Chr02 EVM mRNA 77439 81626 . + . ID=21QDX551C02G000010.1;Parent=21QDX551C02G000010;Name=EVM%20prediction%2021QDX551_Chr02.1 34 | #21QDX551_Chr02 EVM exon 77439 77768 . + . ID=evm.model.21QDX551_Chr02.1.exon1;Parent=21QDX551C02G000010.1 35 | 36 | -------------------------------------------------------------------------------- /annot_tools/tRNA/stat_trnaFreq.pl: -------------------------------------------------------------------------------- 1 | #!/usr/bin/perl 2 | use strict; 3 | use warnings; 4 | use LogInforSunhh; 5 | 6 | -t and !@ARGV and die "perl $0 arab.chr.fa.trnascan.o.slct > arab.chr.fa.trnascan.o.slct.stat\n"; 7 | 8 | # [Sunhh@bioinfor01 work]$ head -4 arab.chr.fa.trnascan.o.slct 9 | # trna_1 1 + 306384 306456 Val TAC ggtgctgtggtgtagtggttatcacgtttgccttacacgcaaaaggtctccagttcgatcctgggcagcacca 10 | # trna_2 1 + 515494 515566 Phe GAA gcggggatagctcagttgggagagcgtcagactgaagatctgaaggtcgcgtgttcgatccacgctcaccgca 11 | # trna_3 1 + 552640 552711 His GTG gtggctgtagtttagtggtaagaattccacgttgtggccgtggagacctgggctcgaatcccagcagccaca 12 | # trna_4 1 + 604402 604474 Lys CTT gcccgtctagctcagttggtagagcgcaaggctcttaaccttgtggtcgtgggttcgagccccacggtgggcg 13 | 14 | my %h; 15 | while (<>) { 16 | chomp; 17 | my @ta = split(/\t/, $_); 18 | $h{$ta[5]}{$ta[6]}++; 19 | } 20 | for my $aa ( sort keys %h ) { 21 | my $sum_gene = 0; 22 | for my $cc ( sort keys %{$h{$aa}} ) { 23 | $sum_gene += $h{$aa}{$cc}; 24 | } 25 | } 26 | 27 | 
-------------------------------------------------------------------------------- /annot_tools/trim_orphan_right_brack.pl: -------------------------------------------------------------------------------- 1 | /home/Sunhh/tools/github/NGS_data_processing/annot_tools/ahrd/trim_orphan_right_brack.pl -------------------------------------------------------------------------------- /assemble_tools/LAI/pepper_genome.fa.chrID: -------------------------------------------------------------------------------- 1 | Chr01 2 | Chr02 3 | Chr03 4 | Chr04 5 | Chr05 6 | Chr06 7 | Chr07 8 | Chr08 9 | Chr09 10 | Chr10 11 | Chr11 12 | Chr12 13 | -------------------------------------------------------------------------------- /assemble_tools/LAI/pepper_genome.fa.out.LAI_unlock.gz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Sunhh/NGS_data_processing/7c202e9c150708c32c2d2a852bb7925adceb70f7/assemble_tools/LAI/pepper_genome.fa.out.LAI_unlock.gz -------------------------------------------------------------------------------- /assemble_tools/LAI/pepper_genome.fa.out.LAI_unlock_distr_byChr.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Sunhh/NGS_data_processing/7c202e9c150708c32c2d2a852bb7925adceb70f7/assemble_tools/LAI/pepper_genome.fa.out.LAI_unlock_distr_byChr.pdf -------------------------------------------------------------------------------- /assemble_tools/LAI/scrn.para_LAI.gz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Sunhh/NGS_data_processing/7c202e9c150708c32c2d2a852bb7925adceb70f7/assemble_tools/LAI/scrn.para_LAI.gz -------------------------------------------------------------------------------- /assemble_tools/add_1bp_ctg_toAGP_jcviPlot.pl: -------------------------------------------------------------------------------- 1 | #!/usr/bin/perl 2 | use strict; 3 | use 
warnings; 4 | use Getopt::Long; 5 | my %opts; 6 | GetOptions(\%opts, 7 | "help!", 8 | "gapLen:i", # 1000 9 | ); 10 | $opts{'gapLen'} //= 1000; 11 | 12 | -t and !@ARGV and die "perl $0 -gapLen 1000 in.agp > o.agp\n"; 13 | 14 | my %aa; 15 | while (<>) { 16 | chomp; 17 | my @ta = split(/\t/, $_); 18 | $aa{$ta[0]}{'ord'} //= $.; 19 | push(@{$aa{$ta[0]}{'arr'}}, [@ta]); 20 | } 21 | for my $k1 (sort { $aa{$a}{'ord'} <=> $aa{$b}{'ord'} } keys %aa) { 22 | my @ta2; 23 | for my $t1 (@{$aa{$k1}{'arr'}}) { 24 | print join("\t", @$t1)."\n"; 25 | @ta2 = @$t1; 26 | } 27 | print join("\t", $ta2[0], $ta2[2]+1, $ta2[2]+$opts{'gapLen'}, $ta2[3]+1, 'N', $opts{'gapLen'}, 'scaffold', 'yes', 'map')."\n"; 28 | print join("\t", $ta2[0], $ta2[2]+$opts{'gapLen'}+1, $ta2[2]+$opts{'gapLen'}+1, $ta2[3]+2, 'W', 'NA', 1,1, '+')."\n"; 29 | } 30 | -------------------------------------------------------------------------------- /assemble_tools/add_tag_to_fsa.pl: -------------------------------------------------------------------------------- 1 | #!/usr/bin/perl 2 | use strict; 3 | use warnings; 4 | 5 | !@ARGV and die "perl $0 tag in.fa\n"; 6 | my $tag = shift; 7 | 8 | while (<>) { 9 | if (m/^>(\S+)/) { 10 | $_ = ">$tag.$1 $tag\n"; 11 | } 12 | print; 13 | } 14 | -------------------------------------------------------------------------------- /assemble_tools/bionano/rename_xmap.pl: -------------------------------------------------------------------------------- 1 | #!/usr/bin/perl 2 | use strict; 3 | use warnings; 4 | @ARGV >= 2 or die "perl $0 raw.xmap new.xmap\n"; 5 | my $fn = shift; 6 | my $new_fn = shift; 7 | $fn =~ s/\.xmap$//; 8 | $new_fn =~ s/\.xmap$//; 9 | 10 | open F,'<',"${fn}.xmap" or die; 11 | open O,'>',"${new_fn}.xmap" or die; 12 | while (<F>) { 13 | if (m/^#/) { 14 | s/^(# Reference Maps From:\s*)\S+\.cmap/${1}${new_fn}_r.cmap/; 15 | s/^(# Query Maps From:\s*)\S+\.cmap/${1}${new_fn}_q.cmap/; 16 | print O; 17 | } else { 18 | print O; 19 | } 20 | } 21 | close F; 22 | system "rm ${fn}.xmap";
23 | system "mv ${fn}_q.cmap ${new_fn}_q.cmap"; 24 | system "mv ${fn}_r.cmap ${new_fn}_r.cmap"; 25 | 26 | -------------------------------------------------------------------------------- /assemble_tools/busco/geneCopyN_busco_full_table.pl: -------------------------------------------------------------------------------- 1 | #!/usr/bin/perl 2 | use strict; 3 | use warnings; 4 | 5 | !@ARGV and -t and die "perl $0 full_table.tsv > full_table.tsv.cnt\n"; 6 | 7 | my %h; 8 | while (<>) { 9 | m!^\s*#! and next; 10 | m!^\s*$! and next; 11 | chomp; 12 | my @ta=split(/\t/, $_); 13 | if ($ta[1] eq "Complete" or $ta[1] eq "Duplicated") { 14 | $h{"complete"}{$ta[0]} ++; 15 | } elsif ($ta[1] eq "Fragmented") { 16 | $h{"fragmented"}{$ta[0]} ++; 17 | } elsif ($ta[1] eq "Missing") { 18 | $h{"missing"}{$ta[0]} ++; 19 | } else { 20 | die "unknown tag [$ta[1]]\n"; 21 | } 22 | } 23 | for my $k1 (qw/complete fragmented missing/) { 24 | $h{$k1} //= {}; 25 | for my $k2 (sort keys %{$h{$k1}}) { 26 | print join("\t", $k2, $k1, $h{$k1}{$k2})."\n"; 27 | } 28 | } 29 | -------------------------------------------------------------------------------- /assemble_tools/busco/rm_busco_intermediate_files.pl: -------------------------------------------------------------------------------- 1 | #!/usr/bin/perl 2 | # [8/9/2022] Compress .log files. 3 | use strict; 4 | use warnings; 5 | use LogInforSunhh; 6 | use fileSunhh; 7 | 8 | !@ARGV and die "perl $0 busco_output_dir_toRM/\n"; 9 | 10 | for my $dd (@ARGV) { 11 | opendir my $dir, "$dd" or die; 12 | my @files = grep { $_ !~ m!^\.+$! } readdir($dir); 13 | closedir($dir); 14 | for my $f1 (@files) { 15 | if ($f1 =~ m!^run_\S+_odb\d+$!) 
{ 16 | -e "$dd/$f1/hmmer_output/" and &fileSunhh::_rmtree("$dd/$f1/hmmer_output/"); 17 | -e "$dd/$f1/busco_sequences/" and &fileSunhh::_rmtree("$dd/$f1/busco_sequences/"); 18 | -e "$dd/$f1/metaeuk_output/" and &fileSunhh::_rmtree("$dd/$f1/metaeuk_output/"); 19 | } 20 | } 21 | for my $a1 (qw/busco hmmsearch_out metaeuk_out hmmsearch_err metaeuk_err/) { 22 | -e "$dd/logs/$a1.log" and &runCmd("bgzip -@ 10 $dd/logs/$a1.log"); 23 | } 24 | } 25 | 26 | # rm -rf prot_*/run_embryophyta_odb10/hmmer_output/ 27 | # rm -rf prot_*/run_embryophyta_odb10/busco_sequences/ 28 | # rm -rf genom_*/run_embryophyta_odb10/hmmer_output/ 29 | # rm -rf genom_*/run_embryophyta_odb10/busco_sequences/ 30 | # rm -rf genom_*/run_embryophyta_odb10/metaeuk_output/ 31 | # gzip genom_*/logs/*.log 32 | 33 | 34 | 35 | -------------------------------------------------------------------------------- /assemble_tools/classify_tools/cnt_In_bp.pl: -------------------------------------------------------------------------------- 1 | #!/usr/bin/perl 2 | use strict; 3 | use warnings; 4 | 5 | 6 | my %inCnt; 7 | my %ord; 8 | while (<>) { 9 | chomp; 10 | my @ta=split(/\t/, $_); 11 | $ta[0] eq 'qseqid' and next; 12 | $ta[6] =~ m!In:([\.\d]+)! 
or next; 13 | my $inDep = $1; 14 | $inDep > 0 or next; 15 | $inCnt{$ta[0]} += $ta[4]; 16 | $ord{$ta[0]} //= $.; 17 | } 18 | for my $k1 (sort { $ord{$a} <=> $ord{$b} } keys %ord) { 19 | print join("\t", $k1, $inCnt{$k1})."\n"; # query ID and its summed qspan over rows with In depth > 0, in first-seen order 20 | } 21 | 22 | 23 | 24 | # Sunhh@swift:/data/Sunhh/cmaxima/rmcont$ head -4 hf2.noRed.tochk.fa.toNt.bn6.jnInEx 25 | #qseqid qlen qstart qend qspan KingdomCounts InExcludeCounts 26 | #ptg000012l 137195 1 7424 7424 rDNA:5.28 Ex:5.28 27 | #ptg000012l 137195 7425 7472 48 Un:0.00 In:0.00 28 | #ptg000012l 137195 7473 18297 10825 rDNA:16.51 Ex:16.51 29 | 30 | -------------------------------------------------------------------------------- /assemble_tools/classify_tools/line_bn6_query.pl: -------------------------------------------------------------------------------- 1 | #!/usr/bin/perl 2 | use strict; 3 | use warnings; 4 | 5 | !@ARGV and -t and die "perl $0 in.bn6 > in.bn6.1query1line\n"; 6 | 7 | my @bn6_colName = qw/qseqid sseqid pident length mismatch gapopen qstart qend sstart send evalue bitscore qlen slen sstrand staxids sscinames sskingdoms stitle/; 8 | my @col_want = (1,2,3,10,11,17,16,18); 9 | my $topN = 5; 10 | my @txt_want = (@bn6_colName[@col_want]) x $topN; 11 | print STDOUT join("\t", "qseqid", @txt_want)."\n"; 12 | 13 | 14 | 15 | my (%h, @qIDs); 16 | while (<>) { 17 | chomp; 18 | my @ta=split(/\t/, $_); 19 | defined $h{$ta[0]} or push(@qIDs, $ta[0]); 20 | push(@{$h{$ta[0]}}, [$ta[11], [@ta[@col_want]]]); 21 | } 22 | for my $qid (@qIDs) { 23 | my @out_line = ($qid); 24 | @{$h{$qid}} = sort { $b->[0] <=> $a->[0] } @{$h{$qid}}; 25 | my $i=0; 26 | for my $t1 (@{$h{$qid}}) { 27 | $i < $topN or last; 28 | push(@out_line, @{$t1->[1]}); 29 | $i++; 30 | } 31 | print STDOUT join("\t", @out_line)."\n"; 32 | } 33 | 34 | -------------------------------------------------------------------------------- /assemble_tools/cmd_batch_for_mugsy: -------------------------------------------------------------------------------- 1 | perl run_mugsy_MP.pl -cpuN 1 -seqPerBatch 
1000 -outfile S1_syn_batch -inmaf S1.fitMugsy.maf -infas S1_all.fitMugsy.fa -printCmd 2 | nohup run_cmd_in_batch.pl tmp1 -cpuN 40 >> scrn.cmd_batch 3 | -------------------------------------------------------------------------------- /assemble_tools/cnvt_quast_unaligned_info_to_tbl.pl: -------------------------------------------------------------------------------- 1 | #!/usr/bin/perl 2 | use strict; 3 | use warnings; 4 | 5 | -t and !@ARGV and die "perl $0 ref_hc0/contigs_reports/contigs_report_hf2-noRed.unaligned.info | less -S\n"; 6 | 7 | 8 | print join("\t", qw/Contig Total_length Unaligned_length Unaligned_type unAln_start unAln_end segment_len/)."\n"; 9 | while (<>) { 10 | chomp; 11 | my @ta=split(/\t/, $_); 12 | $ta[1] eq 'Total_length' and next; 13 | my @tb = split(/,/, $ta[4]); 14 | for my $tc (@tb) { 15 | $tc =~ m!^(\d+)\-(\d+)$! or die "$tc in $_\n"; 16 | my ($s,$e) = ($1, $2); 17 | print join("\t", @ta[0,1,2,3], $s, $e, $e-$s+1)."\n"; 18 | } 19 | } 20 | 21 | -------------------------------------------------------------------------------- /assemble_tools/get_paired_maf.pl: -------------------------------------------------------------------------------- 1 | #!/usr/bin/perl 2 | use strict; 3 | use warnings; 4 | use ReadInAlnSunhh; 5 | use Getopt::Long; 6 | 7 | my %opts; 8 | GetOptions(\%opts, 9 | "help!", 10 | "out:s", 11 | ); 12 | 13 | -t and !@ARGV and &usage(); 14 | $opts{help} and &usage(); 15 | 16 | sub usage { 17 | print STDOUT <',"$opts{out}" or die "Failed to open $opts{out}\n$!\n"; 29 | $oFh = $tfh; 30 | } 31 | 32 | my @FH; 33 | !(-t) and push(@FH, \*STDIN); 34 | for (@ARGV) { 35 | my $fh; 36 | open $fh, '<', "$_" or die; 37 | push(@FH, $fh); 38 | } 39 | 40 | print {$oFh} "##maf version=1\n"; 41 | my @all_blks; 42 | for my $fh (@FH) { 43 | while ( my %rec1 = %{readMAF($fh)} ) { 44 | chomp( $rec1{a}[0] ); 45 | my @lines; 46 | for my $tline (@{$rec1{o}}) { 47 | $tline =~ m/^s\s/ or next; 48 | chomp($tline); 49 | push(@lines, $tline); 50 | } 51 | 
scalar(@lines) >= 2 or next; 52 | print {$oFh} $rec1{a}[0] . "\n"; 53 | for my $tline (@lines) { 54 | print {$oFh} $tline . "\n"; 55 | } 56 | print {$oFh} "\n"; 57 | } 58 | } 59 | 60 | 61 | -------------------------------------------------------------------------------- /assemble_tools/get_rep_loc_fromPileup.pl: -------------------------------------------------------------------------------- 1 | #!/usr/bin/perl 2 | use strict; 3 | use warnings; 4 | use LogInforSunhh; 5 | 6 | # scaffold10 1 A 1 ^]. ? 7 | # scaffold10 2 A 1 . ? 8 | # scaffold10 3 T 1 . ? 9 | # scaffold10 4 A 1 . ? 10 | 11 | my $maxDepth = 115 * 2; 12 | my $maxGap = 1000; 13 | my %block; 14 | my @ids; 15 | while (<>) { 16 | chomp; 17 | my @ta = split(/\t/, $_); 18 | if ( $ta[3] > $maxDepth ) { 19 | if ( defined $block{$ta[0]} ) { 20 | if ( $block{$ta[0]}[-1][1]+1+$maxGap >= $ta[1] ) { 21 | $block{$ta[0]}[-1][1] = $ta[1]; 22 | } else { 23 | push(@{$block{$ta[0]}}, [$ta[1], $ta[1]]); 24 | } 25 | } else { 26 | push(@ids, $ta[0]); 27 | push(@{$block{$ta[0]}}, [$ta[1], $ta[1]]); 28 | } 29 | } 30 | } 31 | 32 | for (@ids) { 33 | for my $tr1 ( @{$block{$_}} ) { 34 | print STDOUT join("\t", $_, $tr1->[0], $tr1->[1])."\n"; 35 | } 36 | } 37 | -------------------------------------------------------------------------------- /assemble_tools/hifi_hic/cnvt_gfa2fa.pl: -------------------------------------------------------------------------------- 1 | #!/usr/bin/perl 2 | # Convert hifiasm.p_ctg.gfa file to fasta file. 3 | use strict; 4 | use warnings; 5 | # awk '/^S/{print ">"$2;print $3}' test.p_ctg.gfa > test.p_ctg.fa # get primary contigs in FASTA 6 | !@ARGV and -t and die "perl $0 hifiasm_asm.p_ctg.gfa > hifiasm_asm.p_ctg.fa\n# Used to get the primary contigs in FASTA format.\n\n"; 7 | 8 | while (<>) { 9 | chomp; 10 | m!^S\s+(\S+)\s+(\S+)! 
or next; 11 | print ">$1\n$2\n"; 12 | } 13 | 14 | -------------------------------------------------------------------------------- /assemble_tools/hifi_hic/cnvt_num2tigID.pl: -------------------------------------------------------------------------------- 1 | #!/usr/bin/perl 2 | use strict; 3 | use warnings; 4 | 5 | !@ARGV and die "perl $0 example_tigID in_num_file > out_tigID_file\n"; 6 | 7 | my $sample = shift; 8 | 9 | $sample =~ m!^(tig\d+)$! or die "bad example tigID [$sample]\n"; 10 | 11 | my $nLen = length($sample) - 3; 12 | 13 | while (<>) { 14 | chomp; 15 | my @ta=split(/\t/, $_); 16 | if (m!^\s*(#|$)!) { 17 | print "$ta[0]\n"; 18 | next; 19 | } 20 | $ta[0] =~ m!^\d+$! or die "bad number [$ta[0]]\n"; 21 | my $newID = sprintf("%0${nLen}d", $ta[0]); 22 | $newID = "tig$newID"; 23 | print "$newID\n"; 24 | } 25 | 26 | -------------------------------------------------------------------------------- /assemble_tools/hifi_hic/get_HiCanu_ctg.pl: -------------------------------------------------------------------------------- 1 | #!/usr/bin/perl 2 | # Get primary contigs from HiCanu assemblies by removing bubble elements. 3 | 4 | use strict; 5 | use warnings; 6 | use fastaSunhh; 7 | my $fs_obj = fastaSunhh->new(); 8 | use fileSunhh; 9 | 10 | -t and !@ARGV and die "perl $0 HiCanu_asm.contigs.fasta HiCanu_asm.contigs.layout.tigInfo > HiCanu_asm_ctg.fa\n"; 11 | 12 | my $f1 = shift; 13 | my $f2 = shift; 14 | 15 | my %faSeq = %{ $fs_obj->save_seq_to_hash('faFile'=>$f1) }; 16 | my @ids = sort { $faSeq{$a}{'Order'} <=> $faSeq{$b}{'Order'} } keys %faSeq; 17 | 18 | 19 | $ids[0] =~ m!^(tig\d+)$! or die "bad example tigID [$ids[0]]\n"; 20 | my $nLen = length($ids[0]) - 3; 21 | 22 | my $ofh2 = &openFH($f2); 23 | 24 | while (<$ofh2>) { 25 | chomp; 26 | my @ta=split(/\t/, $_); 27 | if (m!^\s*(#|$)!) { 28 | # print "$ta[0]\n"; 29 | next; 30 | } 31 | $ta[3] eq 'contig' or next; 32 | $ta[5] eq 'no' or next; 33 | $ta[0] =~ m!^\d+$! 
or die "bad number [$ta[0]]\n"; 34 | my $newID = sprintf("%0${nLen}d", $ta[0]); 35 | $newID = "tig$newID"; 36 | defined $faSeq{$newID} or die "no seq for ID [$newID]: $_\n"; 37 | print STDOUT ">$faSeq{$newID}{'head'}\n$faSeq{$newID}{'seq'}\n"; 38 | } 39 | close($ofh2); 40 | 41 | 42 | -------------------------------------------------------------------------------- /assemble_tools/kmer/get_kmer_by_seq_summary.pl: -------------------------------------------------------------------------------- 1 | #!/usr/bin/perl 2 | use strict; 3 | use warnings; 4 | use fileSunhh; 5 | use LogInforSunhh; 6 | 7 | -t and !@ARGV and die "perl $0 R01/ct_R01.scafSeq.k27.counts > R01/ct_R01.scafSeq.k27.counts.tbl\n"; 8 | 9 | my (%seqs, $k, @klist); 10 | while (<>) { 11 | chomp; 12 | if (m!^\>!) { 13 | m!^\>(\S+)! or die "$_\n"; 14 | $k = $1; 15 | defined $seqs{$k} and die "repeated k $k\n"; 16 | push(@klist, $k); 17 | next; 18 | } 19 | s!^\s+!!; 20 | s!\s+$!!; 21 | $seqs{$k} .= " $_"; 22 | } 23 | my $wdir = &fileSunhh::new_tmp_dir('create'=>1); 24 | 25 | my @ostat = qw/interval_mean interval_median interval_stdev MEAN MEDIAN MIN MAX NoNull/; 26 | print STDOUT join("\t", qw/SeqID/, @ostat)."\n"; 27 | for my $v1 (@klist) { 28 | if (!defined $seqs{$v1}) { 29 | print STDOUT join("\t", $v1, (('0') x scalar(@ostat)))."\n"; 30 | next; 31 | } 32 | # &tsmsg("[Msg] Calculating for [$v1]\n"); 33 | $seqs{$v1} =~ s!^\s+!!; 34 | $seqs{$v1} =~ s!\s+!\n!g; 35 | &fileSunhh::write2file("$wdir/nn", "$seqs{$v1}\n",'>'); 36 | my %vstat = map { 37 | chomp($_); 38 | split(/\t/, $_); 39 | } `deal_table.pl $wdir/nn -col_stat 0 -col_stat_AsINS | deal_table.pl -transpose `; 40 | print STDOUT join("\t", $v1, @vstat{@ostat})."\n"; 41 | } 42 | 43 | &fileSunhh::_rmtree($wdir); 44 | 45 | -------------------------------------------------------------------------------- /assemble_tools/slct_pe.pl: -------------------------------------------------------------------------------- 1 | #!/usr/bin/perl 2 | use strict; 3 | use 
warnings; 4 | 5 | my @points_slct = 30e3; 6 | my $max_allowed = 280e3; 7 | my $min_allowed = 70e3; 8 | # For PE points 9 | @points_slct = (); 10 | for (my $i=5e3; $i<=70e3; $i+=2e3) { 11 | push(@points_slct, $i); 12 | } 13 | # Ref chloroplast 14 | $max_allowed = 80e3; 15 | $min_allowed = 2e3; 16 | 17 | # For MP points 18 | #@points_slct = (); 19 | #for (my $i=30e3; $i<=50e3; $i+=2e3) { 20 | # push(@points_slct, $i); 21 | #} 22 | # Ref chloroplast 23 | #$max_allowed = 80e3; 24 | #$min_allowed = 2e3; 25 | #@points_slct = (100e3, 130e3, 160e3, 190e3, 220e3); 26 | 27 | 28 | 29 | 30 | while (<>) { 31 | chomp; 32 | my @ta = split(/\t/, $_); 33 | my $is_o = 0; 34 | for my $p (@points_slct) { 35 | if ( $ta[3] <= $p ) { 36 | $ta[3]+$ta[8] > $p and $ta[3]+$ta[8] < $max_allowed and $ta[3] > $min_allowed and do { $is_o = 1; last; }; 37 | } else { 38 | $ta[3]+$ta[8] < $p and $ta[3] < $max_allowed and $ta[3]+$ta[4] > $min_allowed and do { $is_o = 1; last; }; 39 | } 40 | } 41 | # $is_o == 1 and print "$_\n"; 42 | $is_o == 1 and print "$ta[8]\n"; 43 | } 44 | 45 | -------------------------------------------------------------------------------- /calc_est_in_psl.pl: -------------------------------------------------------------------------------- 1 | #!/usr/bin/perl 2 | use strict; 3 | use warnings; 4 | 5 | -t and !@ARGV and die "perl $0 est2genome.psl\n"; 6 | 7 | my @lvls = (0.5, 0.6, 0.7, 0.8, 0.9, 0.95, 0.99, 1); 8 | 9 | my %calc; 10 | while (<>) { 11 | chomp; 12 | my @ta = split(/\t/, $_); 13 | (defined $ta[0] and $ta[0] ne "" and $ta[0] =~ m/^(\d+)$/ and $ta[0] > 0) or next; 14 | my $cov_len = $ta[12]-$ta[11]; 15 | for my $r1 (@lvls) { 16 | $cov_len >= $ta[10] * $r1 and $calc{$r1}{$ta[9]}++; 17 | } 18 | } 19 | print STDOUT join("\t", qw/Olap_Ratio EST_Number/)."\n"; 20 | for my $r1 (@lvls) { 21 | my $num = scalar( keys %{$calc{$r1}} ); 22 | print STDOUT join("\t", $r1, $num)."\n"; 23 | } 24 | -------------------------------------------------------------------------------- 
/cmd_ctrl/log_func.sh: -------------------------------------------------------------------------------- 1 | ### Basic functions. 2 | function exe_cmd { 3 | echo "[$(date)][CMD] $1" 4 | eval "$1" 5 | if [[ $? -eq "0" ]] 6 | then 7 | echo "[$(date)][CMD_done] $1" 8 | else 9 | echo "[$(date)][CMD_err] $1" 10 | exit 1 11 | fi 12 | } 13 | 14 | function tsmsg { 15 | echo "[$(date)][Msg] $1" 16 | } 17 | 18 | 19 | -------------------------------------------------------------------------------- /cmd_ctrl/rm_list.pl: -------------------------------------------------------------------------------- 1 | #!/usr/bin/perl 2 | use strict; 3 | use warnings; 4 | use fileSunhh; 5 | 6 | -t and !@ARGV and die "perl $0 filelist_toRM\n"; 7 | 8 | while (<>) { 9 | chomp; 10 | my @ta = &splitL("\t", $_); 11 | &fileSunhh::_rmtree($ta[0]); 12 | } 13 | 14 | -------------------------------------------------------------------------------- /cmd_ctrl/split_scrn_time.pl: -------------------------------------------------------------------------------- 1 | #!/usr/bin/perl 2 | use strict; 3 | use warnings; 4 | 5 | -t and !@ARGV and die "perl $0 scrn.t1 | less -S\n"; 6 | 7 | while (<>) { 8 | chomp; 9 | # Begin index search ...[Fri Jun 12 10:21:32 2020][CMD_done]gzip clean_files/lnc_GWRT_Rep2_*fq 10 | if ( s!^\[([^\[\]]+)\]\[([^\[\]]+)\]!! ) { 11 | my ($t1, $t2) = ($1, $2); 12 | s!^\s+!!; 13 | print join("\t", $t1, $t2, $_)."\n"; 14 | } elsif ( s!^.*\[(\S\S\S +\S\S\S +\d+ +\d+:\d+:\d+ +\d+)\]\[([^\[\]]+)\]!! 
) { 15 | my ($t1, $t2) = ($1, $2); 16 | s!^\s+!!; 17 | print join("\t", $t1, $t2, $_)."\n"; 18 | } 19 | } 20 | -------------------------------------------------------------------------------- /cmd_ctrl/wrap_sh.pl: -------------------------------------------------------------------------------- 1 | #!/usr/bin/perl 2 | use strict; 3 | use warnings; 4 | 5 | print STDOUT <) { 19 | if (m/^\s*#/ or m/^\s*$/) { 20 | print; 21 | } else { 22 | s/[^\t \S]+$//; 23 | print "exe_cmd \"$_\"\n"; 24 | } 25 | } 26 | -------------------------------------------------------------------------------- /deal_fasta.pl: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Sunhh/NGS_data_processing/7c202e9c150708c32c2d2a852bb7925adceb70f7/deal_fasta.pl -------------------------------------------------------------------------------- /deal_table.pl: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Sunhh/NGS_data_processing/7c202e9c150708c32c2d2a852bb7925adceb70f7/deal_table.pl -------------------------------------------------------------------------------- /enrich/example_data/in_geneID.list: -------------------------------------------------------------------------------- 1 | og24679 2 | og24447 3 | og23589 4 | og00306 5 | og18760 6 | og25399 7 | og26601 8 | og23340 9 | og21065 10 | og18067 11 | -------------------------------------------------------------------------------- /enrich/scripts/extend_IPRannot_for_IPRenrich.pl: -------------------------------------------------------------------------------- 1 | #!/usr/bin/perl 2 | use strict; 3 | use warnings; 4 | 5 | !@ARGV and die "perl $0 pub_IPR-entry.list in_gene2IPR.combined out_file\n"; 6 | 7 | 8 | my $ifnIPRentry = shift; 9 | my $ifhGene2IPR = shift; 10 | my $ofnEnrich = shift; 11 | 12 | my (%ipr2name); 13 | 14 | open F1,'<',"$ifnIPRentry" or die; 15 | while (<F1>) { # read the IPR entry list: column 0 is the accession, column 2 is stored as its name 16 | chomp; 17 | my @ta=split(/\t/, $_); 18 | 
$ta[0] eq 'ENTRY_AC' and next; 19 | $ipr2name{$ta[0]} = $ta[2]; 20 | } 21 | close F1; 22 | 23 | open F2,'<',"$ifhGene2IPR" or die; 24 | open O1,'>',"$ofnEnrich" or die; 25 | my (%hasOut, %notFound); 26 | while (<F2>) { # read gene-to-IPR table: column 0 is the gene, column 1 a ';'-joined IPR list 27 | my @ta=split(/\t/, $_); 28 | chomp(@ta); 29 | if ($ta[1] eq "" or $ta[1] =~ m!^na$!i) { 30 | print O1 join("\t", $ta[0], "", "")."\n"; 31 | $hasOut{$ta[0]}{""} = 1; 32 | next; 33 | } 34 | for my $id1 (split(/;/, $ta[1])) { 35 | if (defined $ipr2name{$id1}) { 36 | defined $hasOut{$ta[0]}{$id1} and next; 37 | print O1 join("\t", $ta[0], $id1, $ipr2name{$id1})."\n"; 38 | $hasOut{$ta[0]}{$id1} = 1; 39 | } else { 40 | $notFound{$ta[0]}{$id1} = 1; 41 | } 42 | } 43 | } 44 | for my $gene1 (sort keys %notFound) { 45 | defined $hasOut{$gene1} and next; 46 | print O1 join("\t", $gene1, "", "")."\n"; 47 | } 48 | close O1; 49 | close F2; 50 | 51 | -------------------------------------------------------------------------------- /evolution_tools/SV_detection/cnvt_anchors_to_tbl.pl: -------------------------------------------------------------------------------- 1 | #!/usr/bin/perl 2 | use strict; 3 | use warnings; 4 | 5 | my %blk; 6 | while (<>) { 7 | chomp; 8 | m!^\s*(#|$)! 
and next; 9 | my @ta=split(/\t/, $_); 10 | $ta[1] eq 'referenceStart' and next; 11 | $blk{$ta[8]}{'rID'} //= $ta[0]; 12 | $blk{$ta[8]}{'qID'} //= $ta[3]; 13 | $blk{$ta[8]}{'str'} //= $ta[6]; 14 | $blk{$ta[8]}{'Rstart'} //= $ta[1]; $blk{$ta[8]}{'Rstart'} > $ta[1] and $blk{$ta[8]}{'Rstart'} = $ta[1]; 15 | $blk{$ta[8]}{'Rend'} //= $ta[2]; $blk{$ta[8]}{'Rend'} < $ta[2] and $blk{$ta[8]}{'Rend'} = $ta[2]; 16 | $blk{$ta[8]}{'Qstart'} //= $ta[4]; $blk{$ta[8]}{'Qstart'} > $ta[4] and $blk{$ta[8]}{'Qstart'} = $ta[4]; 17 | $blk{$ta[8]}{'Qend'} //= $ta[5]; $blk{$ta[8]}{'Qend'} < $ta[5] and $blk{$ta[8]}{'Qend'} = $ta[5]; 18 | } 19 | for (sort {$blk{$a}{'rID'} cmp $blk{$b}{'rID'} || $blk{$a}{'Rstart'} <=> $blk{$b}{'Rstart'} } keys %blk) { 20 | $blk{$_}{'lenQ'} = $blk{$_}{'Qend'} - $blk{$_}{'Qstart'} + 1; 21 | $blk{$_}{'lenR'} = $blk{$_}{'Rend'} - $blk{$_}{'Rstart'} + 1; 22 | print STDOUT join("\t", @{$blk{$_}}{qw/qID Qstart Qend str rID Rstart Rend lenQ lenR/}, , $_)."\n"; 23 | } 24 | -------------------------------------------------------------------------------- /evolution_tools/SV_detection/find_nonOvlCDS.pl: -------------------------------------------------------------------------------- 1 | #!/usr/bin/perl 2 | use strict; 3 | use warnings; 4 | use LogInforSunhh; 5 | use fileSunhh; 6 | 7 | !@ARGV and die "perl $0 s0.anchors.gff3 toAdd_CDS.gff3 out_prefix\n"; 8 | 9 | my $f1 = shift; 10 | my $f2 = shift; 11 | my $opre = shift; 12 | 13 | open F1,'<',"$f1" or die; 14 | my %loc1; 15 | while (<F1>) { # collect [start, end] of every 'gene' feature per sequence ID from the first GFF3 16 | m!^\s*(#|$)! 
and next; 17 | chomp; 18 | my @ta=split(/\t/, $_); 19 | $ta[2] eq 'gene' or next; 20 | push(@{$loc1{$ta[0]}}, [@ta[3,4]]); 21 | } 22 | close F1; 23 | for (keys %loc1) { @{$loc1{$_}} = sort {$a->[0] <=> $b->[0] || $a->[1] <=> $b->[1]} @{$loc1{$_}}; } 24 | 25 | open F2,'-|',"perl /home/Sunhh/tools/github/NGS_data_processing/temp/deal_gff3.pl -inGff $f2 -getJnLoc " or die; 26 | &fileSunhh::write2file("${opre}.list","", '>'); 27 | while (<F2>) { # read the table piped from deal_gff3.pl -getJnLoc; keep IDs whose span overlaps no collected gene 28 | chomp; 29 | my @ta=split(/\t/, $_); 30 | $ta[0] eq 'mrnaID' and next; 31 | my $is_ovl = 0; 32 | for my $a1 (@{$loc1{$ta[2]}}) { 33 | $a1->[1] < $ta[6] and next; 34 | $a1->[0] > $ta[7] and last; 35 | $is_ovl = 1; 36 | last; 37 | } 38 | $is_ovl == 1 and next; 39 | &fileSunhh::write2file("${opre}.list", "$ta[0]\n",'>>'); 40 | } 41 | close F2; 42 | &runCmd("perl /home/Sunhh/tools/github/NGS_data_processing/temp/deal_gff3.pl -inGff $f2 -gffret ${opre}.list -idType mRNA > ${opre}.gff3"); 43 | 44 | -------------------------------------------------------------------------------- /evolution_tools/SV_detection/get_shrt_or_ident_mafTab.pl: -------------------------------------------------------------------------------- 1 | #!/usr/bin/perl 2 | use strict; 3 | use warnings; 4 | 5 | !@ARGV and die "perl $0 38000 pident_99 align2w38_anc_fix.maf.blasttab > align2w38_anc_fix.maf.blasttab.shrt\n"; 6 | 7 | my $maxLen = shift; 8 | my $minIdent = shift; 9 | 10 | while (<>) { 11 | chomp; 12 | my @ta=split(/\t/,$_); 13 | if (abs($ta[7]-$ta[6])+1 < $maxLen and abs($ta[9]-$ta[8])+1 < $maxLen) { 14 | print "$_\n"; 15 | } elsif ($ta[2] >= $minIdent) { 16 | print "$_\n"; 17 | } 18 | } 19 | 20 | # [Sunhh@panda with_allNewR2_ori]$ deal_table.pl align2w38_anc_fix.maf.blasttab -col_head 21 | # 0 22CEXU11_Chr06 22 | # 1 22CEXU43_Chr06 23 | # 2 100.00 24 | # 3 2385 25 | # 4 0 26 | # 5 0 27 | # 6 20755838 28 | # 7 20758222 29 | # 8 19857222 30 | # 9 19859606 31 | 32 | -------------------------------------------------------------------------------- 
/evolution_tools/SV_detection/if_needIDfix.pl: -------------------------------------------------------------------------------- 1 | #!/usr/bin/perl 2 | use strict; 3 | use warnings; 4 | 5 | !@ARGV and die "perl $0 in.sam > chk_tbl\n"; 6 | 7 | my $f=shift; 8 | open F,'<',"$f" or die; 9 | my $is=0; 10 | while (<F>) { # scan SAM records; set $is when column 5 starts or ends with an I/D operation (ignoring an outer H clip) 11 | m!^@! and next; 12 | chomp; 13 | my @ta=split(/\t/, $_); 14 | $ta[5] =~ m!^(\d+H)?\d+[ID]! and do {$is = 1; last;}; 15 | $ta[5] =~ m!\d+[ID](\d+H)?$! and do {$is = 1; last;}; 16 | } 17 | close F; 18 | print join("\t", $f, $is)."\n"; 19 | 20 | -------------------------------------------------------------------------------- /evolution_tools/SV_detection/nucdiff_modification/cmd_list: -------------------------------------------------------------------------------- 1 | # replace files in dir: ~/.local/lib/python2.7/site-packages/nucdiff/ 2 | 3 | -------------------------------------------------------------------------------- /evolution_tools/SV_detection/remove_gap_var.pl: -------------------------------------------------------------------------------- 1 | #!/usr/bin/perl 2 | use strict; 3 | use warnings; 4 | 5 | !@ARGV and die "perl $0 flank_len_100 C31.ndf_ref_snps.gff > C31.ndf_ref_snps.rmGap.gff\n"; 6 | 7 | my $flank_len = shift; 8 | 9 | my (@lines, %gaps); 10 | while (<>) { 11 | m!^\s*(#|$)! and next; 12 | chomp; 13 | my @ta=split(/\t/, $_); 14 | if ($ta[8] =~ m!Name=[^;\s]*(gap|ATGCN)!) 
{ 15 | push(@{$gaps{$ta[0]}}, [@ta[3,4]]); 16 | } else { 17 | push(@lines, [@ta]); 18 | } 19 | } 20 | 21 | for my $cid (keys %gaps) { 22 | @{$gaps{$cid}} = sort { $a->[0] <=> $b->[0] || $a->[1] <=> $b->[1] } @{$gaps{$cid}}; 23 | } 24 | for my $l1 (@lines) { 25 | my $close2gap = 0; 26 | for my $seR (@{$gaps{$l1->[0]}}) { 27 | $seR->[1] < $l1->[3]-$flank_len and next; 28 | $seR->[0] > $l1->[4]+$flank_len and last; 29 | $close2gap = 1; 30 | last; 31 | } 32 | $close2gap == 1 and next; 33 | print STDOUT join("\t", @$l1)."\n"; 34 | } 35 | 36 | -------------------------------------------------------------------------------- /evolution_tools/SV_detection/rmNvar_inVCF.pl: -------------------------------------------------------------------------------- 1 | #!/usr/bin/perl 2 | use strict; 3 | use warnings; 4 | 5 | while (<>) { 6 | chomp; 7 | m!^#! and do { print "$_\n"; next; }; 8 | my @ta=split(/\t/, $_); 9 | $ta[3] =~ m![nN]! and next; 10 | $ta[4] ne '' and $ta[4] =~ m![nN]! and next; 11 | print "$_\n"; 12 | } 13 | -------------------------------------------------------------------------------- /evolution_tools/SV_detection/rm_0span_maf.pl: -------------------------------------------------------------------------------- 1 | #!/usr/bin/perl 2 | use strict; 3 | use warnings; 4 | use fileSunhh; 5 | 6 | !@ARGV and die "perl $0 align2w38_anc_fix_jn.maf > fixed.maf\n"; 7 | 8 | while (<>) { 9 | chomp; 10 | if (m!^\s*#!) { 11 | print STDOUT "$_\n"; 12 | next; 13 | } 14 | my $l2=<>; chomp($l2); 15 | my $l3=<>; chomp($l3); 16 | my $l4=<>; chomp($l4); 17 | m!^a\s! or die "bad2:$_\n"; 18 | $l2 =~ m!^s\s+(\S+)\s+(\d+)\s+(\d+)\s+([+-])\s! or die "bad3:$l2\n"; 19 | $3 == 0 and next; 20 | $l3 =~ m!^s\s+(\S+)\s+(\d+)\s+(\d+)\s+([+-])\s! 
or die "bad4:$l3\n"; 21 | $3 == 0 and next; 22 | print "$_\n$l2\n$l3\n$l4\n"; 23 | } 24 | 25 | -------------------------------------------------------------------------------- /evolution_tools/SV_detection/rmdup_fromNormVcf.pl: -------------------------------------------------------------------------------- 1 | #!/usr/bin/perl 2 | # [3/9/2023] Only applicable after running 'bcftools norm -m-both | bcftools norm -d none --fasta-ref xx.fa | bcftools sort '; 3 | use strict; 4 | use warnings; 5 | 6 | -t and !@ARGV and die "perl $0 normSrt_dup.vcf > dedup.vcf\n"; 7 | 8 | my $prevLine = ''; 9 | my ($prevID, $prevPos, $prevLen) = ("", -1, -1); 10 | while (<>) { 11 | m!^\s*(#|$)! and do {print; next;}; 12 | chomp; 13 | my @ta=split(/\t/, $_); 14 | my $currLen = abs(length($ta[3])-length($ta[4])); 15 | $ta[4] eq '' and $currLen = 1e9; 16 | if ($ta[0] eq $prevID and $ta[1] == $prevPos) { 17 | # dup 18 | if ($currLen > $prevLen) { 19 | $prevLine = $_; 20 | $prevLen = $currLen; 21 | } 22 | } else { 23 | $prevLine ne '' and print STDOUT "$prevLine\n"; 24 | $prevLine = $_; 25 | $prevID = $ta[0]; 26 | $prevPos = $ta[1]; 27 | $prevLen = $currLen; 28 | } 29 | } 30 | if ($prevLine ne '') { 31 | print STDOUT "$prevLine\n"; 32 | $prevLine = ''; 33 | } 34 | 35 | -------------------------------------------------------------------------------- /evolution_tools/SV_detection/run_mm2paftool.sh: -------------------------------------------------------------------------------- 1 | ### Basic functions. 2 | function exe_cmd { 3 | echo "[$(date)][CMD] $1" 4 | eval "$1" 5 | if [[ $? 
-eq "0" ]] 6 | then 7 | echo "[$(date)][CMD_done] $1" 8 | else 9 | echo "[$(date)][CMD_err] $1" 10 | fi 11 | } 12 | 13 | function tsmsg { 14 | echo "[$(date)][Msg] $1" 15 | } 16 | 17 | outDir="by_minimap2_paftools/output/" 18 | refFaFn=/data/Sunhh/cpepo/analysis/SV_detection/in_genomes/final_C39.chr.fa 19 | qryDir=/data/Sunhh/cpepo/analysis/SV_detection/in_genomes/ 20 | [[ -e $outDir ]] || mkdir -p $outDir 21 | cd $outDir/ 22 | for cid in C31 C38 23 | do 24 | exe_cmd "minimap2 -cx asm20 -t 20 --cs $refFaFn $qryDir/final_${cid}.chr.fa > ${cid}.asm20.paf" 25 | exe_cmd "sort -k6,6 -k8,8n ${cid}.asm20.paf | paftools.js call -L 10000 -s $cid -f $refFaFn - > ${cid}.asm20.vcf" 26 | done 27 | cd - 28 | 29 | -------------------------------------------------------------------------------- /evolution_tools/SV_detection/run_ndf.sh: -------------------------------------------------------------------------------- 1 | ### Basic functions. 2 | function exe_cmd { 3 | echo "[$(date)][CMD] $1" 4 | eval "$1" 5 | if [[ $? 
-eq "0" ]] 6 | then 7 | echo "[$(date)][CMD_done] $1" 8 | else 9 | echo "[$(date)][CMD_err] $1" 10 | fi 11 | } 12 | 13 | function tsmsg { 14 | echo "[$(date)][Msg] $1" 15 | } 16 | 17 | 18 | export PATH="/data/Sunhh/src/Align/mummer/install/mummer4/bin/:$PATH" 19 | refFaFn=/data/Sunhh/cpepo/analysis/SV_detection/in_genomes/final_C39.chr.fa 20 | qryDir=/data/Sunhh/cpepo/analysis/SV_detection/in_genomes/ 21 | outDir="by_nucdiff/output/" 22 | [[ -e $outDir ]] || mkdir -p $outDir 23 | cd $outDir/ 24 | for cid in C31 C38 25 | do 26 | exe_cmd "nucdiff --proc 10 --nucmer_opt ' -t 20 --batch 1 ' $refFaFn $qryDir/final_${cid}.chr.fa ./ ${cid}.ndf" 27 | done 28 | cd - 29 | 30 | -------------------------------------------------------------------------------- /evolution_tools/SV_detection/select_var.pl: -------------------------------------------------------------------------------- 1 | #!/usr/bin/perl 2 | use strict; 3 | use warnings; 4 | 5 | !@ARGV and die "perl $0 min_var_len_20 C38.ndf_ref_struct.rmGap.effG > C38.ndf_ref_struct.rmGap.effG.slct1\n"; 6 | 7 | my $min_var_len = shift; # 20 8 | 9 | while (<>) { 10 | m!^\s*(#|$)! and next; # skip comment and blank lines, allowing any leading whitespace 11 | chomp; 12 | my @ta=split(/\t/, $_); 13 | m!\tVAR_annot|Name=(insertion|duplication|tandem_duplication|unaligned_end|unaligned_beginning|deletion|collapsed_repeat|collapsed_tandem_repeat);! or next; 14 | if (m!(?:overlap|subst|del|ins|blk)_len=(\d+)!){ 15 | $1 >= $min_var_len or next; 16 | } 17 | print STDOUT "$_\n"; 18 | } 19 | 20 | -------------------------------------------------------------------------------- /evolution_tools/SV_detection/view_anchors.pl: -------------------------------------------------------------------------------- 1 | #!/usr/bin/perl 2 | use strict; 3 | use warnings; 4 | 5 | 6 | while (<>) { 7 | chomp; 8 | m!^\s*(#|$)! 
and next; 9 | my @ta=split(/\t/, $_); 10 | if ($ta[0] eq 'refChr') { 11 | print join("\t", qw/refSpan qrySpan/, @ta)."\n"; 12 | next; 13 | } 14 | my $lenR = $ta[2]-$ta[1]+1; 15 | my $lenQ = $ta[5]-$ta[4]+1; 16 | print join("\t", $lenR, $lenQ, @ta)."\n"; 17 | } 18 | 19 | -------------------------------------------------------------------------------- /evolution_tools/compare_assemblies/byMUMmer/stat2_chk_gapCover.pl: -------------------------------------------------------------------------------- 1 | #!/usr/bin/perl 2 | use strict; 3 | use warnings; 4 | 5 | # ==> ngs.nlis <== 6 | # Key Length MatchStart MatchEnd MatchLen 7 | # WM97_scaffold11 169061 3288 3336 49 8 | # WM97_scaffold11 169061 14316 14502 187 9 | # WM97_scaffold11 169061 58876 58908 33 10 | # 11 | # ==> manual_alignments.txt.stat <== 12 | # ID1 WM97pbV0_000000F 16657 113511 96855 13 | # ID1 WM97pbV0_000000F 128093 139235 11143 14 | # ID1 WM97pbV0_000000F 181677 203420 21744 15 | # ID1 WM97pbV0_000000F 207866 208637 772 16 | 17 | !@ARGV and die "perl $0 ngs.nlis manual_alignments.txt.stat > ngs.nlis.coverTag\n"; 18 | my $nlis = shift; 19 | my $stat = shift; 20 | 21 | open F2,'<',"$stat" or die; 22 | my %covered; 23 | while (<F2>) { # load aligned [start, end] intervals per sequence ID from the .stat file 24 | chomp; 25 | my @ta = split(/\t/, $_); 26 | $ta[0] eq 'ID1' and next; 27 | push(@{$covered{$ta[1]}}, [@ta[2,3]]); 28 | } 29 | close F2; 30 | 31 | open F1,'<',"$nlis" or die; 32 | while (<F1>) { # tag each .nlis row with 1 when one loaded interval fully contains it, else 0 33 | chomp; 34 | my @ta = split(/\t/, $_); 35 | if ($ta[0] eq 'Key') { 36 | print join("\t", @ta, "Covered")."\n"; 37 | next; 38 | } 39 | my $is_cover = 0; 40 | for my $t1 (@{$covered{$ta[0]}}) { 41 | $t1->[0] <= $ta[2] and $t1->[1] >= $ta[3] and do { $is_cover = 1; last; }; 42 | } 43 | print join("\t", @ta, $is_cover)."\n"; 44 | } 45 | close F1; 46 | 47 | 48 | -------------------------------------------------------------------------------- /evolution_tools/compare_assemblies/mcscanTab_to_dupTxt.pl: -------------------------------------------------------------------------------- 1 | 
#!/usr/bin/perl 2 | use strict; 3 | use warnings; 4 | 5 | -t and !@ARGV and die "cat 02_ma_mo_byScf/ma_mo_byChr.coll.ks.tab.mo_mo | perl $0 > data/dup.txt\n"; 6 | 7 | while (<>) { 8 | chomp; 9 | my @ta=split(/\t/, $_); 10 | @ta = @ta[1,2,3,4,5,6,7]; 11 | $ta[0] eq 'Chrom1' and next; 12 | $ta[1]>$ta[2] and @ta[1,2]=@ta[2,1]; 13 | $ta[4]>$ta[5] and @ta[4,5]=@ta[5,4]; 14 | $ta[6] eq "-" and @ta[5,4]=@ta[4,5]; 15 | $ta[1]--; 16 | $ta[4]--; 17 | print join("\t", @ta[0..5] )."\n"; 18 | } 19 | 20 | -------------------------------------------------------------------------------- /evolution_tools/copy_number_var/README.md: -------------------------------------------------------------------------------- 1 | # Gene CNV analysis. 2 | 3 | ## Test by FET. 4 | - Prepare orthologous group count file. (`synFam.cnt`). First line shows accession names. 5 | - Prepare two-column meta file projecting accessions to populations. (`map.acc_pop`) 6 | - Test presence frequency change between two populations. 7 | 8 | ```sh 9 | Rscript compare_gene_expansion.r -a synFam.cnt -b map.acc_pop -f landrace -t cultivar -o landrace_to_cultivar.CNV.tbl 10 | Rscript compare_gene_expansion.r -a synFam.cnt -b map.acc_pop -f cordophanus -t landrace -o cordophanus_to_landrace.CNV.tbl 11 | ``` 12 | 13 | ## Combine test results. 14 | ```sh 15 | echo -e "landrace_to_cultivar.CNV.tbl\tland_cult" > meta.comparison_name 16 | echo -e "cordophanus_to_landrace.CNV.tbl\tCLC_land" >> meta.comparison_name 17 | 18 | perl combine_DEGs.pl meta.comparison_name 1 > combined.CNV.tbl 19 | ``` 20 | 21 | ## Identify gene families with true expansion enriched in one group. 22 | - Those gene families with contraction enriched in the other group should be removed. 23 | - For example, expanded in `CLV` instead of contracted in `CA`. 
24 | 25 | ```sh 26 | perl get_CLV_expansion.pl -final_label CLV_high -expan_label CLV_high CA_to_CLV.tbl > CA_to_CLV-CLV_expanded.tbl 27 | ``` 28 | 29 | -------------------------------------------------------------------------------- /evolution_tools/expansion_tools/01.clean_nwk.pl: -------------------------------------------------------------------------------- 1 | #!/usr/bin/perl 2 | use strict; 3 | use warnings; 4 | use Text::Balanced qw( extract_bracketed ); 5 | use LogInforSunhh; 6 | 7 | # http://www.perlmonks.org/?node_id=547596 8 | 9 | while (<>) { 10 | chomp; 11 | my $n = &rmStatNum($_); 12 | my $m = &fmtBranch($n); 13 | print "$m\n"; 14 | } 15 | 16 | # (((Sly:0.11746590,Vvi:0.07461103)0.7300:0.02307856,Ath:0.14512370)1.0000:0.03408003,(SpiOl:0.05289341,Bvu:0.04303894)1.0000:0.05605601); 17 | 18 | sub rmStatNum { 19 | while ($_[0] =~ s/\)\s*[\d.]+/)/g) { 20 | # while ($_[0] =~ s/:[\d.]+//g or $_[0] =~ s/\)\s*[\d.]+/)/g) { 21 | } 22 | return $_[0]; 23 | } 24 | sub fmtBranch { 25 | while ($_[0] =~ s/:(\d+\.\d+)/":" . 
int($1*100)/eg) { 26 | } 27 | return $_[0]; 28 | } 29 | -------------------------------------------------------------------------------- /evolution_tools/expansion_tools/01.prepare_ortho_to_tbl.pl: -------------------------------------------------------------------------------- 1 | #!/usr/bin/perl 2 | use strict; 3 | use warnings; 4 | use LogInforSunhh; 5 | use Getopt::Long; 6 | my %opts; 7 | GetOptions(\%opts, 8 | "help!", 9 | ); 10 | 11 | my $help_txt = < all_orthomcl.out.tab 14 | 15 | HH 16 | 17 | $opts{'help'} and &LogInforSunhh::usage($help_txt); 18 | -t and !@ARGV and &LogInforSunhh::usage($help_txt); 19 | 20 | print STDOUT join("\t", qw/OrthoGrpID TaxID GeneID/)."\n"; 21 | 22 | while (<>) { 23 | chomp; 24 | my @ta = split(/\t/, $_); 25 | my @tb = split(/\s+/, $ta[1]); 26 | my $grpID = &grpID( $ta[0] ); 27 | for my $tc (@tb) { 28 | $tc =~ m/^\s*$/ and next; 29 | $tc =~ m/^(\S+)\((\S+)\)$/ or &stopErr( "tc=[$tc]\n" ); 30 | my ($gid, $taxID) = ($1, $2); 31 | print STDOUT join("\t", $grpID, $taxID, $gid)."\n"; 32 | } 33 | } 34 | 35 | sub grpID { 36 | my $back = $_[0]; 37 | if ($back =~ m/^(\S+)\s*\(\s*\d+\s+genes?\s*,\s*\d+\s*taxa\s*\)/) { 38 | $back = $1 39 | } else { 40 | $back =~ s/\s/_/g; 41 | } 42 | return $back; 43 | } 44 | -------------------------------------------------------------------------------- /evolution_tools/expansion_tools/jn_gene_byIPR.pl: -------------------------------------------------------------------------------- 1 | #!/usr/bin/perl 2 | use strict; 3 | use warnings; 4 | use LogInforSunhh; 5 | use fileSunhh; 6 | 7 | !@ARGV and die "perl $0 in.iprV5.tsv\n"; 8 | 9 | my $fn = shift; 10 | open F,'<',"$fn" or die; 11 | my %h; 12 | while (&wantLineC(\*F)) { 13 | my @ta = &splitL("\t", $_); 14 | ( defined $ta[11] and $ta[11] ne '' ) or next; 15 | my $tk = "$ta[11] ($ta[12])"; 16 | $h{$tk}{$ta[0]} //= $.; 17 | } 18 | close F; 19 | print STDOUT join("\t", 'IPRID', $fn)."\n"; 20 | for my $iprK ( sort keys %h ) { 21 | my @tb = sort { $a cmp $b } 
keys %{$h{$iprK}}; 22 | print STDOUT join("\t", $iprK, scalar(@tb) . ":" . join(',', @tb))."\n"; 23 | } 24 | 25 | -------------------------------------------------------------------------------- /evolution_tools/ortho_tools/mk_sep_blastp_shell.sh: -------------------------------------------------------------------------------- 1 | inFa=$1 2 | dbFa=$2 3 | cutN=$3 4 | pref=$4 5 | 6 | if [ -n "$pref" ] 7 | then 8 | deal_fasta.pl $inFa -cut $cutN -cut_prefix $pref 9 | ls ${pref}_cutted/*.fasta | perl -e ' while (<>) { chomp; print "blastp -query $_ -out $_.blast -db all.fa -evalue 1e-5 -max_target_seqs 1000 -outfmt 6 -num_threads 3 \n"; } ' > cmd_list_blastp_${pref} 10 | echo "CMD: nohup run_cmd_in_batch.pl -cpuN 60 cmd_list_blastp_${pref} > scrn.sep_${pref}" 11 | else 12 | echo "bash $0 in.fa db_name cut_num cut_pref" 13 | echo "cat ${pref}_cutted/*.blast > all.blast" 14 | exit 15 | fi 16 | 17 | -------------------------------------------------------------------------------- /evolution_tools/ortho_tools/replace_all_blast_file.sh: -------------------------------------------------------------------------------- 1 | pid_toKill=$1 2 | file_toRM=$2 3 | file_toUse=$3 4 | if [ -n "$file_toUse" ] 5 | then 6 | rm $file_toRM 7 | chmod a-w $file_toUse 8 | cp -p $file_toUse $file_toRM 9 | kill -9 $pid_toKill 10 | else 11 | echo "" 12 | echo "bash $0 PID_toKill File_toRM File_toUse" 13 | echo "" 14 | exit; 15 | fi 16 | 17 | -------------------------------------------------------------------------------- /evolution_tools/structure/get_time_from_structScrn.pl: -------------------------------------------------------------------------------- 1 | #!/usr/bin/perl 2 | use strict; 3 | use warnings; 4 | 5 | -t and !@ARGV and die "perl $0 scrn.run_struct | less -S\n"; 6 | 7 | my @aa; 8 | while (<>) { 9 | m!CMD! 
or next; 10 | chomp; 11 | s!structure!\tstructure!; 12 | s!_(\d+)$!_$1\t$1!; 13 | my @ta = split(/\t/, $_); 14 | defined $ta[2] or die "bad line: $_\n"; 15 | push(@aa, [@ta]); 16 | } 17 | for (sort { $a->[2] <=> $b->[2] || $a->[2] cmp $b->[2] } @aa){ 18 | print join("\t", @$_)."\n"; 19 | } 20 | 21 | -------------------------------------------------------------------------------- /evolution_tools/structure/mv_result_files.pl: -------------------------------------------------------------------------------- 1 | use strict; 2 | use warnings; 3 | 4 | -t and !@ARGV and die "perl $0 in_result_list\n"; 5 | 6 | -d "all_results" or mkdir("all_results/"); 7 | my %h; 8 | my $suff; 9 | while (<>) { 10 | chomp; 11 | m!^\./structure_(\d+)/structure_(K\d+)! or die "$_\n"; 12 | $suff = "${1}_${2}"; 13 | my $bn=$_; 14 | $bn=~s!^.+/!!; 15 | my $new_f = "${suff}_${bn}"; 16 | defined $h{$new_f} and die "$new_f\n"; $h{$new_f} = 1; 17 | system "cp -p $_ all_results/$new_f"; 18 | } 19 | 20 | -------------------------------------------------------------------------------- /evolution_tools/structure/order_ClumppIndFileOut_byIndID.pl: -------------------------------------------------------------------------------- 1 | #!/usr/bin/perl 2 | use strict; 3 | use warnings; 4 | 5 | !@ARGV and die "perl $0 order_list ClumppIndFile.output > ClumppIndFile.output.srt\n"; 6 | 7 | my $f_order = shift; 8 | my @id_ord; 9 | my %id; 10 | open F,'<',"$f_order" or die; 11 | while () { 12 | chomp; 13 | my @ta= split(/\t/, $_); 14 | push(@id_ord, $ta[0]); 15 | defined $id{$ta[0]} and die "repeated ID [$ta[0]]\n"; 16 | $id{$ta[0]} = $.; 17 | } 18 | close F; 19 | 20 | my %lines; 21 | while (<>) { 22 | chomp; 23 | my $raw_line = $_; 24 | m!^\s*(\S+)! 
or die "$_\n"; 25 | my $ii = $1; 26 | defined $id{$ii} or do { warn "Skip missing ID [$ii]: $_\n"; next; }; 27 | defined $lines{$ii} and die "repeat ID line [$_]\n"; 28 | $lines{$ii} = $raw_line; 29 | } 30 | for (@id_ord) { 31 | defined $lines{$_} or do { warn "Skip bad ID [$_]\n"; next; }; 32 | print "$lines{$_}\n"; 33 | } 34 | 35 | # 1 1 (0) 1 : 0.0000 1.0000 36 | # 2 2 (2) 1 : 0.0000 1.0000 37 | # 3 3 (0) 1 : 0.0000 1.0000 38 | # 4 4 (0) 1 : 0.0000 1.0000 39 | # 5 5 (0) 1 : 0.0000 1.0000 40 | # 6 6 (0) 1 : 0.0000 1.0000 41 | 42 | 43 | -------------------------------------------------------------------------------- /evolution_tools/structure/shrt_col0.pl: -------------------------------------------------------------------------------- 1 | #!/usr/bin/perl 2 | use strict; 3 | use warnings; 4 | use LogInforSunhh; 5 | use Getopt::Long; 6 | my %opts; 7 | GetOptions(\%opts, 8 | "help!", 9 | "shrt_len:i", # 9 10 | "shrt_col:i", # 0 11 | ); 12 | 13 | $opts{'shrt_len'} //= 9; 14 | $opts{'shrt_col'} //= 0; 15 | 16 | my $help_txt = <) { 32 | chomp; 33 | my @ta = split(/\t/, $_); 34 | $ta[ $opts{'shrt_col'} ] = substr($ta[ $opts{'shrt_col'} ], 0, $opts{'shrt_len'}); 35 | my $tk = $ta[ $opts{'shrt_col'} ]; 36 | my $suff = "a"; 37 | while (defined $h{$tk}) { 38 | $suff++; 39 | $tk = "$ta[$opts{'shrt_col'}]$suff"; 40 | } 41 | $h{$tk} = 1; 42 | $ta[ $opts{'shrt_col'} ] = $tk; 43 | print join("\t", @ta)."\n"; 44 | } 45 | -------------------------------------------------------------------------------- /evolution_tools/structure/structure: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Sunhh/NGS_data_processing/7c202e9c150708c32c2d2a852bb7925adceb70f7/evolution_tools/structure/structure -------------------------------------------------------------------------------- /evolution_tools/vcf_tab/add_ref_as_indv_in_vcfTab.pl: -------------------------------------------------------------------------------- 1 | 
#!/usr/bin/perl 2 | use strict; 3 | use warnings; 4 | use Getopt::Long; 5 | my %opts; 6 | GetOptions(\%opts, 7 | "ref2id:s", 8 | ); 9 | ; 10 | 11 | -t and !@ARGV and die "perl $0 [-ref2id REF] in.vcfTab > out.vcfTab\n"; 12 | 13 | 14 | while (<>) { 15 | chomp; 16 | s!^(\S+\t\S+)\t(\S+)\t!! or die "bad line: $_\n"; 17 | if ($. == 1) { 18 | $opts{'ref2id'} //= $2; 19 | print STDOUT "$1\t$2\t$opts{'ref2id'}\t$_\n"; 20 | } else { 21 | print STDOUT "$1\t$2\t$2/$2\t$_\n"; 22 | } 23 | } 24 | 25 | -------------------------------------------------------------------------------- /file_type_based/Proc_Reads/adaptors.fa: -------------------------------------------------------------------------------- 1 | >junc_seq_01 illinoi 2 | CTGTCTCTTATACACATCT 3 | >junc_seq_01:RC1-19 illinoi 4 | AGATGTGTATAAGAGACAG 5 | 6 | >junc_seq_02 Silin-P1P33-MP2k Silin-P1-MP5k 7 | GGTCGATAACTTCGTATAATGTATGCTATACGAAGTTATACA 8 | >junc_seq_02:RC1-42 Silin-P1P33-MP2k Silin-P1P3-MP5k 9 | TGTATAACTTCGTATAGCATACATTATACGAAGTTATCGACC 10 | 11 | >junc_seq_03 P1P3-ec5k 12 | CGTATAACTTCGTATAATGTATGCTATACGAAGTTATACA 13 | >junc_seq_03:RC1-40 P1P3-cre5kec5k 14 | TGTATAACTTCGTATAGCATACATTATACGAAGTTATACG 15 | 16 | >junc_seq_04 P3-cre5k 17 | ATAACTTCGTATAATGTATGCTATACGAAGTTATACA 18 | >junc_seq_04:RC1-43 P3-cre5k 19 | TGTATAACTTCGTATAGCATACATTATACGAAGTTAT 20 | 21 | >pe_seq_R1_p1 22 | AGATCGGAAGAGCACACGTCTGAACTCCAGTCAC 23 | >pe_seq_R1_p2 24 | ATCTCGTATGCCGTCTTCTGCTTG 25 | >pe_seq_R2 26 | AGATCGGAAGAGCGTCGTGTAGGGAAAGAGTGTAGATCTCGGTGGTCGCCGTATCATT 27 | >pe_seq_R1_p1:RC1-34 28 | GTGACTGGAGTTCAGACGTGTGCTCTTCCGATCT 29 | >pe_seq_R1_p2:RC1-24 30 | CAAGCAGAAGACGGCATACGAGAT 31 | >pe_seq_R2:RC1-58 32 | AATGATACGGCGACCACCGAGATCTACACTCTTTCCCTACACGACGCTCTTCCGATCT 33 | 34 | -------------------------------------------------------------------------------- /file_type_based/Proc_Reads/cleanPE_byTrimmo.sh: -------------------------------------------------------------------------------- 1 | function exe_cmd { 2 | echo "[$(date)][CMD] $1" 3 
| eval "$1" 4 | echo "[$(date)][Rec] Done." 5 | } 6 | 7 | function tsmsg { 8 | echo "[$(date)]$1" 9 | } 10 | 11 | exe_java="java" 12 | exe_jar="/data/Sunhh/src/Assemble/Trimmomatic/Trimmomatic-0.32/trimmomatic-0.32.jar" 13 | 14 | cpuN=30 15 | adp_fas="/data/Sunhh/src/Assemble/Trimmomatic/Trimmomatic-0.32/adapters/TruSeq3-PE-2.fa" 16 | minLen=25 17 | 18 | para_jar="-threads $cpuN" 19 | para_PE="ILLUMINACLIP:$adp_fas:2:30:10:1:TRUE SLIDINGWINDOW:4:20 LEADING:3 TRAILING:3 MINLEN:$minLen" 20 | 21 | tsmsg "[Rec] All start." 22 | for inPref in HKC_15_20kb HKC_8_10kb HWB_15_20kb HWB_8_10kb 23 | do 24 | tsmsg "[Rec] Dealing with $inPref" 25 | inFq1="${inPref}.p1" 26 | inFq2="${inPref}.p2" 27 | oPref="${inPref}" 28 | logFile="log.${oPref}" 29 | # para_jarAdd="-trimlog $logFile" 30 | para_jarAdd="" 31 | cmd="$exe_java -jar $exe_jar PE $para_jar $para_jarAdd $inFq1 $inFq2 ${oPref}_pTr_R1.fq ${oPref}_sTr_R1.fq ${oPref}_pTr_R2.fq ${oPref}_sTr_R2.fq $para_PE" 32 | exe_cmd "$cmd" 33 | done 34 | 35 | tsmsg "[Rec] All done." 36 | 37 | 38 | -------------------------------------------------------------------------------- /file_type_based/Proc_Reads/cleanSE_byTrimmo.sh: -------------------------------------------------------------------------------- 1 | function exe_cmd { 2 | echo "[$(date)][CMD] $1" 3 | eval $1 4 | echo "[$(date)][Rec] Done." 
5 | } 6 | 7 | function tsmsg { 8 | echo "[$(date)]$1" 9 | } 10 | 11 | exe_java="java" 12 | exe_jar="/data/Sunhh/src/Assemble/Trimmomatic/Trimmomatic-0.32/trimmomatic-0.32.jar" 13 | 14 | cpuN=30 15 | minLen=25 16 | 17 | para_jar="-threads $cpuN" 18 | para_PE="ILLUMINACLIP:$adp_fas:2:30:10:1 SLIDINGWINDOW:4:20 LEADING:3 TRAILING:3 MINLEN:$minLen" 19 | adp_fas="/data/Sunhh/src/Assemble/Trimmomatic/Trimmomatic-0.32/adapters/TruSeq3-PE-2.fa" 20 | 21 | para_SE="ILLUMINACLIP:$adp_fas:2:30:10 SLIDINGWINDOW:4:20 LEADING:3 TRAILING:3 MINLEN:$minLen" 22 | adp_fas="/data/Sunhh/src/Assemble/Trimmomatic/Trimmomatic-0.32/adapters/TruSeq3-SE.fa" 23 | 24 | tsmsg "[Rec] All start." 25 | for inPref in HKC_15_20kb HKC_8_10kb HWB_15_20kb HWB_8_10kb 26 | do 27 | tsmsg "[Rec] Dealing with $inPref" 28 | inFq1="${inPref}.p1" 29 | inFq2="${inPref}.p2" 30 | oPref="${inPref}" 31 | logFile="log.${oPref}" 32 | # para_jarAdd="-trimlog $logFile" 33 | para_jarAdd="" 34 | cmd="$exe_java -jar $exe_jar SE $para_jar $para_jarAdd $inFq1 $inFq2 ${oPref}_pTr_R1.fq ${oPref}_sTr_R1.fq ${oPref}_pTr_R2.fq ${oPref}_sTr_R2.fq $para_SE" 35 | exe_cmd "$cmd" 36 | done 37 | 38 | tsmsg "[Rec] All done." 
39 | 40 | 41 | -------------------------------------------------------------------------------- /file_type_based/Proc_Reads/illumina_adapters.fa: -------------------------------------------------------------------------------- 1 | >multiplexing-forward 2 | GATCGGAAGAGCACACGTCT 3 | >solexa-forward 4 | AGATCGGAAGAGCGTCGTGTAGGGAAAGAGTGT 5 | >truseq-forward-contam 6 | AGATCGGAAGAGCACACGTCTGAACTCCAGTCAC 7 | >truseq-reverse-contam 8 | AGATCGGAAGAGCGTCGTGTAGGGAAAGAGTGTA 9 | >nextera-forward-read-contam 10 | CTGTCTCTTATACACATCTCCGAGCCCACGAGAC 11 | >nextera-reverse-read-contam 12 | CTGTCTCTTATACACATCTGACGCTGCCGACGA 13 | >solexa-reverse 14 | AGATCGGAAGAGCGGTTCAGCAGGAATGCCGAG 15 | -------------------------------------------------------------------------------- /file_type_based/Proc_Reads/polyAT_adp.fa: -------------------------------------------------------------------------------- 1 | >polyA 2 | AAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA 3 | >polyT 4 | TTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTT 5 | -------------------------------------------------------------------------------- /file_type_based/Proc_Reads/run_ndupB.sh: -------------------------------------------------------------------------------- 1 | pl_dropB="$HOME/tools/github/NGS_data_processing/drop_dup_both_end.pl" 2 | 3 | subseqL=0 4 | subseqS=0 5 | 6 | # for inPre in NSP306_3hb NSP306_5hb NSP306_1kb NSP306_5kb NSP306_10kb NSP306_15kb 7 | for inPre in NSP306_3hbr2 8 | do 9 | outPre=$inPre 10 | inFq1="${inPre}_R1.fastq.gz" 11 | inFq2="${inPre}_R2.fastq.gz" 12 | cmd="perl $pl_dropB -opre $outPre $inFq1 $inFq2 -subseq $subseqL -subseqS $subseqS -rcDup" 13 | echo "[Rec][$(date)] $cmd" 14 | eval $cmd 15 | done 16 | echo "[Rec][$(date)] Finished." 
17 | 18 | # drop_dup_both_end.pl -opre CG_MiSeq02 CG_MiSeq02_R1.fastq.gz CG_MiSeq02_R2.fastq.gz -subseq 100 19 | 20 | -------------------------------------------------------------------------------- /file_type_based/Proc_Reads/run_rmRRNA.pl: -------------------------------------------------------------------------------- 1 | #!/usr/bin/perl 2 | use strict; 3 | use warnings; 4 | use LogInforSunhh; 5 | 6 | -t and !@ARGV and die "perl $0 inLis\n"; 7 | 8 | my $dbRRNA = '/share1/db_bowtie/rRNA_silva111'; 9 | my $cpuN = 10; 10 | 11 | my $pl_extractFq = '/home/Sunhh/tools/github/NGS_data_processing/extract_fq_by_list.pl'; 12 | 13 | 14 | while (<>) { 15 | chomp; 16 | my @ta = split(/\t/, $_); 17 | my $pref = "$ta[0]"; 18 | my $fqFile = "${pref}_highQ.single"; 19 | my $oFqFile="${pref}_rmRRNA.fq"; 20 | &tsmsg("[Rec] Dealing with [$fqFile]\n"); 21 | &exeCmd("bowtie -v 3 -k 1 -S -p $cpuN $dbRRNA $fqFile | samtools view -S -F 4 -hb -o $fqFile.bam -"); 22 | &exeCmd("samtools view $fqFile.bam | cut -f 1 > $fqFile.bam.rd"); 23 | &exeCmd("perl $pl_extractFq -mode drop -rdKey -refLis $fqFile.bam.rd -srcFq $fqFile -outFq $oFqFile"); 24 | } 25 | &tsmsg("[Rec] All done.\n"); 26 | -------------------------------------------------------------------------------- /file_type_based/Proc_Reads/run_trimmoSE.pl: -------------------------------------------------------------------------------- 1 | #!/usr/bin/perl 2 | use strict; 3 | use warnings; 4 | use LogInforSunhh; 5 | 6 | !@ARGV and die "perl $0 in_RS.fq out_RS.fq\n"; 7 | 8 | my $inRSFq = shift; 9 | my $outRSFq = shift; 10 | 11 | my $exe_java = '/share/nas2/xigua/sunhonghe/src/java/jre1.8.0_131/bin/java'; 12 | my $jar_trim = '/share/nas2/xigua/sunhonghe/src/reads/trimmo/Trimmomatic-0.36/trimmomatic-0.36.jar'; 13 | my $dir_trim = '/data/Sunhh/src/general/trimmomatic/Trimmomatic-0.38/'; 14 | my $fn_adp = "$dir_trim/adapters/TruSeq3-SE.fa"; 15 | my $para_trim = " ILLUMINACLIP:${fn_adp}\:2:30:10 SLIDINGWINDOW:4:20 LEADING:3 TRAILING:3 MINLEN:40 
"; 16 | 17 | $exe_java = '/usr/lib/jvm/java-11-openjdk-amd64/bin/java'; 18 | $jar_trim = '/data/Sunhh/src/general/trimmomatic/Trimmomatic-0.38/trimmomatic-0.38.jar'; 19 | $dir_trim = '/data/Sunhh/src/general/trimmomatic/Trimmomatic-0.38/'; 20 | $fn_adp = "$dir_trim/adapters/TruSeq3-SE.fa"; 21 | $para_trim = " ILLUMINACLIP:${fn_adp}\:2:30:10 SLIDINGWINDOW:4:20 LEADING:3 TRAILING:3 MINLEN:40 "; 22 | 23 | my $cmd = ""; 24 | $cmd .= "$exe_java "; 25 | $cmd .= " -jar $jar_trim "; 26 | $cmd .= " SE "; 27 | $cmd .= " -threads 2 "; 28 | $cmd .= " $inRSFq $outRSFq "; 29 | $cmd .= " $para_trim "; 30 | 31 | &exeCmd_1cmd($cmd); 32 | &exeCmd_1cmd("gzip $outRSFq"); 33 | 34 | 35 | -------------------------------------------------------------------------------- /file_type_based/Proc_Reads/trimmomatic/AUTHORS.jbzip2: -------------------------------------------------------------------------------- 1 | Matthew J. Francis 2 | -------------------------------------------------------------------------------- /file_type_based/Proc_Reads/trimmomatic/LICENCE.jbzip2: -------------------------------------------------------------------------------- 1 | Copyright (c) 2010 Matthew J. Francis and Contributors of the jbzip2 Project 2 | 3 | Permission is hereby granted, free of charge, to any person obtaining a copy 4 | of this software and associated documentation files (the "Software"), to deal 5 | in the Software without restriction, including without limitation the rights 6 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 7 | copies of the Software, and to permit persons to whom the Software is 8 | furnished to do so, subject to the following conditions: 9 | 10 | The above copyright notice and this permission notice shall be included in 11 | all copies or substantial portions of the Software. 
12 | 13 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 14 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 15 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 16 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 17 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 18 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN 19 | THE SOFTWARE. 20 | -------------------------------------------------------------------------------- /file_type_based/Proc_Reads/trimmomatic/META-INF/MANIFEST.MF: -------------------------------------------------------------------------------- 1 | Manifest-Version: 1.0 2 | Ant-Version: Apache Ant 1.8.4 3 | Created-By: 1.7.0_40-mockbuild_2013_10_02_16_56-b00 (Oracle Corporatio 4 | n) 5 | Main-Class: org.usadellab.trimmomatic.Trimmomatic 6 | 7 | -------------------------------------------------------------------------------- /file_type_based/Proc_Reads/trimmomatic/org/itadaki/bzip2/BZip2BlockCompressor.class: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Sunhh/NGS_data_processing/7c202e9c150708c32c2d2a852bb7925adceb70f7/file_type_based/Proc_Reads/trimmomatic/org/itadaki/bzip2/BZip2BlockCompressor.class -------------------------------------------------------------------------------- /file_type_based/Proc_Reads/trimmomatic/org/itadaki/bzip2/BZip2BlockDecompressor.class: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Sunhh/NGS_data_processing/7c202e9c150708c32c2d2a852bb7925adceb70f7/file_type_based/Proc_Reads/trimmomatic/org/itadaki/bzip2/BZip2BlockDecompressor.class -------------------------------------------------------------------------------- /file_type_based/Proc_Reads/trimmomatic/org/itadaki/bzip2/BZip2Constants.class: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/Sunhh/NGS_data_processing/7c202e9c150708c32c2d2a852bb7925adceb70f7/file_type_based/Proc_Reads/trimmomatic/org/itadaki/bzip2/BZip2Constants.class -------------------------------------------------------------------------------- /file_type_based/Proc_Reads/trimmomatic/org/itadaki/bzip2/BZip2DivSufSort$PartitionResult.class: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Sunhh/NGS_data_processing/7c202e9c150708c32c2d2a852bb7925adceb70f7/file_type_based/Proc_Reads/trimmomatic/org/itadaki/bzip2/BZip2DivSufSort$PartitionResult.class -------------------------------------------------------------------------------- /file_type_based/Proc_Reads/trimmomatic/org/itadaki/bzip2/BZip2DivSufSort$StackEntry.class: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Sunhh/NGS_data_processing/7c202e9c150708c32c2d2a852bb7925adceb70f7/file_type_based/Proc_Reads/trimmomatic/org/itadaki/bzip2/BZip2DivSufSort$StackEntry.class -------------------------------------------------------------------------------- /file_type_based/Proc_Reads/trimmomatic/org/itadaki/bzip2/BZip2DivSufSort$TRBudget.class: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Sunhh/NGS_data_processing/7c202e9c150708c32c2d2a852bb7925adceb70f7/file_type_based/Proc_Reads/trimmomatic/org/itadaki/bzip2/BZip2DivSufSort$TRBudget.class -------------------------------------------------------------------------------- /file_type_based/Proc_Reads/trimmomatic/org/itadaki/bzip2/BZip2DivSufSort.class: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/Sunhh/NGS_data_processing/7c202e9c150708c32c2d2a852bb7925adceb70f7/file_type_based/Proc_Reads/trimmomatic/org/itadaki/bzip2/BZip2DivSufSort.class -------------------------------------------------------------------------------- /file_type_based/Proc_Reads/trimmomatic/org/itadaki/bzip2/BZip2HuffmanStageDecoder.class: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Sunhh/NGS_data_processing/7c202e9c150708c32c2d2a852bb7925adceb70f7/file_type_based/Proc_Reads/trimmomatic/org/itadaki/bzip2/BZip2HuffmanStageDecoder.class -------------------------------------------------------------------------------- /file_type_based/Proc_Reads/trimmomatic/org/itadaki/bzip2/BZip2HuffmanStageEncoder.class: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Sunhh/NGS_data_processing/7c202e9c150708c32c2d2a852bb7925adceb70f7/file_type_based/Proc_Reads/trimmomatic/org/itadaki/bzip2/BZip2HuffmanStageEncoder.class -------------------------------------------------------------------------------- /file_type_based/Proc_Reads/trimmomatic/org/itadaki/bzip2/BZip2InputStream.class: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Sunhh/NGS_data_processing/7c202e9c150708c32c2d2a852bb7925adceb70f7/file_type_based/Proc_Reads/trimmomatic/org/itadaki/bzip2/BZip2InputStream.class -------------------------------------------------------------------------------- /file_type_based/Proc_Reads/trimmomatic/org/itadaki/bzip2/BZip2OutputStream.class: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Sunhh/NGS_data_processing/7c202e9c150708c32c2d2a852bb7925adceb70f7/file_type_based/Proc_Reads/trimmomatic/org/itadaki/bzip2/BZip2OutputStream.class -------------------------------------------------------------------------------- 
/file_type_based/Proc_Reads/trimmomatic/org/itadaki/bzip2/BitInputStream.class: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Sunhh/NGS_data_processing/7c202e9c150708c32c2d2a852bb7925adceb70f7/file_type_based/Proc_Reads/trimmomatic/org/itadaki/bzip2/BitInputStream.class -------------------------------------------------------------------------------- /file_type_based/Proc_Reads/trimmomatic/org/itadaki/bzip2/BitOutputStream.class: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Sunhh/NGS_data_processing/7c202e9c150708c32c2d2a852bb7925adceb70f7/file_type_based/Proc_Reads/trimmomatic/org/itadaki/bzip2/BitOutputStream.class -------------------------------------------------------------------------------- /file_type_based/Proc_Reads/trimmomatic/org/itadaki/bzip2/CRC32.class: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Sunhh/NGS_data_processing/7c202e9c150708c32c2d2a852bb7925adceb70f7/file_type_based/Proc_Reads/trimmomatic/org/itadaki/bzip2/CRC32.class -------------------------------------------------------------------------------- /file_type_based/Proc_Reads/trimmomatic/org/itadaki/bzip2/HuffmanAllocator.class: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Sunhh/NGS_data_processing/7c202e9c150708c32c2d2a852bb7925adceb70f7/file_type_based/Proc_Reads/trimmomatic/org/itadaki/bzip2/HuffmanAllocator.class -------------------------------------------------------------------------------- /file_type_based/Proc_Reads/trimmomatic/org/itadaki/bzip2/MoveToFront.class: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/Sunhh/NGS_data_processing/7c202e9c150708c32c2d2a852bb7925adceb70f7/file_type_based/Proc_Reads/trimmomatic/org/itadaki/bzip2/MoveToFront.class -------------------------------------------------------------------------------- /file_type_based/Proc_Reads/trimmomatic/org/usadellab/trimmomatic/Pairomatic.class: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Sunhh/NGS_data_processing/7c202e9c150708c32c2d2a852bb7925adceb70f7/file_type_based/Proc_Reads/trimmomatic/org/usadellab/trimmomatic/Pairomatic.class -------------------------------------------------------------------------------- /file_type_based/Proc_Reads/trimmomatic/org/usadellab/trimmomatic/TrimStats.class: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Sunhh/NGS_data_processing/7c202e9c150708c32c2d2a852bb7925adceb70f7/file_type_based/Proc_Reads/trimmomatic/org/usadellab/trimmomatic/TrimStats.class -------------------------------------------------------------------------------- /file_type_based/Proc_Reads/trimmomatic/org/usadellab/trimmomatic/Trimmomatic.class: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Sunhh/NGS_data_processing/7c202e9c150708c32c2d2a852bb7925adceb70f7/file_type_based/Proc_Reads/trimmomatic/org/usadellab/trimmomatic/Trimmomatic.class -------------------------------------------------------------------------------- /file_type_based/Proc_Reads/trimmomatic/org/usadellab/trimmomatic/TrimmomaticPE.class: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Sunhh/NGS_data_processing/7c202e9c150708c32c2d2a852bb7925adceb70f7/file_type_based/Proc_Reads/trimmomatic/org/usadellab/trimmomatic/TrimmomaticPE.class -------------------------------------------------------------------------------- 
/file_type_based/Proc_Reads/trimmomatic/org/usadellab/trimmomatic/TrimmomaticSE.class: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Sunhh/NGS_data_processing/7c202e9c150708c32c2d2a852bb7925adceb70f7/file_type_based/Proc_Reads/trimmomatic/org/usadellab/trimmomatic/TrimmomaticSE.class -------------------------------------------------------------------------------- /file_type_based/Proc_Reads/trimmomatic/org/usadellab/trimmomatic/fasta/FastaParser.class: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Sunhh/NGS_data_processing/7c202e9c150708c32c2d2a852bb7925adceb70f7/file_type_based/Proc_Reads/trimmomatic/org/usadellab/trimmomatic/fasta/FastaParser.class -------------------------------------------------------------------------------- /file_type_based/Proc_Reads/trimmomatic/org/usadellab/trimmomatic/fasta/FastaRecord.class: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Sunhh/NGS_data_processing/7c202e9c150708c32c2d2a852bb7925adceb70f7/file_type_based/Proc_Reads/trimmomatic/org/usadellab/trimmomatic/fasta/FastaRecord.class -------------------------------------------------------------------------------- /file_type_based/Proc_Reads/trimmomatic/org/usadellab/trimmomatic/fasta/FastaSerializer.class: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Sunhh/NGS_data_processing/7c202e9c150708c32c2d2a852bb7925adceb70f7/file_type_based/Proc_Reads/trimmomatic/org/usadellab/trimmomatic/fasta/FastaSerializer.class -------------------------------------------------------------------------------- /file_type_based/Proc_Reads/trimmomatic/org/usadellab/trimmomatic/fastq/FastqParser.class: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/Sunhh/NGS_data_processing/7c202e9c150708c32c2d2a852bb7925adceb70f7/file_type_based/Proc_Reads/trimmomatic/org/usadellab/trimmomatic/fastq/FastqParser.class -------------------------------------------------------------------------------- /file_type_based/Proc_Reads/trimmomatic/org/usadellab/trimmomatic/fastq/FastqRecord.class: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Sunhh/NGS_data_processing/7c202e9c150708c32c2d2a852bb7925adceb70f7/file_type_based/Proc_Reads/trimmomatic/org/usadellab/trimmomatic/fastq/FastqRecord.class -------------------------------------------------------------------------------- /file_type_based/Proc_Reads/trimmomatic/org/usadellab/trimmomatic/fastq/FastqSerializer.class: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Sunhh/NGS_data_processing/7c202e9c150708c32c2d2a852bb7925adceb70f7/file_type_based/Proc_Reads/trimmomatic/org/usadellab/trimmomatic/fastq/FastqSerializer.class -------------------------------------------------------------------------------- /file_type_based/Proc_Reads/trimmomatic/org/usadellab/trimmomatic/threading/BlockOfRecords.class: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Sunhh/NGS_data_processing/7c202e9c150708c32c2d2a852bb7925adceb70f7/file_type_based/Proc_Reads/trimmomatic/org/usadellab/trimmomatic/threading/BlockOfRecords.class -------------------------------------------------------------------------------- /file_type_based/Proc_Reads/trimmomatic/org/usadellab/trimmomatic/threading/BlockOfWork.class: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Sunhh/NGS_data_processing/7c202e9c150708c32c2d2a852bb7925adceb70f7/file_type_based/Proc_Reads/trimmomatic/org/usadellab/trimmomatic/threading/BlockOfWork.class 
-------------------------------------------------------------------------------- /file_type_based/Proc_Reads/trimmomatic/org/usadellab/trimmomatic/threading/ParserWorker.class: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Sunhh/NGS_data_processing/7c202e9c150708c32c2d2a852bb7925adceb70f7/file_type_based/Proc_Reads/trimmomatic/org/usadellab/trimmomatic/threading/ParserWorker.class -------------------------------------------------------------------------------- /file_type_based/Proc_Reads/trimmomatic/org/usadellab/trimmomatic/threading/SerializerWorker.class: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Sunhh/NGS_data_processing/7c202e9c150708c32c2d2a852bb7925adceb70f7/file_type_based/Proc_Reads/trimmomatic/org/usadellab/trimmomatic/threading/SerializerWorker.class -------------------------------------------------------------------------------- /file_type_based/Proc_Reads/trimmomatic/org/usadellab/trimmomatic/threading/TrimLogRecord.class: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Sunhh/NGS_data_processing/7c202e9c150708c32c2d2a852bb7925adceb70f7/file_type_based/Proc_Reads/trimmomatic/org/usadellab/trimmomatic/threading/TrimLogRecord.class -------------------------------------------------------------------------------- /file_type_based/Proc_Reads/trimmomatic/org/usadellab/trimmomatic/threading/TrimLogWorker.class: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Sunhh/NGS_data_processing/7c202e9c150708c32c2d2a852bb7925adceb70f7/file_type_based/Proc_Reads/trimmomatic/org/usadellab/trimmomatic/threading/TrimLogWorker.class -------------------------------------------------------------------------------- 
/file_type_based/Proc_Reads/trimmomatic/org/usadellab/trimmomatic/threading/TrimStatsWorker.class: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Sunhh/NGS_data_processing/7c202e9c150708c32c2d2a852bb7925adceb70f7/file_type_based/Proc_Reads/trimmomatic/org/usadellab/trimmomatic/threading/TrimStatsWorker.class -------------------------------------------------------------------------------- /file_type_based/Proc_Reads/trimmomatic/org/usadellab/trimmomatic/trim/AbstractSingleRecordTrimmer.class: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Sunhh/NGS_data_processing/7c202e9c150708c32c2d2a852bb7925adceb70f7/file_type_based/Proc_Reads/trimmomatic/org/usadellab/trimmomatic/trim/AbstractSingleRecordTrimmer.class -------------------------------------------------------------------------------- /file_type_based/Proc_Reads/trimmomatic/org/usadellab/trimmomatic/trim/AvgQualTrimmer.class: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Sunhh/NGS_data_processing/7c202e9c150708c32c2d2a852bb7925adceb70f7/file_type_based/Proc_Reads/trimmomatic/org/usadellab/trimmomatic/trim/AvgQualTrimmer.class -------------------------------------------------------------------------------- /file_type_based/Proc_Reads/trimmomatic/org/usadellab/trimmomatic/trim/BarcodeSplitter.class: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Sunhh/NGS_data_processing/7c202e9c150708c32c2d2a852bb7925adceb70f7/file_type_based/Proc_Reads/trimmomatic/org/usadellab/trimmomatic/trim/BarcodeSplitter.class -------------------------------------------------------------------------------- /file_type_based/Proc_Reads/trimmomatic/org/usadellab/trimmomatic/trim/CropTrimmer.class: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/Sunhh/NGS_data_processing/7c202e9c150708c32c2d2a852bb7925adceb70f7/file_type_based/Proc_Reads/trimmomatic/org/usadellab/trimmomatic/trim/CropTrimmer.class -------------------------------------------------------------------------------- /file_type_based/Proc_Reads/trimmomatic/org/usadellab/trimmomatic/trim/HeadCropTrimmer.class: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Sunhh/NGS_data_processing/7c202e9c150708c32c2d2a852bb7925adceb70f7/file_type_based/Proc_Reads/trimmomatic/org/usadellab/trimmomatic/trim/HeadCropTrimmer.class -------------------------------------------------------------------------------- /file_type_based/Proc_Reads/trimmomatic/org/usadellab/trimmomatic/trim/IlluminaClippingTrimmer$1.class: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Sunhh/NGS_data_processing/7c202e9c150708c32c2d2a852bb7925adceb70f7/file_type_based/Proc_Reads/trimmomatic/org/usadellab/trimmomatic/trim/IlluminaClippingTrimmer$1.class -------------------------------------------------------------------------------- /file_type_based/Proc_Reads/trimmomatic/org/usadellab/trimmomatic/trim/IlluminaClippingTrimmer$IlluminaClippingSeq.class: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Sunhh/NGS_data_processing/7c202e9c150708c32c2d2a852bb7925adceb70f7/file_type_based/Proc_Reads/trimmomatic/org/usadellab/trimmomatic/trim/IlluminaClippingTrimmer$IlluminaClippingSeq.class -------------------------------------------------------------------------------- /file_type_based/Proc_Reads/trimmomatic/org/usadellab/trimmomatic/trim/IlluminaClippingTrimmer$IlluminaLongClippingSeq.class: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/Sunhh/NGS_data_processing/7c202e9c150708c32c2d2a852bb7925adceb70f7/file_type_based/Proc_Reads/trimmomatic/org/usadellab/trimmomatic/trim/IlluminaClippingTrimmer$IlluminaLongClippingSeq.class -------------------------------------------------------------------------------- /file_type_based/Proc_Reads/trimmomatic/org/usadellab/trimmomatic/trim/IlluminaClippingTrimmer$IlluminaMediumClippingSeq.class: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Sunhh/NGS_data_processing/7c202e9c150708c32c2d2a852bb7925adceb70f7/file_type_based/Proc_Reads/trimmomatic/org/usadellab/trimmomatic/trim/IlluminaClippingTrimmer$IlluminaMediumClippingSeq.class -------------------------------------------------------------------------------- /file_type_based/Proc_Reads/trimmomatic/org/usadellab/trimmomatic/trim/IlluminaClippingTrimmer$IlluminaPrefixPair.class: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Sunhh/NGS_data_processing/7c202e9c150708c32c2d2a852bb7925adceb70f7/file_type_based/Proc_Reads/trimmomatic/org/usadellab/trimmomatic/trim/IlluminaClippingTrimmer$IlluminaPrefixPair.class -------------------------------------------------------------------------------- /file_type_based/Proc_Reads/trimmomatic/org/usadellab/trimmomatic/trim/IlluminaClippingTrimmer$IlluminaShortClippingSeq.class: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Sunhh/NGS_data_processing/7c202e9c150708c32c2d2a852bb7925adceb70f7/file_type_based/Proc_Reads/trimmomatic/org/usadellab/trimmomatic/trim/IlluminaClippingTrimmer$IlluminaShortClippingSeq.class -------------------------------------------------------------------------------- /file_type_based/Proc_Reads/trimmomatic/org/usadellab/trimmomatic/trim/IlluminaClippingTrimmer.class: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/Sunhh/NGS_data_processing/7c202e9c150708c32c2d2a852bb7925adceb70f7/file_type_based/Proc_Reads/trimmomatic/org/usadellab/trimmomatic/trim/IlluminaClippingTrimmer.class -------------------------------------------------------------------------------- /file_type_based/Proc_Reads/trimmomatic/org/usadellab/trimmomatic/trim/LeadingTrimmer.class: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Sunhh/NGS_data_processing/7c202e9c150708c32c2d2a852bb7925adceb70f7/file_type_based/Proc_Reads/trimmomatic/org/usadellab/trimmomatic/trim/LeadingTrimmer.class -------------------------------------------------------------------------------- /file_type_based/Proc_Reads/trimmomatic/org/usadellab/trimmomatic/trim/MaximumInformationTrimmer.class: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Sunhh/NGS_data_processing/7c202e9c150708c32c2d2a852bb7925adceb70f7/file_type_based/Proc_Reads/trimmomatic/org/usadellab/trimmomatic/trim/MaximumInformationTrimmer.class -------------------------------------------------------------------------------- /file_type_based/Proc_Reads/trimmomatic/org/usadellab/trimmomatic/trim/MinLenTrimmer.class: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Sunhh/NGS_data_processing/7c202e9c150708c32c2d2a852bb7925adceb70f7/file_type_based/Proc_Reads/trimmomatic/org/usadellab/trimmomatic/trim/MinLenTrimmer.class -------------------------------------------------------------------------------- /file_type_based/Proc_Reads/trimmomatic/org/usadellab/trimmomatic/trim/SlidingWindowTrimmer.class: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/Sunhh/NGS_data_processing/7c202e9c150708c32c2d2a852bb7925adceb70f7/file_type_based/Proc_Reads/trimmomatic/org/usadellab/trimmomatic/trim/SlidingWindowTrimmer.class -------------------------------------------------------------------------------- /file_type_based/Proc_Reads/trimmomatic/org/usadellab/trimmomatic/trim/ToPhred33Trimmer.class: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Sunhh/NGS_data_processing/7c202e9c150708c32c2d2a852bb7925adceb70f7/file_type_based/Proc_Reads/trimmomatic/org/usadellab/trimmomatic/trim/ToPhred33Trimmer.class -------------------------------------------------------------------------------- /file_type_based/Proc_Reads/trimmomatic/org/usadellab/trimmomatic/trim/ToPhred64Trimmer.class: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Sunhh/NGS_data_processing/7c202e9c150708c32c2d2a852bb7925adceb70f7/file_type_based/Proc_Reads/trimmomatic/org/usadellab/trimmomatic/trim/ToPhred64Trimmer.class -------------------------------------------------------------------------------- /file_type_based/Proc_Reads/trimmomatic/org/usadellab/trimmomatic/trim/TrailingTrimmer.class: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Sunhh/NGS_data_processing/7c202e9c150708c32c2d2a852bb7925adceb70f7/file_type_based/Proc_Reads/trimmomatic/org/usadellab/trimmomatic/trim/TrailingTrimmer.class -------------------------------------------------------------------------------- /file_type_based/Proc_Reads/trimmomatic/org/usadellab/trimmomatic/trim/Trimmer.class: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Sunhh/NGS_data_processing/7c202e9c150708c32c2d2a852bb7925adceb70f7/file_type_based/Proc_Reads/trimmomatic/org/usadellab/trimmomatic/trim/Trimmer.class 
-------------------------------------------------------------------------------- /file_type_based/Proc_Reads/trimmomatic/org/usadellab/trimmomatic/trim/TrimmerFactory.class: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Sunhh/NGS_data_processing/7c202e9c150708c32c2d2a852bb7925adceb70f7/file_type_based/Proc_Reads/trimmomatic/org/usadellab/trimmomatic/trim/TrimmerFactory.class -------------------------------------------------------------------------------- /file_type_based/Proc_Reads/trimmomatic/org/usadellab/trimmomatic/util/ConcatGZIPInputStream$1.class: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Sunhh/NGS_data_processing/7c202e9c150708c32c2d2a852bb7925adceb70f7/file_type_based/Proc_Reads/trimmomatic/org/usadellab/trimmomatic/util/ConcatGZIPInputStream$1.class -------------------------------------------------------------------------------- /file_type_based/Proc_Reads/trimmomatic/org/usadellab/trimmomatic/util/ConcatGZIPInputStream$GZIPHelperInputStream.class: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Sunhh/NGS_data_processing/7c202e9c150708c32c2d2a852bb7925adceb70f7/file_type_based/Proc_Reads/trimmomatic/org/usadellab/trimmomatic/util/ConcatGZIPInputStream$GZIPHelperInputStream.class -------------------------------------------------------------------------------- /file_type_based/Proc_Reads/trimmomatic/org/usadellab/trimmomatic/util/ConcatGZIPInputStream.class: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Sunhh/NGS_data_processing/7c202e9c150708c32c2d2a852bb7925adceb70f7/file_type_based/Proc_Reads/trimmomatic/org/usadellab/trimmomatic/util/ConcatGZIPInputStream.class -------------------------------------------------------------------------------- 
/file_type_based/Proc_Reads/trimmomatic/org/usadellab/trimmomatic/util/PositionTrackingInputStream.class: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Sunhh/NGS_data_processing/7c202e9c150708c32c2d2a852bb7925adceb70f7/file_type_based/Proc_Reads/trimmomatic/org/usadellab/trimmomatic/util/PositionTrackingInputStream.class -------------------------------------------------------------------------------- /file_type_based/Proc_Reads/trimmomatic/trimmomatic.jar: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Sunhh/NGS_data_processing/7c202e9c150708c32c2d2a852bb7925adceb70f7/file_type_based/Proc_Reads/trimmomatic/trimmomatic.jar -------------------------------------------------------------------------------- /file_type_based/Proc_Reads/using_subfunc.R.rm_polyAT.R: -------------------------------------------------------------------------------- 1 | ####### clean PolyAT SE 2 | 3 | source("./using_subfunc.R") 4 | pref_lis <- read.table("pref_list", header=F, stringsAsFactors=F) 5 | 6 | for ( i in 1:nrow(pref_lis) ) { 7 | inFq1 <- paste0( pref_lis$V1[i], '_R1.fq' , sep='' ) 8 | oFq1 <- paste0( pref_lis$V1[i], '_trimAT.fq', sep='' ) 9 | adp1 <- 'AAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA' 10 | myseq <- adp1 11 | myseq_comp <- chartr('ATGC', 'TACG', myseq) 12 | substr( myseq[1], 1:nchar(myseq[1]), 1:nchar(myseq[1]) ) 13 | x <- strsplit( myseq_comp, '' ) 14 | x <- lapply(x, rev) 15 | myseq_revcomp <- sapply(x, paste, collapse='') 16 | adp1 <- c( adp1, myseq_revcomp ) 17 | clean.pe.fq.file( inFqName1= inFq1, outFqName1= oFq1, adaptor1= adp1, RdPerYield= 10e6 , qual.opts=list( min.qual=0 ) ) 18 | } 19 | 20 | -------------------------------------------------------------------------------- /file_type_based/Proc_Reads/using_subfunc.R.rm_polyAT_useRight.R: -------------------------------------------------------------------------------- 1 | ####### clean 
PolyAT SE 2 | 3 | source("./using_subfunc.R") 4 | pref_lis <- read.table("pref_list", header=F, stringsAsFactors=F) 5 | 6 | for ( i in 1:nrow(pref_lis) ) { 7 | inFq1 <- paste0( pref_lis$V1[i], '_R1.fq' , sep='' ) 8 | oFq1 <- paste0( pref_lis$V1[i], '_trimAT.fq', sep='' ) 9 | adp1 <- 'AAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA' 10 | myseq <- adp1 11 | myseq_comp <- chartr('ATGC', 'TACG', myseq) 12 | substr( myseq[1], 1:nchar(myseq[1]), 1:nchar(myseq[1]) ) 13 | x <- strsplit( myseq_comp, '' ) 14 | x <- lapply(x, rev) 15 | myseq_revcomp <- sapply(x, paste, collapse='') 16 | adp1 <- c( adp1, myseq_revcomp ) 17 | clean.pe.fq.file( inFqName1= inFq1, outFqName1= oFq1, adaptor1= adp1, RdPerYield= 10e6 , qual.opts=list( min.qual=0 ), use.right=TRUE ) 18 | } 19 | 20 | -------------------------------------------------------------------------------- /gm_tools/tomato_loc_V2p4_to_V2p5.pl: -------------------------------------------------------------------------------- 1 | #!/usr/bin/perl -w 2 | use strict; 3 | use warnings; 4 | use fileSunhh; 5 | use mathSunhh; 6 | my $ms_obj = mathSunhh->new(); 7 | use LogInforSunhh; 8 | 9 | !@ARGV and die "perl $0 in_old.agp in_new.agp loci_list\n"; 10 | 11 | my $fn_oldAGP = shift; 12 | my $fn_newAGP = shift; 13 | my $fn_loci = shift; 14 | 15 | my %old_c2s = %{ &fileSunhh::load_agpFile( $fn_oldAGP ) }; 16 | my %new_c2s = %{ &fileSunhh::load_agpFile( $fn_newAGP ) }; 17 | 18 | my %old_s2c = %{ &fileSunhh::reverse_agpHash(\%old_c2s) }; 19 | for my $sID (keys %old_s2c) { 20 | @{$old_s2c{ $sID }} = sort { $a->[0] <=> $b->[0] || $a->[1] <=> $b->[1] } @{$old_s2c{ $sID }}; 21 | } 22 | 23 | my @aa_loci = &fileSunhh::load_tabFile( $fn_loci , 1 ); 24 | for my $a1 (@aa_loci) { 25 | @$a1 == 0 and do { print "chr\tpos\tstr\n"; next; }; 26 | $a1->[0] =~ m!^\s*#! 
and do { print join("\t", @$a1, qw/chr pos str/)."\n"; next; }; 27 | my ($old_scfID, $old_scfPos) = ($a1->[0], $a1->[1]); 28 | my @new_scfInf = $ms_obj->transfer_position( 'from_ref2qry' => \%old_s2c, 'to_qry2ref' => \%new_c2s, 'fromLoc' => [$old_scfID, $old_scfPos] ); 29 | print join("\t", $old_scfID, $old_scfPos, $new_scfInf[0], $new_scfInf[1], $new_scfInf[2])."\n"; 30 | } 31 | 32 | -------------------------------------------------------------------------------- /log_tools/filt_bwa_log.pl: -------------------------------------------------------------------------------- 1 | #!/usr/bin/perl 2 | use strict; 3 | use warnings; 4 | 5 | my @pat_to_rm = ('^\[bwa_aln_core\]', '^\[infer_isize\]', '^\[bwa_sai2sam_pe_core\]', '\[bwa_paired_sw\]'); 6 | 7 | my @pat_use; 8 | for (@pat_to_rm) { 9 | push(@pat_use, qr/$_/s ); 10 | } 11 | 12 | -t and !@ARGV and die "perl $0 log.bwa\nTo skip patterns : @pat_use\n"; 13 | 14 | 15 | while (<>) { 16 | my $is_skip = 0; 17 | for my $qPat (@pat_use) { 18 | m!$qPat! and do { $is_skip = 1; last; }; 19 | } 20 | $is_skip or print; 21 | } 22 | 23 | -------------------------------------------------------------------------------- /log_tools/infor_ndupB.pl: -------------------------------------------------------------------------------- 1 | #!/usr/bin/perl 2 | use strict; 3 | use warnings; 4 | use LogInforSunhh; 5 | 6 | print STDOUT join("\t", qw/Prefix Raw_RdPairs Kept_RdPairs/)."\n"; 7 | my %infor; 8 | while (<>) { 9 | m!\[Rec\]! or next; 10 | if ( m!\[Rec\] There are (\d+) read pairs in total \[([^\[\]\s]+)\]! ) { 11 | my %tmp; 12 | $tmp{total_rdPN} = $1; 13 | $tmp{pref} = $2; 14 | if ( defined $infor{total_rdPN} and $infor{total_rdPN} ne '' ) { 15 | print STDOUT join("\t", @infor{qw/pref total_rdPN kept_rdPN/})."\n"; 16 | %infor = (); 17 | } 18 | %infor = %tmp; 19 | } elsif ( m!\[Rec\] There are .+ (\d+) \([\d.]+\%\) reads kept in both! 
) { 20 | $infor{kept_rdPN} = $1; 21 | $infor{kept_perC} = $2; 22 | } else { 23 | } 24 | } 25 | 26 | if ( defined $infor{total_rdPN} and $infor{total_rdPN} ne '' ) { 27 | print STDOUT join("\t", @infor{qw/pref total_rdPN kept_rdPN/})."\n"; 28 | %infor = (); 29 | } 30 | 31 | 32 | -------------------------------------------------------------------------------- /pcr_tools/cmd_list: -------------------------------------------------------------------------------- 1 | # Design primers for SNP list (site.list) 2 | perl pcr_tools/retrieve_template_forSNP.pl site.list -out site -ref_fa /Data/Sunhh/marker_design/20210125_mitoMarker/db/CM3.5.1_wiMtCtDNA.fa -min_flank_size 50 3 | perl pcr_tools/run_primer_forSNP.pl -in_tempTab site.tempX.tab -out_prefix site # Got site.primer.tab; 4 | 5 | # Check false priming 6 | less -S site.primer.tab | perl -e 'while (<>) { chomp; $. == 1 and next; my @ta=split(/\t/, $_); print ">$ta[1]_f\n$ta[2]\n>$ta[1]_r\n$ta[3]\n"; }' > chk1.fa 7 | makeblastdb -in /Data/Sunhh/marker_design/20210125_mitoMarker/db/CM3.5.1_wiMtCtDNA.fa -dbtype nucl 8 | bn6 -evalue 5000 -db /Data/Sunhh/marker_design/20210125_mitoMarker/db/CM3.5.1_wiMtCtDNA.fa -task blastn-short -num_threads 50 -query chk1.fa -out chk1.fa.bn6 9 | perl /home/Sunhh/tools/github/NGS_data_processing/pcr_tools/get_priming_loc_bnV2_2_24.pl chk1.fa.bn6 > chk1.fa.bn6.primer_loc 10 | deal_table.pl site.primer.tab -column 1,0,2,4,3,5,15,9,12 > a1 11 | cat chk1.fa.bn6.primer_loc | deal_table.pl -col_uniq 0 > a2 12 | ColLink.pl a1 -f1 a2 -sign 'ok,multi' -add | ColLink.pl -f1 site.list -keyC1 0 -keyC2 1 -add -Col1 2,3,4,5 > a3 13 | 14 | -------------------------------------------------------------------------------- /pcr_tools/site.lis: -------------------------------------------------------------------------------- 1 | Site_ID chr_ID chr_Position Base_Ref Base_indv1 Base_indv2 2 | Site.HS200616.01 chr1 3605732 T TAGACTTTACTAAACGATC T 3 | Site.HS200616.02 chr3 3103223 T TCTAAAAAGTATTCATGTA T 4 | 
Site.HS200616.03 chr3 4583857 GTTTAAATGTTTTAGCA G GTTTAAATGTTTTAGCA 5 | Site.HS200616.04 chr3 6655932 T T TCCAAAAC 6 | Site.HS200616.05 chr4 28488957 CACCATCACCATA CACCATCACCATA C 7 | Site.HS200616.06 chr6 17888214 GAAAGGGGCGAA GAAAGGGGCGAA G 8 | Site.HS200616.07 chr8 6362269 A A AACTAATTT 9 | Site.HS200616.08 chr8 20036052 ATAGTGTTGTTATGTCC ATAGTGTTGTTATGTCC A 10 | Site.HS200616.09 chr9 2137202 T TGGCTAC T 11 | Site.HS200616.10 chr10 7249290 C C CTTCTTCT 12 | Site.HS200616.11 chr11 10333003 A A ATAAGAAGAACTATC 13 | Site.HS200616.12 chr12 24014358 TGAGTGAGTGAGA T TGAGTGAGTGAGA 14 | -------------------------------------------------------------------------------- /plotting/example_data/65K_DEL-3class: -------------------------------------------------------------------------------- 1 | Group Single copy Two copies 2 | cultivar 354 154 3 | landrace 91 15 4 | wild 239 0 5 | -------------------------------------------------------------------------------- /plotting/example_data/FleshBrix_19YQ_1: -------------------------------------------------------------------------------- 1 | 8.2 2 | 9.4 3 | 9.4 4 | 9.2 5 | 9.8 6 | 8.6 7 | 9.8 8 | 6.4 9 | 10.6 10 | 9.3 11 | 11.4 12 | 9.1 13 | 10.3 14 | 7.2 15 | 8.7 16 | 9.3 17 | 8.7 18 | 7.6 19 | 5.1 20 | 7.2 21 | 7.3 22 | 8.2 23 | 5.7 24 | -------------------------------------------------------------------------------- /plotting/example_data/FleshBrix_19YQ_2: -------------------------------------------------------------------------------- 1 | 9.6 2 | 11.4 3 | 11.1 4 | 10.3 5 | 9.5 6 | 9.8 7 | 7.4 8 | 10.2 9 | -------------------------------------------------------------------------------- /plotting/example_data/FleshBrix_22HN_1: -------------------------------------------------------------------------------- 1 | 10.3 2 | 10.7 3 | 10 4 | 9 5 | 10.2 6 | 9.5 7 | 11.8 8 | 12 9 | 11.3 10 | 10.2 11 | 11.3 12 | 11.5 13 | 10.3 14 | 10.7 15 | 11.3 16 | 11.2 17 | 9 18 | 12 19 | 11.5 20 | 11.3 21 | 10 22 | 11.8 23 | 9 24 | 9 25 | 11 26 | 10.3 27 
| 9.8 28 | 10.3 29 | 11.8 30 | 8.5 31 | 10.5 32 | 9.8 33 | 12.2 34 | 7.7 35 | 11.3 36 | 11.2 37 | 11.3 38 | 9 39 | 11.5 40 | 4.3 41 | 10.7 42 | 10.7 43 | 12 44 | 10.8 45 | 10 46 | 6 47 | 9 48 | 13.2 49 | 10.7 50 | 11.3 51 | 11.7 52 | 3.3 53 | 10.3 54 | 4.3 55 | 8.7 56 | 6.8 57 | 8 58 | 5.7 59 | -------------------------------------------------------------------------------- /plotting/example_data/FleshBrix_22HN_2: -------------------------------------------------------------------------------- 1 | 11.8 2 | 10.7 3 | 11 4 | 11.3 5 | 13 6 | 10.8 7 | 12 8 | 11.3 9 | 9.7 10 | 11.3 11 | 11.8 12 | 12.8 13 | 11.5 14 | 11.5 15 | 10.3 16 | 9.8 17 | 9.2 18 | 9.3 19 | 11.7 20 | 12.8 21 | 10.5 22 | 12.2 23 | -------------------------------------------------------------------------------- /plotting/example_data/in.table-for_grouped_barplot_with_SD: -------------------------------------------------------------------------------- 1 | Individual Group Rep. 1 Rep. 2 Rep. 3 2 | RNAi Red 16.85 18.2 21.47 3 | WT Red 32.75 38.36 34.5 4 | OE Red 44.08 49.69 50.03 5 | RNAi Yellow 5.68 8.17 6.57 6 | WT Yellow 15.44 12.39 17.17 7 | OE Yellow 21.06 20.85 23.64 8 | -------------------------------------------------------------------------------- /project/watermelon_pan_phaseI/cmd_list: -------------------------------------------------------------------------------- 1 | # Write command line guide later. 2 | 3 | -------------------------------------------------------------------------------- /project/watermelon_pan_phaseI/cnt_CDS_dup.pl: -------------------------------------------------------------------------------- 1 | #!/usr/bin/perl 2 | use strict; 3 | use warnings; 4 | 5 | !@ARGV and die "perl $0 in.bn6 > in.bn6.ident_filteredNum\n"; 6 | 7 | my @lvl = reverse(0 .. 
100); 8 | 9 | my %h; 10 | while (<>) { 11 | chomp; 12 | my @ta=split(/\t/, $_); 13 | my $glob_matN = $ta[2]*$ta[3]; 14 | for my $l1 (@lvl) { 15 | $glob_matN >= $ta[12] * $l1 or next; 16 | $h{$l1}{$ta[0]} ++; 17 | } 18 | } 19 | for my $l1 (@lvl) { 20 | $h{$l1} //= {}; 21 | my $cnt = scalar(keys %{$h{$l1}}); 22 | print "$l1\t$cnt\n"; 23 | } 24 | 25 | # 26 | # [Sunhh@panda rmTE]$ head -3 1_maker_novCleanRmTEcomplete.c.fa.toRef.bn6 27 | # snap-NODE_38989__1_2429_ext-processed-gene-0.1-mRNA-1 Cla97C10G191640.1 93.83 81 3 2 439 517 295 375 5e-27 121 573 375 plus 28 | # snap-NODE_38989__1_2429_ext-processed-gene-0.1-mRNA-1 Cla97C07G134700.1 93.83 81 3 2 439 517 295 375 5e-27 121 573 375 plus 29 | # snap-NODE_38989__1_2429_ext-processed-gene-0.1-mRNA-1 Cla97C07G133870.1 94.52 73 2 2 439 509 295 367 3e-24 111 573 462 plus 30 | # 31 | -------------------------------------------------------------------------------- /project/watermelon_pan_phaseI/cnvt_ext2nov_to_agp.pl: -------------------------------------------------------------------------------- 1 | #!/usr/bin/perl 2 | use strict; 3 | use warnings; 4 | 5 | !@ARGV and die "perl $0 CM.ext2nov.tbl > CM.ext2nov.agp\n"; 6 | 7 | while (<>) { 8 | chomp; 9 | my @ta=split(/\t/, $_); 10 | print join("\t", $ta[4], 1, $ta[2]-$ta[1]+1, 1, "W", $ta[0], $ta[1], $ta[2], $ta[3])."\n"; 11 | } 12 | 13 | #==> CM.ori2ext.tbl <== 14 | #NODE_1015 1 47571 + NODE_1015__1_47571_ext WM1147_PI164248 15 | #NODE_10412 1 10272 + NODE_10412__1_10272_ext WM1147_PI164248 16 | #NODE_10828 1 9780 + NODE_10828__1_9780_ext WM1147_PI164248 17 | # 18 | #==> CM.ext2nov.tbl <== 19 | #NODE_1015__1_47571_ext 16875 17446 + WM1147_PI164248_NODE_1015_16875-17446 WM1147_PI164248 20 | #NODE_10412__1_10272_ext 5450 6631 + WM1147_PI164248_NODE_10412_5450-6631 WM1147_PI164248 21 | #NODE_10828__1_9780_ext 352 8977 + WM1147_PI164248_NODE_10828_352-8977 WM1147_PI164248 22 | # AGP : WM97pbV1_Chr06 1 29507460 1 W ClaScf_0005 1 29507460 - Scaffold5 23 | 24 | 
-------------------------------------------------------------------------------- /project/watermelon_pan_phaseI/extract_N50.pl: -------------------------------------------------------------------------------- 1 | #!/usr/bin/perl 2 | use strict; 3 | use warnings; 4 | 5 | !@ARGV and die "perl $0 in.fa.N50 in_2.fa.N50 > out.table\n"; 6 | 7 | my @oKey = qw/asm_size asm_num longest asm_N25 asm_N50 asm_N90 asm_N95 asm_N99 shortest gnm_size GN90 GN50/; 8 | print join("\t", 'accession', @oKey)."\n"; 9 | for my $fn (@ARGV) { 10 | my %h; 11 | open F,'<',"$fn" or die; 12 | while (<F>) { 13 | m!^Total sequences number.*:\s*(\d+)! and $h{'asm_num'} //= $1; 14 | m!^Total sequences bp \(ATGC\):\s*(\d+)! and $h{'asm_size'} //= $1; 15 | m!^Maximum length \(ATGC\)\s*:\s*(\d+)! and $h{'longest'} //= $1; 16 | m!^Minimum length \(ATGC\)\s*:\s*(\d+)! and $h{'shortest'} //= $1; 17 | m!^N25.+:\s*(\d+)! and $h{'asm_N25'} //= $1; 18 | m!^N50.+:\s*(\d+)! and $h{'asm_N50'} //= $1; 19 | m!^N90.+:\s*(\d+)! and $h{'asm_N90'} //= $1; 20 | m!^N95.+:\s*(\d+)! and $h{'asm_N95'} //= $1; 21 | m!^N99.+:\s*(\d+)! and $h{'asm_N99'} //= $1; 22 | m!^Est\. Genome size\s*:\s*(\d+)! and $h{'gnm_size'} //= $1; 23 | m!^NG50.+:\s*(\d+)! and $h{'GN50'} //= $1; 24 | m!^NG90.+:\s*(\d+)! 
and $h{'GN90'} //= $1; 25 | } 26 | close F; 27 | $h{'gnm_size'} == 0 and $h{'gnm_size'} = 'NA'; 28 | for my $k1 (@oKey) { 29 | $h{$k1} //= 'NA'; 30 | } 31 | print join("\t", $fn, @h{@oKey})."\n"; 32 | } 33 | 34 | -------------------------------------------------------------------------------- /project/watermelon_pan_phaseI/list_IPRacc.pl: -------------------------------------------------------------------------------- 1 | #!/usr/bin/perl 2 | use strict; 3 | use warnings; 4 | 5 | -t and !@ARGV and die "perl $0 ipr_all6_tsv.TEprot.IPRacc > ipr_all6_tsv.TEprot.IPRacc.line\n"; 6 | 7 | my %h; 8 | while (<>) { 9 | chomp; 10 | my @ta=split(/\t/, $_); 11 | $h{$ta[0]}{'ipr'}{$ta[11]} = $ta[12]; 12 | } 13 | for my $id1 (sort keys %h) { 14 | my @k1 = sort keys %{$h{$id1}{'ipr'}}; 15 | my @v1 = @{$h{$id1}{'ipr'}}{@k1}; 16 | print join("\t", $id1, join(";;", @k1), join(";;", @v1))."\n"; 17 | } 18 | -------------------------------------------------------------------------------- /project/watermelon_pan_phaseI/pipe_for_functional_annotation.pl: -------------------------------------------------------------------------------- 1 | /home/Sunhh/tools/github/NGS_data_processing/annot_tools/ahrd/pipe_for_functional_annotation.pl -------------------------------------------------------------------------------- /project/watermelon_pan_phaseI/ret_maker_abinit_gff3.pl: -------------------------------------------------------------------------------- 1 | ../../annot_tools/maker/ret_maker_abinit_gff3.pl -------------------------------------------------------------------------------- /project/watermelon_pan_phaseI/rm_Qloc_only_groups.pl: -------------------------------------------------------------------------------- 1 | #!/usr/bin/perl 2 | use strict; 3 | use warnings; 4 | 5 | -t and !@ARGV and die "perl $0 ov1/comb.grp2.novl_loc.syn.grp_tbl > ov1/comb.grp2.novl_loc.syn.grp_tbl.filtered\n"; 6 | 7 | while (<>) { 8 | chomp; 9 | my @ta=split(/\t/, $_); 10 | my $is_good = 0; 11 | for my $tb 
(@ta[2..$#ta]) { 12 | $tb =~ m!^\S+:\d+\-\d+:[+-]$! and next; 13 | $is_good = 1; 14 | } 15 | $is_good == 1 or next; 16 | print "$_\n"; 17 | } 18 | 19 | # cat ov1/comb.grp2.novl_loc.syn.grp_tbl | perl -e 'while (<>) { chomp; m!\t\S+:\d+\-\d+:[+-](\t|$)! or next; m!\tC.pan:[^\s:]+(\t|$)! and next; print "$_\n"; }' | wc -l 20 | # GrpSyn_000002 21 CApan:CaUC03G061460.1 CApan:CaUC03G061520.1 CApan:CaUC03G061550.1 CApan:CaUC03G061560.1 CApan:Ciama_Chr03:24544226-24545011:+ CApan:Ciama_Chr03:30256016-30256508:+ 21 | # GrpSyn_000003 19 CApan:CaUC10G186420.1 CApan:CaUC10G186430.1 CApan:Ciama_Chr10:12115704-12116238:+ CLpan:Cla97C10G191540.1 CLpan:Cla97C10G191550.1 CLpan:Cla97C10G191553.1 CLpan:Cla97C10G191557.1 22 | 23 | -------------------------------------------------------------------------------- /reseq_tools/C_exe/maskClose_in_1col: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Sunhh/NGS_data_processing/7c202e9c150708c32c2d2a852bb7925adceb70f7/reseq_tools/C_exe/maskClose_in_1col -------------------------------------------------------------------------------- /reseq_tools/C_exe/rmSameSite: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Sunhh/NGS_data_processing/7c202e9c150708c32c2d2a852bb7925adceb70f7/reseq_tools/C_exe/rmSameSite -------------------------------------------------------------------------------- /reseq_tools/SNP_effect.pl: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Sunhh/NGS_data_processing/7c202e9c150708c32c2d2a852bb7925adceb70f7/reseq_tools/SNP_effect.pl -------------------------------------------------------------------------------- /reseq_tools/SNP_effect_edit.pl: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/Sunhh/NGS_data_processing/7c202e9c150708c32c2d2a852bb7925adceb70f7/reseq_tools/SNP_effect_edit.pl -------------------------------------------------------------------------------- /reseq_tools/bsa/example_data/template-QTLseqr_result.xlsx: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Sunhh/NGS_data_processing/7c202e9c150708c32c2d2a852bb7925adceb70f7/reseq_tools/bsa/example_data/template-QTLseqr_result.xlsx -------------------------------------------------------------------------------- /reseq_tools/bsa/scripts/slct_sites_by_windows.pl: -------------------------------------------------------------------------------- 1 | #!/usr/bin/perl 2 | use strict; 3 | use warnings; 4 | use fileSunhh; 5 | 6 | !@ARGV and die "perl $0 position_list site.al\n"; 7 | 8 | my $fn1 = shift; # windows; 9 | my $fn2 = shift; # sites; 10 | 11 | my $fh1 = &openFH($fn1, '<'); 12 | my %required; 13 | while (<$fh1>) { 14 | chomp; 15 | my @ta=split(/\t/, $_); 16 | push(@{$required{$ta[0]}}, [@ta[1,2]]); 17 | } 18 | close($fh1); 19 | for my $k1 (keys %required) { 20 | @{$required{$k1}} = sort { $a->[0] <=> $b->[0] || $a->[1] <=> $b->[1] } @{$required{$k1}}; 21 | } 22 | 23 | my $fh2 = &openFH($fn2, '<'); 24 | while (<$fh2>) { 25 | chomp; 26 | my @ta=split(/\t/, $_); 27 | if ($. 
== 1 and $ta[0] =~ m!^(chrom$|chr$)!i and $ta[1] =~ m!^(pos$|position$)!i) { 28 | print STDOUT "$_\n"; 29 | next; 30 | } 31 | defined $required{$ta[0]} or next; 32 | my $is=0; 33 | for my $a1 (@{$required{$ta[0]}}) { 34 | $a1->[0] > $ta[1] and last; 35 | $a1->[1] < $ta[1] and next; 36 | $is = 1; 37 | last; 38 | } 39 | $is == 1 and print STDOUT "$_\n"; 40 | 41 | } 42 | close($fh2); 43 | 44 | -------------------------------------------------------------------------------- /reseq_tools/cnt_genotype_in_1col.pl: -------------------------------------------------------------------------------- 1 | #!/usr/bin/perl 2 | use strict; 3 | use warnings; 4 | use fileSunhh; 5 | use LogInforSunhh; 6 | 7 | -t and !@ARGV and &LogInforSunhh::usage("\nperl $0 sample.1col > sample.1col.typeC\n\n"); 8 | 9 | my $fh = \*STDIN; 10 | 11 | @ARGV > 0 and $fh = &openFH($ARGV[0], '<'); 12 | 13 | my $head = <$fh>; 14 | my %cnt; 15 | while (<$fh>) { 16 | m/^\s*$/ and next; 17 | $cnt{'total'} ++; 18 | if ( m/^([ATGC])\1*$/ ) { 19 | $cnt{"HomoBase"} ++; 20 | } elsif ( m/^N+$/i ) { 21 | $cnt{"N"} ++; 22 | } elsif ( m/^[ATGC][ATGC]$/ ) { 23 | $cnt{"DiHete"} ++; 24 | } elsif ( m/^[ATGC\*][ATGC\*]$/ ) { 25 | $cnt{'heteDel'} ++; 26 | } elsif ( m/^\*+$/ ) { 27 | $cnt{"homoDel"} ++; 28 | } elsif ( m/^[ATGCN]\+/) { 29 | $cnt{"homoIns"} ++; 30 | } elsif ( m/^[ATGCN*][ATGCN]+/ ) { 31 | $cnt{'heteIns'} ++; 32 | } else { 33 | $cnt{"Other"} ++; 34 | } 35 | } 36 | 37 | chomp($head); 38 | print join("\t", 'Type', $head)."\n"; 39 | for (qw/total N HomoBase homoDel heteDel homoIns heteIns DiHete Other/) { 40 | $cnt{$_} //= 0; 41 | print join("\t", $_, $cnt{$_})."\n"; 42 | } 43 | -------------------------------------------------------------------------------- /reseq_tools/cnt_pileup_depC.pl: -------------------------------------------------------------------------------- 1 | #!/usr/bin/perl 2 | use strict; 3 | use warnings; 4 | 5 | use LogInforSunhh; 6 | 7 | for (@ARGV) { 8 | &exeCmd("awk '\$3 != \"N\" \&\& \$1 != 
\"CG_Chr00\" { print \$4 }' $_ | deal_table.pl -col_repCount 0 | deal_table.pl -col_sort 1 > $_.chr.noN.depC"); 9 | } 10 | -------------------------------------------------------------------------------- /reseq_tools/cnvt_tools/cols2LD.pl: -------------------------------------------------------------------------------- 1 | #!/usr/bin/perl 2 | use strict; 3 | use warnings; 4 | use LogInforSunhh; 5 | 6 | my $pl_dir = '/data/Sunhh/watermelon/source_reseq/new_source/04.LD/hap_LD'; 7 | my $pl_rmSame = "$pl_dir/" . 'rm_same_site_hete2N.pl'; 8 | my $pl_mkHap = "$pl_dir/" . 'tbl2hap.pl'; 9 | my $pl_binLD = "$pl_dir/" . 'get_the_LD_decay_file.pl'; 10 | 11 | !@ARGV and die "perl $0 in.snp\n"; 12 | 13 | my $snpF = shift; 14 | 15 | &exeCmd_1cmd("perl $pl_rmSame $snpF > $snpF.var"); 16 | &exeCmd_1cmd("perl $pl_mkHap $snpF.var $snpF.ped $snpF.info"); 17 | &exeCmd_1cmd("java -Xmx20960M -jar /data/Sunhh/src/Evolution/haploview/Haploview4.1.jar -nogui -minMAF 0.05 -hwcutoff 0.001 -dprime -log $snpF.log -out $snpF -pedfile $snpF.ped -info $snpF.info"); 18 | &exeCmd_1cmd("perl $pl_binLD $snpF.LD 1000 $snpF.LD_bin1k"); 19 | 20 | &tsmsg("[Rec] Done.\n"); 21 | 22 | -------------------------------------------------------------------------------- /reseq_tools/cnvt_tools/cols2meg.pl: -------------------------------------------------------------------------------- 1 | use strict; 2 | use warnings; 3 | use SNP_tbl; 4 | 5 | !@ARGV and die "perl $0 in.snp\n"; 6 | 7 | my $inF = shift; 8 | my $st = SNP_tbl->new(filename=>$inF); 9 | $st->readTbl(); 10 | $st->tbl2meg(ofile=>"$inF.meg"); 11 | 12 | 13 | -------------------------------------------------------------------------------- /reseq_tools/cnvt_tools/fas2meg.pl: -------------------------------------------------------------------------------- 1 | #!/usr/bin/perl 2 | use strict; 3 | use warnings; 4 | use fastaSunhh; 5 | my $fs_obj = fastaSunhh->new(); 6 | use LogInforSunhh; 7 | 8 | !@ARGV and die "perl $0 in.fas > out.meg\n"; 9 | 10 | my $file = 
shift; 11 | my %s2h = %{ $fs_obj->save_seq_to_hash( 'faFile'=>$file, 'has_head'=>1 ) }; 12 | print STDOUT < $s2h{$b}{'Order'} } keys %s2h) { 19 | $s2h{$tk}{'seq'} =~ s/\s//gs; 20 | $s2h{$tk}{'seq'} =~ s/(.{100})/$1\n/g; chomp( $s2h{$tk}{'seq'} ); 21 | print STDOUT "#$s2h{$tk}{'key'}\n$s2h{$tk}{'seq'}\n"; 22 | } 23 | 24 | -------------------------------------------------------------------------------- /reseq_tools/cnvt_tools/qopt2shrtStructure.pl: -------------------------------------------------------------------------------- 1 | #!/usr/bin/perl 2 | use strict; 3 | use warnings; 4 | 5 | -t and !@ARGV and die "perl $0 usetax_var_K_2.qopt > usetax_var_K_2.qopt_f\n"; 6 | 7 | 8 | while (<>) { 9 | chomp; 10 | my @ta = split(/\s+/, $_); 11 | print join(" ", $., $., "(0)", 1, ":" , @ta)."\n"; 12 | } 13 | 14 | 15 | -------------------------------------------------------------------------------- /reseq_tools/cnvt_tools/tbl2LD.pl: -------------------------------------------------------------------------------- 1 | #!/usr/bin/perl 2 | use strict; 3 | use warnings; 4 | use LogInforSunhh; 5 | 6 | my $pl_dir = '/data/Sunhh/watermelon/source_reseq/new_source/04.LD/hap_LD'; 7 | my $pl_rmSame = "$pl_dir/" . 'rm_same_site_hete2N.pl'; 8 | my $pl_mkHap = "$pl_dir/" . 'tbl2hap.pl'; 9 | my $pl_binLD = "$pl_dir/" . 
'get_the_LD_decay_file.pl'; 10 | 11 | !@ARGV and die "perl $0 in.snp\n"; 12 | 13 | my $snpF = shift; 14 | 15 | &exeCmd_1cmd("perl $pl_rmSame $snpF > $snpF.var"); 16 | &exeCmd_1cmd("perl $pl_mkHap $snpF.var $snpF.ped $snpF.info"); 17 | &exeCmd_1cmd("java -Xmx20960M -jar /data/Sunhh/src/Evolution/haploview/Haploview4.1.jar -nogui -minMAF 0.05 -hwcutoff 0.001 -dprime -log $snpF.log -out $snpF -pedfile $snpF.ped -info $snpF.info"); 18 | &exeCmd_1cmd("perl $pl_binLD $snpF.LD 1000 $snpF.LD_bin1k"); 19 | 20 | &tsmsg("[Rec] Done.\n"); 21 | 22 | -------------------------------------------------------------------------------- /reseq_tools/cnvt_tools/tbl2meg.pl: -------------------------------------------------------------------------------- 1 | use strict; 2 | use warnings; 3 | use SNP_tbl; 4 | 5 | !@ARGV and die "perl $0 in.snp\n"; 6 | 7 | my $inF = shift; 8 | my $st = SNP_tbl->new(filename=>$inF); 9 | $st->readTbl(); 10 | $st->tbl2meg(ofile=>"$inF.meg"); 11 | 12 | 13 | -------------------------------------------------------------------------------- /reseq_tools/example_data/list.long_deletions: -------------------------------------------------------------------------------- 1 | 1_608963_v75845_65251 CLV01_Chr01 608964 674214 2 | 1_671689_v81114_65251 CLV01_Chr01 671690 736940 3 | 1_8330539_v1378562_21344 CLV01_Chr01 8330540 8351883 4 | 1_8341886_v1380573_21684 CLV01_Chr01 8341887 8363570 5 | 1_9330197_v1526693_21740 CLV01_Chr01 9330198 9351937 6 | 1_9437841_v1543167_24382 CLV01_Chr01 9437842 9462223 7 | 1_11693114_v1846227_70613 CLV01_Chr01 11693115 11763727 8 | 1_11718294_v1848597_23199 CLV01_Chr01 11718295 11741493 9 | 1_11719323_v1848659_35854 CLV01_Chr01 11719324 11755177 10 | 1_12085642_v1884138_23144 CLV01_Chr01 12085643 12108786 11 | -------------------------------------------------------------------------------- /reseq_tools/example_data/list.sample_bam: -------------------------------------------------------------------------------- 1 | ARO18917 
bam/ARO18917.dedup.bam 2 | ARO18920 bam/ARO18920.dedup.bam 3 | ARO19494 bam/ARO19494.dedup.bam 4 | ARO20587 bam/ARO20587.dedup.bam 5 | ARO21031 bam/ARO21031.dedup.bam 6 | ARO22357 bam/ARO22357.dedup.bam 7 | ARO22359 bam/ARO22359.dedup.bam 8 | ARO23071 bam/ARO23071.dedup.bam 9 | ARO23967 bam/ARO23967.dedup.bam 10 | bulldog2017 bam/bulldog2017.dedup.bam 11 | -------------------------------------------------------------------------------- /reseq_tools/example_data/out_geno-mat.tab: -------------------------------------------------------------------------------- 1 | Sample 1_608963_v75845_65251 1_671689_v81114_65251 1_8330539_v1378562_21344 1_8341886_v1380573_21684 1_9330197_v1526693_21740 1_9437841_v1543167_24382 1_11693114_v1846227_70613 1_11718294_v1848597_23199 1_11719323_v1848659_35854 1_12085642_v1884138_23144 2 | ARO18917 1/1 0/0 0/0 0/0 0/0 0/0 1/1 1/1 1/1 1/1 3 | ARO18920 1/1 0/0 0/0 0/0 0/0 0/0 1/1 1/1 1/1 1/1 4 | ARO19494 1/1 0/0 0/0 0/0 0/0 0/0 1/1 1/1 1/1 1/1 5 | ARO20587 1/1 0/0 0/0 0/0 0/0 0/0 1/1 1/1 0/0 1/1 6 | ARO21031 1/1 0/0 0/0 0/0 0/0 0/0 1/1 1/1 0/0 1/1 7 | ARO22357 1/1 0/0 1/1 0/0 0/0 0/0 1/1 1/1 0/0 1/1 8 | ARO22359 1/1 0/0 1/1 0/0 0/0 0/0 1/1 1/1 1/1 1/1 9 | ARO23071 1/1 0/0 0/0 0/0 0/0 0/0 1/1 1/1 1/1 1/1 10 | ARO23967 1/1 0/0 0/0 0/0 0/0 0/0 1/1 1/1 1/1 1/1 11 | bulldog2017 1/1 0/0 0/0 0/0 0/0 0/0 1/1 1/1 0/0 0/0 12 | -------------------------------------------------------------------------------- /reseq_tools/extract_pileup.pl: -------------------------------------------------------------------------------- 1 | #!/usr/bin/perl -w 2 | use strict; 3 | 4 | !@ARGV and die "perl $0 fsadf\n"; 5 | 6 | for (`ls *.pileup`) { 7 | chomp; 8 | print "Start $_;" . scalar(localtime()) . " \n"; 9 | system "uniqComb.pl $_ -index ../basic_SNP_WM97toPI -col 0,1 -newCol 0,1 -exist > $_.basic"; 10 | print "End $_;" . scalar(localtime()) . 
"\n"; 11 | } 12 | print "All extractions are over.\n"; 13 | 14 | -------------------------------------------------------------------------------- /reseq_tools/filter_tools/cnt_depth/sumDepBySite.pl: -------------------------------------------------------------------------------- 1 | #!/usr/bin/perl 2 | use strict; 3 | use warnings; 4 | 5 | while (<>) { 6 | chomp; 7 | my @ta = split(/\t/, $_); 8 | if ($ta[0] eq '#CHROM') { 9 | print join("\t", @ta[0,1], 'DP_SUM')."\n"; 10 | next; 11 | } 12 | my $sumDepth = 0; 13 | for my $cnt (@ta[2 .. $#ta]) { 14 | $cnt eq '.' and next; 15 | $sumDepth += $cnt; 16 | } 17 | print join("\t", @ta[0,1], $sumDepth)."\n"; 18 | } 19 | -------------------------------------------------------------------------------- /reseq_tools/fst/extract_top1_fst_site.R: -------------------------------------------------------------------------------- 1 | # ==> set01_13_to_11.fst.perSiteChrPos <== 2 | # chr pos Ho Hs Ht Dst Htp Dstp Fst Fstp Fis Dest 3 | # chr10 467 0 0 0 0 0 0 NA NA NA 0 4 | 5 | argvs <- commandArgs( trailingOnly=TRUE ) ; 6 | fn <- argvs[1] # in.fst.perSiteChrPos 7 | topR <- as.numeric( argvs[2] ) # 0.01 8 | if (topR > 1) { 9 | print (topR) 10 | quit() 11 | } 12 | 13 | aa <- read.table( file=fn, header=T, stringsAsFactors=F ) 14 | # Filter out organelles and NA sites. 15 | aa.kk <- aa$chr != "plast" & aa$chr != "mito" & !is.na(aa$Fst) & aa$Fst >= 0 16 | aa <- aa[ aa.kk, ] 17 | aa.kk <- NULL 18 | 19 | # Find the threshold of topR 20 | aa.qt <- quantile( aa$Fst, probs=c(0,0.5,0.95,1-topR,1) ) 21 | aa.thres <- aa.qt[4] 22 | cat("threshold for", topR, "is ", aa.thres, "\n") 23 | 24 | # Get the selected sites. 
25 | aa.slct <- aa[ aa$Fst >= aa.thres, ] 26 | write.table( aa.slct, file=paste0( fn, ".top", topR, sep=""), append=F, row.names=F, col.names=T, quote=F, sep="\t" ) 27 | 28 | 29 | -------------------------------------------------------------------------------- /reseq_tools/fst/get_stat.pl: -------------------------------------------------------------------------------- 1 | #!/usr/bin/perl 2 | use strict; 3 | use warnings; 4 | 5 | !@ARGV and die "perl $0 in.win.fst\n"; 6 | 7 | my $f = shift; 8 | 9 | system "awk ' NR > 1 && ( \$4 >= 50 && \$6 >= 0 ) ' $f | deal_table.pl -col_stat 5 -col_stat_AsINS | deal_table.pl -transpose > $f.statMean"; 10 | 11 | -------------------------------------------------------------------------------- /reseq_tools/fst/join_fst_siteChrPos.pl: -------------------------------------------------------------------------------- 1 | #!/usr/bin/perl 2 | use strict; 3 | use warnings; 4 | use LogInforSunhh; 5 | use fileSunhh; 6 | use Getopt::Long; 7 | my %opts; 8 | GetOptions(\%opts, 9 | "help!", 10 | "perWind!", 11 | "addSuff:s", '' 12 | ); 13 | $opts{'addSuff'} //= ''; 14 | 15 | my $help_txt = < merged.ChrPos 18 | 19 | -perWind [Bool] 20 | -addSuff [''] 21 | 22 | HH 23 | 24 | -t and !@ARGV and &LogInforSunhh::usage($help_txt); 25 | $opts{'help'} and &LogInforSunhh::usage($help_txt); 26 | 27 | my %used; 28 | while (my $fn = <>) { 29 | chomp($fn); 30 | my @ta = split(/\t/, $fn); 31 | my $fh = &openFH("$ta[0]$opts{'addSuff'}", '<'); 32 | while (<$fh>) { 33 | chomp; 34 | my @tb = split(/\t/, $_); 35 | my $tk = ($opts{'perWind'}) ? 
$tb[0] : "$tb[0]\t$tb[1]"; 36 | defined $used{$tk} and next; 37 | $used{$tk} = 1; 38 | print STDOUT "$_\n"; 39 | } 40 | close($fh); 41 | } 42 | 43 | -------------------------------------------------------------------------------- /reseq_tools/gatk/CatVariants.pl: -------------------------------------------------------------------------------- 1 | #!/usr/bin/perl 2 | use strict; 3 | use warnings; 4 | use fileSunhh; 5 | 6 | -t and !@ARGV and die "perl $0 sorted_GVCF_list\n"; 7 | 8 | my $has_header = 0; 9 | 10 | while (my $l = <>) { 11 | chomp($l); 12 | $l =~ m!^\s*($|#)! and next; 13 | my ($fn) = (&splitL("\t", $l))[0]; 14 | my $ifh = &openFH($fn, '<'); 15 | while (<$ifh>) { 16 | if (m!^#!) { 17 | $has_header == 1 and next; 18 | m!^#CHROM\t! and $has_header = 1; 19 | print STDOUT $_; 20 | next; 21 | } 22 | print STDOUT $_; 23 | } 24 | close($ifh); 25 | } 26 | -------------------------------------------------------------------------------- /reseq_tools/gatk/get_pass_vcf.pl: -------------------------------------------------------------------------------- 1 | #!/usr/bin/perl 2 | use strict; 3 | use warnings; 4 | use fileSunhh; 5 | 6 | -t and !@ARGV and die "gzip -cd lmyPM_filtV.vcf.gz | perl $0 > lmyPM_filtV_PASS.vcf\n"; 7 | 8 | while (<>) { 9 | if (m!^#!) 
{ 10 | print STDOUT $_; 11 | next; 12 | } 13 | chomp; 14 | my @ta = split(/\t/, $_); 15 | $ta[6] =~ m!^PASS$!i or next; 16 | print STDOUT "$_\n"; 17 | } 18 | 19 | -------------------------------------------------------------------------------- /reseq_tools/get_set3_varWiIndel.pl: -------------------------------------------------------------------------------- 1 | #!/usr/bin/perl 2 | use strict; 3 | use warnings; 4 | use LogInforSunhh; 5 | use Getopt::Long; 6 | my %opts; 7 | GetOptions(\%opts, 8 | "help!", 9 | "startColN:i", # 2 10 | "noHeader!", 11 | ); 12 | 13 | $opts{'startColN'} //= 2; 14 | 15 | sub usage { 16 | print STDERR < out_woIndel.snp 19 | 20 | -help 21 | -startColN [$opts{'startColN'}] 22 | -noHeader [Bool] 23 | 24 | HH 25 | exit(1); 26 | } 27 | 28 | -t and !@ARGV and &usage(); 29 | $opts{'help'} and &usage(); 30 | 31 | while (<>) { 32 | $. % 1e6 == 1 and &tsmsg("[Msg] Reading $. lines.\n"); 33 | s/[^\S\t]+$//; 34 | if ( $. == 1 and !$opts{'noHeader'} ) { 35 | print "$_\n"; 36 | next; 37 | } 38 | my @ta = split(/\t/, $_); 39 | my $has_indel = 0; 40 | for my $tb (@ta[ $opts{'startColN'} .. 
$#ta ]) { 41 | $tb =~ m/\*|\+/ and do { $has_indel = 1; last; }; 42 | } 43 | $has_indel == 1 and print "$_\n"; 44 | } 45 | 46 | 47 | -------------------------------------------------------------------------------- /reseq_tools/mao_exe/combine2PileFiles: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Sunhh/NGS_data_processing/7c202e9c150708c32c2d2a852bb7925adceb70f7/reseq_tools/mao_exe/combine2PileFiles -------------------------------------------------------------------------------- /reseq_tools/mao_exe/reSeqPrintRefChr: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Sunhh/NGS_data_processing/7c202e9c150708c32c2d2a852bb7925adceb70f7/reseq_tools/mao_exe/reSeqPrintRefChr -------------------------------------------------------------------------------- /reseq_tools/mao_exe/reSeqPrintSample.indel.fast: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Sunhh/NGS_data_processing/7c202e9c150708c32c2d2a852bb7925adceb70f7/reseq_tools/mao_exe/reSeqPrintSample.indel.fast -------------------------------------------------------------------------------- /reseq_tools/mao_exe/reSeqPrintSample.indel.fast.strAssign: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Sunhh/NGS_data_processing/7c202e9c150708c32c2d2a852bb7925adceb70f7/reseq_tools/mao_exe/reSeqPrintSample.indel.fast.strAssign -------------------------------------------------------------------------------- /reseq_tools/mao_exe/reSeqPrintSample.indel.fast.strAssign.moreHeter: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Sunhh/NGS_data_processing/7c202e9c150708c32c2d2a852bb7925adceb70f7/reseq_tools/mao_exe/reSeqPrintSample.indel.fast.strAssign.moreHeter 
-------------------------------------------------------------------------------- /reseq_tools/mao_exe/rmRedunSam2: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Sunhh/NGS_data_processing/7c202e9c150708c32c2d2a852bb7925adceb70f7/reseq_tools/mao_exe/rmRedunSam2 -------------------------------------------------------------------------------- /reseq_tools/mao_exe/rmRedunSam3: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Sunhh/NGS_data_processing/7c202e9c150708c32c2d2a852bb7925adceb70f7/reseq_tools/mao_exe/rmRedunSam3 -------------------------------------------------------------------------------- /reseq_tools/mask_weiredSNP.pl: -------------------------------------------------------------------------------- 1 | #!/usr/bin/perl 2 | use strict; 3 | use warnings; 4 | use LogInforSunhh; 5 | use Getopt::Long; 6 | my %opts; 7 | GetOptions(\%opts, 8 | "help!", 9 | "startColN:i", # 10 | ); 11 | 12 | my $geno_col = 2; 13 | 14 | sub usage { 15 | print STDERR < merged.snp.woWeired 20 | 21 | Please note that the geno_col is $geno_col 22 | And first line is not checked. 23 | 24 | HH 25 | exit(1); 26 | } 27 | 28 | !@ARGV and &usage(); 29 | 30 | my $l = <>; 31 | print $l; 32 | # print join("\t",qw/ChromID Pos GenoN HomoRatio HeteRatio/)."\n"; 33 | while (<>) { 34 | $. % 1e6 == 1 and &tsmsg("line $.\n"); 35 | chomp; 36 | my @ta = split(/\t/, $_); 37 | for my $tb ( @ta[$geno_col .. $#ta] ) { 38 | $tb =~ m/^[ATGC*N]$|^[ATGC]\+[ATGC]+$|^[ATGC*][ATGC*]$/ or $tb = "N"; 39 | # Sometimes there will be sth. like 'AG+AAA' heterozygous site, but I don't want it . 
40 | } 41 | print join("\t", @ta)."\n"; 42 | } 43 | -------------------------------------------------------------------------------- /reseq_tools/rename_plink_map.pl: -------------------------------------------------------------------------------- 1 | #!/usr/bin/perl 2 | use strict; 3 | use warnings; 4 | 5 | -t and !@ARGV and die "perl $0 plink_fmt.map plink_IDnamed.map\n"; 6 | 7 | 8 | my %h; 9 | while (<>) { 10 | chomp; 11 | my @ta=split(/\t/, $_); 12 | $ta[1] eq "." and $ta[1] = "$ta[0]_$ta[3]"; 13 | defined $h{$ta[1]} and die "[Err] Repeat site ID [$ta[1]]\n"; 14 | $h{$ta[1]} = 1; 15 | print join("\t", @ta)."\n"; 16 | } 17 | -------------------------------------------------------------------------------- /reseq_tools/rm_adjacent_sites.pl: -------------------------------------------------------------------------------- 1 | #!/usr/bin/perl 2 | use strict; 3 | use warnings; 4 | use LogInforSunhh; 5 | 6 | # Rules: 7 | # R1. Remove adjacent SNP sites within 5bp. 8 | 9 | my $within_dist = 5; 10 | 11 | my %prev; 12 | while (<>) { 13 | s/[^\t\S]+$//; 14 | my @ta = split(/\t/, $_); 15 | my ($chr, $pos, $refB) = @ta[0,1,2]; 16 | if ($chr eq 'chr') { 17 | print STDOUT "$_\n"; 18 | next; 19 | } 20 | 21 | # Rule 1: 22 | my %curr; 23 | $curr{'chr'} = $chr; 24 | $curr{'pos'} = $pos; 25 | $curr{line} = $_; 26 | $curr{is_good} = 1; 27 | if (scalar(keys %prev) == 0 or $prev{chr} ne $chr) { 28 | defined $prev{'is_good'} and $prev{'is_good'} == 1 and print STDOUT "$prev{'line'}\n"; 29 | } else { 30 | my $dist2prev = $pos - $prev{pos}+1; 31 | if ( $dist2prev <= $within_dist ) { 32 | # Both are bad. 
33 | $curr{is_good} = 0; 34 | # $prev{is_good} = 0; 35 | } else { 36 | $prev{is_good} == 1 and print STDOUT "$prev{line}\n"; 37 | } 38 | } 39 | %prev = %curr; 40 | } 41 | $prev{is_good} == 1 and print STDOUT "$prev{line}\n"; 42 | -------------------------------------------------------------------------------- /reseq_tools/rm_same_site.pl: -------------------------------------------------------------------------------- 1 | #!/usr/bin/perl 2 | use strict; 3 | use warnings; 4 | use LogInforSunhh; 5 | 6 | my $geno_col = 2; 7 | 8 | -t and !@ARGV and die "perl $0 in_pasted_1col.tbl > snp.tbl\nPlease note that the geno_col is $geno_col\n"; 9 | 10 | 11 | while (<>) { 12 | $. % 1e6 == 1 and &tsmsg("[Msg] Reading $. lines.\n"); 13 | s/[^\S\t]+$//; 14 | my @ta = split(/\t/, $_); 15 | if ( $ta[0] eq 'chr' ) { 16 | print "$_\n"; 17 | next; 18 | } 19 | my $base = 'N'; 20 | my $has_diff = 0; 21 | for (my $i=$geno_col; $i<@ta; $i++) { 22 | $ta[$i] = uc($ta[$i]); 23 | $ta[$i] eq 'N' and next; 24 | $base eq 'N' and $base = $ta[$i]; 25 | $base ne $ta[$i] and do { $has_diff = 1; last; }; 26 | } 27 | $has_diff == 1 and print "$_\n"; 28 | } 29 | -------------------------------------------------------------------------------- /reseq_tools/rm_same_site_hete2N.pl: -------------------------------------------------------------------------------- 1 | #!/usr/bin/perl 2 | use strict; 3 | use warnings; 4 | use LogInforSunhh; 5 | 6 | my $geno_col = 2; 7 | -t and !@ARGV and die "perl $0 in_wiSame.snp\nPlease note that geno_col=$geno_col\nHere we treat heterozygous site as 'N', with indel accepted.\n"; 8 | 9 | 10 | while (<>) { 11 | $. % 1e6 == 1 and &tsmsg("[Msg] Reading $. 
lines.\n"); 12 | s/[^\S\t]+$//; 13 | my @ta = split(/\t/, $_); 14 | if ( $ta[0] eq 'chr' ) { 15 | print "$_\n"; 16 | next; 17 | } 18 | my $base = 'N'; 19 | my $has_diff = 0; 20 | for (my $i=$geno_col; $i<@ta; $i++) { 21 | $ta[$i] = uc($ta[$i]); 22 | $ta[$i] =~ m/^[ATGC]$|\*|\+/ or $ta[$i] = 'N'; 23 | $ta[$i] eq 'N' and next; 24 | $base eq 'N' and $base = $ta[$i]; 25 | $base ne $ta[$i] and do { $has_diff = 1; last; }; 26 | } 27 | $has_diff == 1 and print "$_\n"; 28 | } 29 | -------------------------------------------------------------------------------- /reseq_tools/scripts/cnvt_melt_to_matrix.pl: -------------------------------------------------------------------------------- 1 | #!/usr/bin/perl 2 | use strict; 3 | use warnings; 4 | 5 | -t and !@ARGV and die "perl $0 in.melt > out.matrix\n"; 6 | 7 | # 1_608963_v75845_65251 ARO18917 1/1 14 0 8 | # 1_608963_v75845_65251 ARO18920 1/1 45 0 9 | # 1_608963_v75845_65251 ARO19494 1/1 24 0 10 | # 1_608963_v75845_65251 ARO20587 1/1 16 0 11 | 12 | my (%col1, %col2, %col_val); 13 | 14 | while (<>) { 15 | my @ta=split(/\t/, $_); 16 | chomp(@ta); 17 | $col1{$ta[0]} //= $.; 18 | $col2{$ta[1]} //= $.; 19 | $col_val{$ta[0]}{$ta[1]} //= $ta[2]; 20 | } 21 | my @arr1 = sort {$col1{$a}<=>$col1{$b}} keys %col1; 22 | my @arr2 = sort {$col2{$a}<=>$col2{$b}} keys %col2; 23 | print STDOUT join("\t", "Sample", @arr1)."\n"; 24 | for my $a2 (@arr2) { 25 | my @o = ($a2); 26 | for my $a1 (@arr1) { $col_val{$a1}{$a2} //= "./."; push(@o, $col_val{$a1}{$a2}); } 27 | print STDOUT join("\t", @o)."\n"; 28 | } 29 | 30 | -------------------------------------------------------------------------------- /reseq_tools/slct_sweep/merge_wind.pl: -------------------------------------------------------------------------------- 1 | #!/usr/bin/perl 2 | use strict; 3 | use warnings; 4 | use mathSunhh; 5 | my $mat_obj = mathSunhh->new(); 6 | 7 | my %raw_blks; 8 | my %ord; 9 | while (<>) { 10 | chomp; 11 | my @ta = split(/\t/, $_); 12 | $ta[1] eq 'chrS' and next; 13 
| $ord{$ta[0]} //= $.; 14 | push(@{$raw_blks{$ta[0]}}, [$ta[1], $ta[2]]); 15 | } 16 | my %merged_blks; 17 | print STDOUT join("\t", qw/chr start end/)."\n"; 18 | for my $tk (sort {$ord{$a} <=> $ord{$b}} keys %raw_blks) { 19 | $merged_blks{$tk} = $mat_obj->mergeLocBlk( $raw_blks{$tk} ); 20 | for my $tr1 (@{$merged_blks{$tk}}) { 21 | print STDOUT join("\t", $tk, $tr1->[0], $tr1->[1])."\n"; 22 | } 23 | } 24 | 25 | -------------------------------------------------------------------------------- /reseq_tools/slct_sweep/merge_wind_pos.pl: -------------------------------------------------------------------------------- 1 | #!/usr/bin/perl 2 | use strict; 3 | use warnings; 4 | use LogInforSunhh; 5 | use mathSunhh; 6 | use Getopt::Long; 7 | my %opts; 8 | GetOptions(\%opts, 9 | "dist2join:i", # 1 10 | "help!", 11 | ); 12 | $opts{'dist2join'} //= 1; 13 | 14 | my $help_txt = < merged_loc 17 | 18 | -dist2join [$opts{'dist2join'}] 19 | 20 | HH 21 | 22 | -t and !@ARGV and &LogInforSunhh::usage($help_txt); 23 | $opts{'help'} and &LogInforSunhh::usage($help_txt); 24 | 25 | my $mat_obj = mathSunhh->new(); 26 | 27 | my %raw_blks; 28 | my %ord; 29 | while (<>) { 30 | chomp; 31 | my @ta = split(/\t/, $_); 32 | $ta[1] =~ m/^[\d\.]+$/ or next; 33 | $ord{$ta[0]} //= $.; 34 | $ta[2] //= $ta[1]; 35 | push(@{$raw_blks{$ta[0]}}, [$ta[1], $ta[2]]); 36 | } 37 | my %merged_blks; 38 | print STDOUT join("\t", qw/chr start end/)."\n"; 39 | for my $tk (sort {$ord{$a} <=> $ord{$b}} keys %raw_blks) { 40 | $merged_blks{$tk} = $mat_obj->mergeLocBlk( $raw_blks{$tk}, 'dist2join'=>$opts{'dist2join'} ); 41 | for my $tr1 (@{$merged_blks{$tk}}) { 42 | print STDOUT join("\t", $tk, $tr1->[0], $tr1->[1])."\n"; 43 | } 44 | } 45 | 46 | -------------------------------------------------------------------------------- /reseq_tools/slct_sweep/rm_overlap_wind.pl: -------------------------------------------------------------------------------- 1 | #!/usr/bin/perl 2 | use strict; 3 | use warnings; 4 | 5 | # ChromID WindS 
WindE WindL BpCnt perKb_0 6 | # SpoScf_00001 1 10000 10000 10000 0.0924198651487334 7 | # SpoScf_00001 1001 11000 10000 10000 0.0439350166638849 8 | # SpoScf_00001 2001 12000 10000 10000 0.0348612437039331 9 | 10 | my @prev; 11 | while (<>) { 12 | chomp; 13 | my @ta = split(/\t/, $_); 14 | if (!@prev or $prev[0] ne $ta[0]) { 15 | print "$_\n"; 16 | @prev = @ta; 17 | next; 18 | } 19 | $prev[2] >= $ta[1] and next; 20 | print "$_\n"; 21 | @prev = @ta; 22 | } 23 | 24 | -------------------------------------------------------------------------------- /reseq_tools/slct_sweep/rod_from_PIavg.pl: -------------------------------------------------------------------------------- 1 | #!/usr/bin/perl 2 | use strict; 3 | use warnings; 4 | 5 | my $avgC1 = 5; # high 6 | my $avgC2 = 6; # low 7 | 8 | -t and !@ARGV and die "paste filt1_w50ks5k.pi.gCC.PIavg filt1_w50ks5k.pi.gCA.PIavg | deal_table.pl -column 0-5,11 | perl $0 > CC_CA.avgComp\n"; 9 | 10 | while (<>) { 11 | chomp; 12 | my @ta=split(/\t/, $_); 13 | if ($ta[0] eq 'ChromID') { 14 | print STDOUT join("\t", qw/CHROM BIN_START BIN_END BpCnt perKb_High perKb_Low MEAN_Est/)."\n"; 15 | next; 16 | } 17 | my $est = ( $ta[$avgC1]+$ta[$avgC2] > 0 ) ? ($ta[$avgC1]/($ta[$avgC1]+$ta[$avgC2])) : 'NA' ; 18 | print STDOUT join("\t", @ta[0,1,2,4,$avgC1, $avgC2], $est)."\n"; 19 | } 20 | -------------------------------------------------------------------------------- /reseq_tools/tassel/cnvt_col_to_TasselTaxaList.pl: -------------------------------------------------------------------------------- 1 | #!/usr/bin/perl 2 | use strict; 3 | use warnings; 4 | 5 | print STDOUT <) { 13 | chomp; 14 | my @ta=split(/\t/, $_); 15 | $cnt ++; 16 | my $taxaTxt = <) { 11 | chomp; 12 | m!^\s*##! and do { print "$_\n"; next; }; 13 | my @ta = split(/\t/, $_); 14 | if (m!^\s*#CHROM!) { 15 | print join("\t", @ta[0..8], $id, @ta[9..$#ta])."\n"; 16 | next; 17 | } 18 | $ta[7] = '.'; 19 | my $ii; 20 | if ($ta[8] =~ m!^GT(:|$)!) 
{ 21 | $ii = 0; 22 | } else { 23 | my @tc = split(/:/, $ta[8]); 24 | for (my $i0=0; $i0<@tc; $i0++) { 25 | $tc[$i0] eq 'GT' and do { $ii = $i0; last; }; 26 | } 27 | } 28 | defined $ii or do { &tsmsg("[Err][Wrn] bad FORMAT: [$ta[8]]: $_\n"); next; }; 29 | $ta[8] = 'GT'; 30 | for my $tb (@ta[9..$#ta]) { 31 | $tb = (split(/:/, $tb))[$ii]; 32 | } 33 | print join("\t", @ta[0..8], '0/0', @ta[9..$#ta])."\n"; 34 | } 35 | 36 | -------------------------------------------------------------------------------- /reseq_tools/xpclr/chk_nonsyn.pl: -------------------------------------------------------------------------------- 1 | #!/usr/bin/perl 2 | use strict; 3 | use warnings; 4 | 5 | my %prev; 6 | my @lines; 7 | while (<>) { 8 | chomp; 9 | my @ta = split(/\t/, $_); 10 | $ta[0] eq 'MarkerID' and next; 11 | $ta[1] eq $ta[7] or die "Bad:$_\n"; 12 | push(@lines, [$_, '']); 13 | my %curr; 14 | $curr{'chrID'} = $ta[1]; 15 | $curr{'chrP'} = $ta[2]; 16 | $curr{'chrMend'} = $ta[8]; 17 | $curr{'chrLine'} = $_; 18 | 19 | if ( !(defined $prev{'chrID'}) or $prev{'chrID'} ne $curr{'chrID'} ) { 20 | %prev = %curr; 21 | next; 22 | } 23 | 24 | if ( $prev{'chrMend'} < $curr{'chrMend'} and $prev{'chrP'} > $curr{'chrP'} ) { 25 | $lines[-2][1] = 'Chk:'; 26 | $lines[-1][1] = 'Chk:'; 27 | } 28 | 29 | %prev = %curr; 30 | } 31 | for (@lines) { 32 | print "$_->[1]$_->[0]\n"; 33 | } 34 | -------------------------------------------------------------------------------- /reseq_tools/xpclr/get_uniq_cM.pl: -------------------------------------------------------------------------------- 1 | #!/usr/bin/perl 2 | use strict; 3 | use warnings; 4 | use LogInforSunhh; 5 | use fileSunhh; 6 | use mathSunhh; 7 | 8 | !@ARGV and die "perl $0 marker_loc2GM_man\n"; 9 | 10 | my $fn_gm = shift; 11 | 12 | my $fh_gm = &openFH($fn_gm, '<'); 13 | my %gmP_to_line; 14 | my %ord; 15 | while (&wantLineC($fh_gm)) { 16 | my @ta = &splitL("\t", $_); 17 | if ( $. 
== 0 and $ta[0] eq 'MarkerID' ) { 18 | print STDOUT "$_\n"; 19 | next; 20 | } 21 | my $tk = "$ta[7]_$ta[8]"; 22 | push(@{$gmP_to_line{$tk}}, [@ta]); 23 | $ord{$tk} //= $.; 24 | } 25 | close($fh_gm); 26 | 27 | for my $tk (sort { $ord{$a} <=> $ord{$b} } keys %gmP_to_line) { 28 | my $midN = int( $#{$gmP_to_line{$tk}}/2 ); 29 | print STDOUT join("\t", @{$gmP_to_line{$tk}[$midN]})."\n"; 30 | } 31 | 32 | -------------------------------------------------------------------------------- /reseq_tools/xpclr/plot_manhattan/chrLen: -------------------------------------------------------------------------------- 1 | WM97_Chr01 34083085 2 | WM97_Chr02 34414252 3 | WM97_Chr03 28939167 4 | WM97_Chr04 24315960 5 | WM97_Chr05 33714806 6 | WM97_Chr06 27018480 7 | WM97_Chr07 31477646 8 | WM97_Chr08 26149438 9 | WM97_Chr09 34986854 10 | WM97_Chr10 28419553 11 | WM97_Chr11 27106780 12 | -------------------------------------------------------------------------------- /reseq_tools/xpclr/plot_manhattan/chrLen_cum: -------------------------------------------------------------------------------- 1 | chrID chrLen chrCumS chrCumE 2 | WM97_Chr01 34083085 1 34083085 3 | WM97_Chr02 34414252 34183086 68597337 4 | WM97_Chr03 28939167 68697338 97636504 5 | WM97_Chr04 24315960 97736505 122052464 6 | WM97_Chr05 33714806 122152465 155867270 7 | WM97_Chr06 27018480 155967271 182985750 8 | WM97_Chr07 31477646 183085751 214563396 9 | WM97_Chr08 26149438 214663397 240812834 10 | WM97_Chr09 34986854 240912835 275899688 11 | WM97_Chr10 28419553 275999689 304419241 12 | WM97_Chr11 27106780 304519242 331626021 13 | -------------------------------------------------------------------------------- /reseq_tools/xpclr/xpclr_wind_cmd_wiGmP.pl: -------------------------------------------------------------------------------- 1 | #!/usr/bin/perl 2 | use strict; 3 | use warnings; 4 | use LogInforSunhh; 5 | use fileSunhh; 6 | 7 | !@ARGV and die "perl $0 wind_list grpList_1 grpList_2 work_dir\n"; 8 | 9 | my $oDir = 
pop(@ARGV); 10 | -e $oDir and die "exist $oDir\n"; 11 | mkdir($oDir); 12 | 13 | open F,'<',"$ARGV[0]" or die; 14 | my %tmp_cnt = ( 'cntN_base'=>0 , 'cntN_step'=>5e5 ); 15 | while (<F>) { 16 | &fileSunhh::log_section( $. , \%tmp_cnt ) and &tsmsg( "[Msg] $. line.\n" ); 17 | chomp; 18 | my @ta=split(/\t/, $_); 19 | $ta[1] =~ m/^chr(\d+)$/i or next; 20 | my $cn = $1; 21 | my $oWind = $ta[0]; 22 | $oWind =~ m!^.*/([^/]+)$! or die "bad $oWind\n"; 23 | $oWind = "$oDir/$1"; 24 | &fileSunhh::_copy( $ta[0], $oWind ); 25 | print ("perl prepare_xpclr_input_wiGmP.pl $oWind.xpclr $oWind $ARGV[1] $ARGV[2] ; XPCLR -xpclr $oWind.xpclr.chr${cn}_g1.geno $oWind.xpclr.chr${cn}_g2.geno $oWind.xpclr.chr${cn}.snp $oWind.xpclr.chr${cn}.snp.out -w1 0.0005 100 100 $cn -p0 0.7 ;\n"); 26 | } 27 | close F; 28 | 29 | -------------------------------------------------------------------------------- /rnaseq_tools/add_sizefactor.pl: -------------------------------------------------------------------------------- 1 | #!/usr/bin/perl 2 | use strict; 3 | use warnings; 4 | use fileSunhh; 5 | 6 | !@ARGV and die "perl $0 in.total.sizefactor in.cnt\n"; 7 | 8 | open F1,'<',"$ARGV[0]" or die; 9 | my %h; 10 | while (&wantLineC(\*F1)) { 11 | my @ta=&splitL("\t", $_); 12 | $ta[0] eq 'sampleNames' and next; 13 | # $ta[0] =~ s/^\S+_([FP][13]_[^_]+_rep\d+)$/$1/ or die "$_\n"; 14 | $ta[0] =~ s/^\S+_([SM](?:FL|FR|LV|RT|SD|ST)(?:F1|P1|P3)_rep\d+)/$1/ or die "$ta[0]\n"; 15 | $h{$1} = $ta[1]; 16 | } 17 | close F1; 18 | 19 | open F2,'<',"$ARGV[1]" or die; 20 | while (<F2>) { 21 | print "$_"; 22 | if ($. 
== 1) { 23 | chomp; 24 | my @ta = split(/\t/, $_); 25 | my @tb = ('sizeFactor'); 26 | for (my $i=1; $i<@ta; $i++) { 27 | # $ta[$i] =~ s/^\S+_([FP][13]_[^_]+_rep\d+)$/$1/ or die "$ta[$i]\n"; 28 | $ta[$i] =~ s/^\S+_([SM](?:FL|FR|LV|RT|SD|ST)(?:F1|P1|P3)_rep\d+)$/$1/ or die "$ta[$i]\n"; 29 | defined $h{$ta[$i]} or die "$ta[$i]\n"; 30 | $tb[$i] = $h{$ta[$i]}; 31 | } 32 | print join("\t", @tb)."\n"; 33 | } 34 | } 35 | close F2; 36 | 37 | -------------------------------------------------------------------------------- /rnaseq_tools/coexp/add_abs_toKME.pl: -------------------------------------------------------------------------------- 1 | #!/usr/bin/perl 2 | use strict; 3 | use warnings; 4 | 5 | while (<>) { 6 | chomp; 7 | my @ta=split(/\t/, $_); 8 | if ($. == 1) { 9 | print join("\t", @ta, "absKME")."\n"; 10 | } else { 11 | print join("\t", @ta, abs($ta[1]))."\n"; 12 | } 13 | } 14 | -------------------------------------------------------------------------------- /rnaseq_tools/coexp/cmd_list: -------------------------------------------------------------------------------- 1 | ~/bin/Rscript dat1_wgcna.r 2 | -------------------------------------------------------------------------------- /rnaseq_tools/coexp/heatmap_by_mod_dist.r: -------------------------------------------------------------------------------- 1 | #!/home/Sunhh/bin/Rscript 2 | argvs <- commandArgs( trailingOnly=TRUE ); 3 | if ( is.na(argvs[1]) ) { 4 | message("Rscript this.R "); 5 | q(); 6 | } 7 | fn_simMat <- as.character(argvs[1]) ; # Output of : perl dist_of_twoKME.pl sign redo/wgcna_dat1_signed/dat1_KME.txt unsi redo/wgcna_dat1_unsign/dat1_KME.txt > tt 8 | fn_outPdf <- as.character(argvs[2]) ; 9 | 10 | library(pheatmap) 11 | library(RColorBrewer) 12 | colors <- colorRampPalette( rev(brewer.pal(n= 9, name= "OrRd")) )(255) 13 | 14 | simMat <- read.table(fn_simMat, header=T, row.names=1, stringsAsFactors=F) 15 | distMat <- as.dist(1-simMat) 16 | distMat.mat <- as.matrix(distMat) 17 | 18 | 
pdf(file=fn_outPdf, width=8, height=8) 19 | pheatmap(distMat.mat, 20 | clustering_distance_rows = distMat , 21 | clustering_distance_cols = distMat , 22 | col = colors , 23 | show_colnames=TRUE) 24 | dev.off() 25 | 26 | -------------------------------------------------------------------------------- /rnaseq_tools/coexp/input/dat1_rpkmMean.gz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Sunhh/NGS_data_processing/7c202e9c150708c32c2d2a852bb7925adceb70f7/rnaseq_tools/coexp/input/dat1_rpkmMean.gz -------------------------------------------------------------------------------- /rnaseq_tools/compare_SensAnti.pl: -------------------------------------------------------------------------------- 1 | #!/usr/bin/perl 2 | use strict; 3 | use warnings; 4 | use fileSunhh; 5 | use LogInforSunhh; 6 | 7 | my $hh = <<"H1"; 8 | 9 | perl $0 S636_fixNH.bam.cntSens S636_fixNH.bam.cntAnti S636_fixNH.bam_pref > S636_fixNH.bam.SensByAnti 10 | 11 | H1 12 | 13 | @ARGV >= 3 or &LogInforSunhh::usage($hh); 14 | 15 | my $fn1 = shift; 16 | my $fn2 = shift; 17 | my $pref = shift; 18 | 19 | my ($sum1, $sum2); 20 | 21 | for (map { $_->[1] } grep { !($_->[0] =~ m!^__!)} &fileSunhh::load_tabFile($fn1)) { 22 | $sum1 += $_; 23 | } 24 | for (map { $_->[1] } grep { !($_->[0] =~ m!^__!)} &fileSunhh::load_tabFile($fn2)) { 25 | $sum2 += $_; 26 | } 27 | 28 | print STDOUT join("\t", qw/inPref sumSens sumAnti SensByAnti SensByTotl/)."\n"; 29 | my $r = ($sum2 > 0) ? sprintf("%0.4f", $sum1/$sum2) : -1 ; 30 | my $r2 = ($sum1 > 0 and $sum2 > 0) ? 
sprintf("%0.4f", $sum1/($sum1+$sum2)) : -1; 31 | print STDOUT join("\t", $pref, $sum1, $sum2, $r, $r2)."\n"; 32 | 33 | -------------------------------------------------------------------------------- /rnaseq_tools/fix_SRAfqID.pl: -------------------------------------------------------------------------------- 1 | #!/usr/bin/perl 2 | use strict; 3 | use warnings; 4 | 5 | -t and !@ARGV and die "perl $0 down_SRR_R2.fq > down_SRR_R2.fix.fq\n"; 6 | 7 | while (my $l1 = <>) { 8 | my $l2 = <>; 9 | my $l3 = <>; 10 | my $l4 = <>; 11 | if ($l1 =~ s!^\@\S+\.([12])\s+([^:]+:\d+:[^:]+(:\d+){4})(?:\s+.+|\s*)$!\@$2 $1!) { 12 | } elsif ($l1 =~ s!^\@(\S+)\.([12])(?:\s*|\s+length=\d+\s*)$!\@$1 $2!) { 13 | } 14 | $l3 =~ s!^\+.*$!+!; 15 | print "$l1$l2$l3$l4"; 16 | } 17 | -------------------------------------------------------------------------------- /rnaseq_tools/fix_excelV.pl: -------------------------------------------------------------------------------- 1 | #!/user/bin/perl 2 | use strict; 3 | use warnings; 4 | use fileSunhh; 5 | 6 | # Min value that I want to use in Excel is '9E-308'; 7 | -t and !@ARGV and die "perl $0 compList_inFruit_01.FDR > compList_inFruit_01.FDR_fixSmallV\n"; 8 | 9 | while (<>) { 10 | chomp; 11 | my @ta = &splitL("\t", $_); 12 | for my $tb (@ta) { 13 | if ($tb =~ m!^([+-]?)(\d+(?:\.\d+)?)e\-(\d+)$!i) { 14 | my ($str, $prevE, $afterE) = ($1, $2, $3); 15 | if ( $afterE > 308 ) { 16 | $tb = "9E-308"; 17 | } elsif ( $afterE == 308 and $prevE < 9 ) { 18 | $tb = "9E-308"; 19 | } else { 20 | } 21 | } 22 | } 23 | print STDOUT join("\t", @ta)."\n"; 24 | } 25 | 26 | -------------------------------------------------------------------------------- /rnaseq_tools/fromOthers/record: -------------------------------------------------------------------------------- 1 | run_TMM_scale_matrix.pl : This script comes from Trinity, and converts a raw counts matrix (not normalized) to a TMM-scaled matrix. The input could also be TPM without normalization across samples. 
2 | 3 | -------------------------------------------------------------------------------- /rnaseq_tools/get_DESeqNormCnt.pl: -------------------------------------------------------------------------------- 1 | #!/usr/bin/perl 2 | use strict; 3 | use warnings; 4 | use Getopt::Long; 5 | my %opts; 6 | GetOptions(\%opts, 7 | "digits:i", # 2 8 | ); 9 | 10 | -t and !@ARGV and die "\nperl $0 P1g_perGene.sense.cnt.3_noSum_keep_wiSizeFact\n\n"; 11 | 12 | $opts{'digits'} //= 2; 13 | 14 | my @sf; 15 | while (<>) { 16 | chomp; 17 | my @ta = split(/\t/, $_); 18 | if ($. == 1) { 19 | print STDOUT "$_\n"; 20 | next; 21 | } 22 | if ($. == 2) { 23 | $ta[0] =~ m/sizeFactor/i or die "The second line should be sizeFactor\n"; 24 | @sf = @ta; 25 | next; 26 | } 27 | for (my $i=1; $i<@ta; $i++) { 28 | $ta[$i] = sprintf("%0.$opts{'digits'}f", $ta[$i]/$sf[$i]); 29 | } 30 | print STDOUT join("\t", @ta)."\n"; 31 | } 32 | 33 | -------------------------------------------------------------------------------- /rnaseq_tools/map_to_genome/cnvt_featureCounts_to_tpm.r: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env Rscript 2 | 3 | # Read command line arguments 4 | args <- commandArgs(trailingOnly = TRUE) 5 | 6 | if (length(args) != 2) { 7 | stop("Usage: Rscript calculate_tpm.R ") 8 | } 9 | 10 | input_file <- args[1] 11 | output_file <- args[2] 12 | 13 | # Read featureCounts output 14 | fc <- read.delim(input_file, comment.char = "#") 15 | 16 | # Remove summary rows if present (e.g., "__no_feature") 17 | fc <- fc[!grepl("^__", fc[,1]), ] 18 | 19 | # Extract gene lengths and raw counts 20 | gene_lengths <- fc$Length # column usually named 'Length' 21 | raw_counts <- fc[, 7:ncol(fc)] # columns after column 6 are sample counts 22 | 23 | # Calculate RPK 24 | rpk <- raw_counts / (gene_lengths / 1000) 25 | 26 | # Calculate TPM 27 | tpm <- apply(rpk, 2, function(x) x / sum(x) * 1e6) 28 | 29 | # Combine with gene ID 30 | tpm_df <- data.frame(GeneID = 
fc$Geneid, tpm) 31 | 32 | # Write to output 33 | write.table(tpm_df, file = output_file, sep = "\t", quote = FALSE, row.names = FALSE) 34 | -------------------------------------------------------------------------------- /rnaseq_tools/map_to_transcriptome/cnvt_synOGgrp_to_trans2gene.pl: -------------------------------------------------------------------------------- 1 | #!/usr/bin/perl 2 | use strict; 3 | use warnings; 4 | 5 | -t and !@ARGV and die "perl $0 data/slct-synOG.grp > data/slct-synOG.trans2gene.txt\n"; 6 | 7 | print "transcript_id\tgene_id\n"; 8 | while (<>) { 9 | chomp; 10 | my @ta=split(/\s|,/, $_); 11 | $ta[0] eq "OGID" and next; 12 | my ($grpID) = shift(@ta); 13 | for my $tb (@ta) { 14 | $tb =~ m!^\s*$! and next; 15 | print "$tb\t$grpID\n"; 16 | } 17 | } 18 | 19 | -------------------------------------------------------------------------------- /run_reapr.sh: -------------------------------------------------------------------------------- 1 | ### Basic functions. 2 | function exe_cmd { 3 | echo "[$(date)][CMD] $1" 4 | eval "$1" 5 | echo "[$(date)][Rec] Done." 6 | } 7 | 8 | function tsmsg { 9 | echo "[$(date)]$1" 10 | } 11 | 12 | 13 | exe_reapr='/home/Sunhh/src/Assemble/REAPR/Reapr_1.0.17/reapr' 14 | cpuN=10 15 | rd_ident=0.99 16 | 17 | in_fa='NSP306_Pla03s01GC_Gt5h.scf.fa' 18 | use_faPref='NSP306_Pla03s01GC_Gt5h' 19 | odir="${use_faPref}_15kb" 20 | 21 | #long_fq1=/home/Sunhh/Assembly/Sweetpotato/NSP306/FinalFq/NSP306_15kb_rc_sub100_R1.fq 22 | #long_fq2=/home/Sunhh/Assembly/Sweetpotato/NSP306/FinalFq/NSP306_15kb_rc_sub100_R2.fq 23 | long_fq1=/home/Sunhh/Assembly/Sweetpotato/NSP306/FinalFq/NSP306_15kb_pTr_rc_R1.fq 24 | long_fq2=/home/Sunhh/Assembly/Sweetpotato/NSP306/FinalFq/NSP306_15kb_pTr_rc_R2.fq 25 | fq_label=$odir 26 | 27 | shrtBamPref="" 28 | longBam="${fq_label}_long.bam" 29 | 30 | para_smalt="-n $cpuN -y $rd_ident" 31 | para_pipe="-break a=1 -score f=15" 32 | 33 | tsmsg "[Rec] Begin." 
34 | 35 | cmd="$exe_reapr facheck $in_fa $use_faPref" 36 | exe_cmd $cmd 37 | 38 | cmd="$exe_reapr smaltmap $para_smalt ${use_faPref}.fa $long_fq1 $long_fq2 $longBam" 39 | exe_cmd $cmd 40 | 41 | cmd="$exe_reapr pipeline $para_pipe ${use_faPref}.fa $longBam $odir $shrtBamPref" 42 | exe_cmd $cmd 43 | 44 | tsmsg "[Rec] All done." 45 | -------------------------------------------------------------------------------- /sample_scripts/check_pm_version.pl: -------------------------------------------------------------------------------- 1 | #!/usr/bin/perl 2 | # Copied from http://www.perlmonks.org/?node_id=37237 3 | 4 | use CPAN; 5 | 6 | printf("%-20s %10s %10s\n", "Module", "Installed", "CPAN"); 7 | 8 | foreach $a (@ARGV) { 9 | foreach $mod (CPAN::Shell->expand("Module", $a)){ 10 | printf("%-20s %10s %10s %s\n", 11 | $mod->id, 12 | $mod->inst_version eq "undef" || !defined($mod->inst_version) 13 | ? "-" : $mod->inst_version, 14 | $mod->cpan_version eq "undef" || !defined($mod->cpan_version) 15 | ? "-" : $mod->cpan_version, 16 | $mod->uptodate ? 
"" : "*" 17 | ); 18 | } 19 | } 20 | -------------------------------------------------------------------------------- /sample_scripts/cmd_list_trinity_denovo: -------------------------------------------------------------------------------- 1 | # https://github.com/trinityrnaseq/RNASeq_Trinity_Tuxedo_Workshop/wiki/Trinity-De-novo-Transcriptome-Assembly-Workshop 2 | /data/Sunhh/src/Assemble/trinity/trinityrnaseq-Trinity-v2.4.0/Trinity \ 3 | --seqType fq \ 4 | --max_memory 200G \ 5 | --left reads/Flower_R1.fq,reads/Fruit_R1.fq,reads/Leaf_R1.fq,reads/Root_R1.fq,reads/Stem_R1.fq \ 6 | --right reads/Flower_R2.fq,reads/Fruit_R2.fq,reads/Leaf_R2.fq,reads/Root_R2.fq,reads/Stem_R2.fq \ 7 | --KMER_SIZE 32 \ 8 | --CPU 24 \ 9 | --SS_lib_type RF \ 10 | --min_contig_length 200 \ 11 | --output trinity_novo01 \ 12 | --full_cleanup 13 | /data/Sunhh/src/Assemble/trinity/trinityrnaseq-Trinity-v2.4.0/Trinity \ 14 | --seqType fq \ 15 | --max_memory 200G \ 16 | --left reads/Flower_R1.fq,reads/Fruit_R1.fq,reads/Leaf_R1.fq,reads/Root_R1.fq,reads/Stem_R1.fq \ 17 | --right reads/Flower_R2.fq,reads/Fruit_R2.fq,reads/Leaf_R2.fq,reads/Root_R2.fq,reads/Stem_R2.fq \ 18 | --KMER_SIZE 32 \ 19 | --CPU 24 \ 20 | --SS_lib_type RF \ 21 | --min_contig_length 200 \ 22 | --normalize_max_read_cov 100 \ 23 | --output trinity_novo02 \ 24 | --full_cleanup 25 | -------------------------------------------------------------------------------- /sample_scripts/run_trinity_guided.sh: -------------------------------------------------------------------------------- 1 | /data/Sunhh/src/Trinity/trinityrnaseq-2.0.6/Trinity --single 
reads/C_6_GCCAAT_rmRRNA.fq,reads/C_7_CAGATC_rmRRNA.fq,reads/C_8_ACTTGA_rmRRNA.fq,reads/C_13_AGTCAA_rmRRNA.fq,reads/C_14_AGTTCC_rmRRNA.fq,reads/C_15_ATGTCA_rmRRNA.fq,reads/C_22_CGTACG_rmRRNA.fq,reads/C_23_GAGTGG_rmRRNA.fq,reads/C_24_GGTAGC_rmRRNA.fq,reads/C_31_CACGAT_rmRRNA.fq,reads/C_32_CACTCA_rmRRNA.fq,reads/C_33_CAGGCG_rmRRNA.fq,reads/C_40_TGACCA_rmRRNA.fq,reads/C_41_ACAGTG_rmRRNA.fq,reads/C_42_GCCAAT_rmRRNA.fq,reads/C_49_AGTCAA_rmRRNA.fq,reads/C_50_AGTTCC_rmRRNA.fq,reads/C_51_ATGTCA_rmRRNA.fq --seqType fq --max_memory 40G --SS_lib_type R --CPU 16 --normalize_reads --output P1_trinity_guided --genome_guided_bam 02.map2P1AllUnmsk_thout.accepted_hits.srt.bam --genome_guided_max_intron 100000 --genome_guided_min_coverage 2 2 | -------------------------------------------------------------------------------- /sample_scripts/svg2png.pl: -------------------------------------------------------------------------------- 1 | #!/usr/bin/perl -w 2 | use strict; 3 | use Cwd; 4 | 5 | !@ARGV and die "perl $0 fdsaf\n"; 6 | 7 | my $origin_dir = getcwd(); 8 | 9 | for my $dn (`ls -d *_data`) { 10 | chomp($dn); 11 | print STDOUT "DIR=[$dn]\n"; 12 | -d $dn or next; 13 | chdir($dn); 14 | my $rilN = $dn; 15 | $rilN =~ s/_data$//; 16 | 17 | mkdir("PNG", 0755); 18 | mkdir("SVG", 0755); 19 | my @png_files; 20 | my @svg_files; 21 | for my $fn (`ls *.svg`) { 22 | chomp($fn); 23 | $fn =~ s/\.svg$//; 24 | system "convert $fn.svg $fn.png"; 25 | print "convert $fn.svg $fn.png\n"; 26 | push(@png_files, "$fn.png"); 27 | push(@svg_files, "$fn.svg"); 28 | } 29 | my $nn = scalar(@png_files); 30 | my $tt = $" ; 31 | local $" = " "; 32 | my $merge_cmd = "montage @png_files -tile 1x$nn -geometry -0-0 ${rilN}_Chroms.png"; 33 | my $mv_svg_cmd = "mv @svg_files SVG/"; 34 | my $mv_png_cmd = "mv @png_files PNG/"; 35 | $" = $tt; 36 | system("$merge_cmd"); 37 | print STDOUT "[Cmd]$merge_cmd\n"; 38 | system("$mv_svg_cmd"); 39 | print STDOUT "[Cmd]$mv_svg_cmd\n"; 40 | system("$mv_png_cmd"); 41 | print STDOUT 
"[Cmd]$mv_png_cmd\n"; 42 | chdir($origin_dir); 43 | print STDOUT "[Msg]$dn processing done.\n"; 44 | } 45 | 46 | -------------------------------------------------------------------------------- /self_interest/list_all_dir.pl: -------------------------------------------------------------------------------- 1 | #!/usr/bin/perl 2 | use strict; 3 | use warnings; 4 | 5 | my $pd0 = shift; 6 | my @all_sub = &list_sub($pd0); 7 | for (@all_sub) { 8 | print "$_\n"; 9 | } 10 | 11 | sub list_sub { 12 | # Skip links; 13 | my ($pd) = @_; 14 | # warn "pd=$pd\n"; 15 | my @subB = ($pd); 16 | if (! -d $pd ) { 17 | # $pd is not a folder; return $pd itself; 18 | return(@subB); 19 | } 20 | # $pd is a folder, get the children of $pd; 21 | opendir D0,$pd or die "failed to opendir [$pd]\n"; 22 | my @sub1 = map { "$pd/$_" } grep { $_ !~ m!^\.\.?$! } readdir(D0); 23 | push(@subB, @sub1); 24 | # There might be subdirs in @sub1, so check it. 25 | for my $sd1 (@sub1) { 26 | my @sub2 = &list_sub($sd1); 27 | push(@subB, @sub2[1..$#sub2]); 28 | } 29 | return(@subB); 30 | }# list_sub() 31 | 32 | 33 | -------------------------------------------------------------------------------- /site_search/cmd_list: -------------------------------------------------------------------------------- 1 | fimo --o fimoOut_allTrans --bfile --motif-- yyt_motif.meme Cmel351.annot.chr.gff3.jnLoc.YYTpromoter_loc.fa 2 | perl parse_fimoTSV.pl fimoOut_allTrans/fimo.tsv > fimoOut_allTrans/fimo.tsv.info 3 | perl setup_keySite.pl fimoOut_allTrans/fimo.tsv.info > fimoOut_allTrans/fimo.tsv.info.keysite 4 | 5 | perl pipe_gatk_inFqList.pl \ 6 | -cpuN 15 \ 7 | -conf_file BY9Hyyt_gatk.conf \ 8 | -in_pref_list pref_BY9H \ 9 | -prj_ID BY9Hyyt \ 10 | -wrk_dir proc_BY9Hyyt \ 11 | -doStep 1,2,3,4,5,6,7,8,9 \ 12 | -plCatVar -intervalLen 1000000 \ 13 | -CallByScf 14 | 15 | 16 | -------------------------------------------------------------------------------- /site_search/parse_fimoTSV.pl: 
-------------------------------------------------------------------------------- 1 | #!/usr/bin/perl 2 | use strict; 3 | use warnings; 4 | 5 | while (<>) { 6 | m!^\s*$! and next; 7 | m!^\s*#! and next; 8 | chomp; 9 | my @ta=split(/\t/, $_); 10 | if ($. == 1) { 11 | print join("\t", @ta, qw/mrnaID SeqID MotifStart MotifEnd MotifStr/)."\n"; 12 | next; 13 | } 14 | # Parse ID; 15 | $ta[2] =~ m!^(\S+):([^\s:]+):([+-]):(\d+):(\d+)$! or die "bad ID [$ta[2]]\n"; 16 | my ($mrnaID, $seqID, $str, $pS, $pE) = ($1, $2, $3, $4, $5); 17 | my ($seqS, $seqE, $seqStr); 18 | if ($str eq '+') { 19 | $seqS = $pS + $ta[3] - 1; 20 | $seqE = $pS + $ta[4] - 1; 21 | $seqStr = $str; 22 | } elsif ($str eq '-') { 23 | # $seqS = $pS + ($pE-$pS+1-$ta[3]); 24 | # $seqE = $pS + ($pE-$pS+1-$ta[4]); 25 | $seqE = $pE - $ta[3] + 1; 26 | $seqS = $pE - $ta[4] + 1; 27 | $seqStr = $str; 28 | } else { 29 | die "$_\n"; 30 | } 31 | if ($ta[5] eq '-') { 32 | $seqStr =~ tr/+-/-+/; 33 | } 34 | print join("\t", @ta, $mrnaID, $seqID, $seqS, $seqE, $seqStr)."\n"; 35 | } 36 | # motif_id motif_alt_id sequence_name start stop strand score p-value q-value matched_sequence 37 | # yytNOR MELO3C024429T1:Cmel351_Chr01:+:35115704:35118203 349 359 + 16.7447 8.7e-08 1 ACACGTCACCT 38 | # yytNOR MELO3C000679T1:Cmel351_Chr00:-:17643862:17646361 429 439 - 16.7447 8.7e-08 1 ACACGTCACCT 39 | -------------------------------------------------------------------------------- /site_search/yyt_motif.meme: -------------------------------------------------------------------------------- 1 | MEME version 4 2 | 3 | ALPHABET= ACGT 4 | 5 | strands: + - 6 | 7 | Background letter frequencies 8 | A 0.334 C 0.166 G 0.166 T 0.334 9 | 10 | 11 | MOTIF yytNOR 12 | letter-probability matrix: alength= 4 w= 11 13 | 0.683734 0.069873 0.127581 0.118812 14 | 0.003460 0.924404 0.049505 0.022631 15 | 1.000000 0.000000 0.000000 0.000000 16 | 0.000000 1.000000 0.000000 0.000000 17 | 0.000000 0.000000 1.000000 0.000000 18 | 0.025360 0.229520 0.057426 0.687694 
19 | 0.530410 0.368034 0.015842 0.085714 20 | 0.635361 0.000000 0.000000 0.364639 21 | 0.100141 0.524752 0.083168 0.291938 22 | 0.057709 0.378501 0.057426 0.506365 23 | 0.212164 0.172843 0.128147 0.486846 24 | 25 | 26 | -------------------------------------------------------------------------------- /software_fix/anchorwave/fix_awMAF.pl: -------------------------------------------------------------------------------- 1 | #!/usr/bin/perl 2 | # [12/21/2022] Fix out.maf output from AnchorWave. 3 | ### http://genomewiki.ucsc.edu/index.php/Coordinate_Transforms 4 | use strict; 5 | use warnings; 6 | 7 | -t and !@ARGV and die "perl $0 AnchorWave_out_align1.maf > fixed.maf\n"; 8 | 9 | while (<>) { 10 | m!^#! and do { print; next; }; 11 | m!^a\s! or die "[Err] Unexpected format of MAF:$_\n"; 12 | my $s1=<>; 13 | my $s2=<>; 14 | my $blank=<>; 15 | $blank =~ m!^$! or die "[Err] Unexpected blank line: $blank\n"; 16 | $s1 = &fix_strPos($s1); 17 | $s2 = &fix_strPos($s2); 18 | print "$_$s1$s2$blank"; 19 | } 20 | 21 | sub fix_strPos { 22 | my ($ss) = @_; 23 | $ss =~ m!^s\s+\S+\s+(\d+)\s+(\d+)\s+([+-])\s+(\d+)! or die "[Err] Unexpected s line: $ss\n"; 24 | if ($3 eq '-') { 25 | my $newS = $4-($1+$2); 26 | $ss =~ s!^(s\s+\S+\s+)\d+(\s+)!$1$newS$2! or die "[Err] Failed to change: $ss\n"; 27 | } elsif ($3 eq '+') { 28 | ; 29 | } else { 30 | die "[Err] Impossible strand $3: $ss\n"; 31 | } 32 | return($ss); 33 | }# fix_strPos() 34 | 35 | -------------------------------------------------------------------------------- /temp/README.md: -------------------------------------------------------------------------------- 1 | # List of script functions. 2 | 3 | ## Convert figure formats. 4 | - Convert `pdf` format to `tiff` and `png`. 
5 | 6 | ```sh 7 | perl scripts/cnvt_pdf_to_tiff.pl /path/to/input.pdf -out_format 'tiff,png' [ -out_dpi 300 ] 8 | ``` 9 | 10 | -------------------------------------------------------------------------------- /temp/cmd_list_busco: -------------------------------------------------------------------------------- 1 | # python3 BUSCO_v1.1b1/BUSCO_v1.1b1.py --cpu 20 -o spinach_unigene_0415_byBUSCO -in spinach_unigene_0415.fa -l /data/Sunhh/src/Annot/BUSCO/eukaryota -m trans 2 | python3 BUSCO_v1.1b1/BUSCO_v1.1b1.py --cpu 20 -o spinach_unigene_0415_byBUSCO_wiAug -sp SPGr2FiltUse_AED0 -in spinach_unigene_0415.fa -l /data/Sunhh/src/Annot/BUSCO/eukaryota -m trans 3 | 4 | -------------------------------------------------------------------------------- /temp/cmd_list_cegma: -------------------------------------------------------------------------------- 1 | cegma -g spinach_unigene_0415.fa -o spinach_unigene_0415 --max_intron 0 -T 20 --verbose 2 | 3 | -------------------------------------------------------------------------------- /temp/cnvt_pairwise_to_tab.pl: -------------------------------------------------------------------------------- 1 | #!/usr/bin/perl 2 | use strict; 3 | use warnings; 4 | use Bio::SearchIO; 5 | 6 | !@ARGV and die "perl $0 in.blast > out.tab\n"; 7 | 8 | my $inBlastFile = shift; 9 | 10 | my $in = new Bio::SearchIO( 11 | -format => 'blast', 12 | -file => $inBlastFile 13 | ); 14 | 15 | while( my $result = $in->next_result ) { 16 | ## $result is a Bio::Search::Result::ResultI compliant object 17 | while( my $hit = $result->next_hit ) { 18 | ## $hit is a Bio::Search::Hit::HitI compliant object 19 | while( my $hsp = $hit->next_hsp ) { 20 | ## $hsp is a Bio::Search::HSP::HSPI compliant object 21 | print STDOUT join("\t", $result->query_name(), $hit->name(), $hit->description(), $hsp->score(), $hsp->evalue())."\n"; 22 | # if( $hsp->length('total') > 50 ) { 23 | # if ( $hsp->percent_identity >= 75 ) { 24 | # print "Query=", $result->query_name, 25 | # " Hit=", 
$hit->name, 26 | # " Length=", $hsp->length('total'), 27 | # " Percent_id=", $hsp->percent_identity, "\n"; 28 | # } 29 | # } 30 | } 31 | } 32 | } 33 | 34 | -------------------------------------------------------------------------------- /temp/forRonan/addID_to_loci.pl: -------------------------------------------------------------------------------- 1 | #!/usr/bin/perl 2 | use strict; 3 | use warnings; 4 | # Add new ID to .loc table. 5 | # Format of .loc : ID \\t Start \\t End \\n 6 | # New format : : ID \\t Start \\t End \\t Pref_ID_Start_End \\n 7 | 8 | !@ARGV and die "perl $0 pref in_raw.loc\n"; 9 | 10 | my $pref = shift; 11 | 12 | my %used; 13 | while (<>) { 14 | chomp; 15 | my @ta = split(/\t/, $_); 16 | my $new_id = "${pref}_$ta[0]_$ta[1]_$ta[2]"; 17 | defined $used{$new_id} and die "repeat new id=$new_id\n"; 18 | $used{$new_id} = 1; 19 | print STDOUT join("\t", @ta[0,1,2], $new_id)."\n"; 20 | } 21 | 22 | -------------------------------------------------------------------------------- /temp/forRonan/depC_cutoff_by_dep_stat.pl: -------------------------------------------------------------------------------- 1 | #!/bin/env perl 2 | use strict; 3 | use warnings; 4 | 5 | my ($interval_mean, $interval_stdev); 6 | while (<>) { 7 | chomp; 8 | m!^interval_mean\t([\+\-\d.]+)$! and $interval_mean = $1; 9 | m!^interval_stdev\t([\+\-\d.]+)$! and $interval_stdev = $1; 10 | } 11 | print "INS_mean=$interval_mean\n"; 12 | print "INS_stdev=$interval_stdev\n"; 13 | print "INS_cutoff=" . ($interval_mean+3*$interval_stdev) . "\n"; 14 | 15 | -------------------------------------------------------------------------------- /temp/forRonan/filter_sam.pl: -------------------------------------------------------------------------------- 1 | #!/usr/bin/perl -w 2 | # 2015-07-14 For ronan's internship project. 3 | # Filter parent_Rd to parent_Asm alignments 4 | # 1. Only unique alignments kept. 5 | # 2. Only 100% match alignments kept. 6 | # 3. Only mapping quality >= 1 kept. 
7 | # After filtering, only alignments exactly the same as the reference are kept, which can be used to call same region between Rd_reads and Asm_reference. 8 | use strict; 9 | use warnings; 10 | use LogInforSunhh; 11 | use SeqAlnSunhh; 12 | 13 | -t and !@ARGV and die "perl $0 in_rd2Asm.sam\n"; 14 | 15 | while (<>) { 16 | m!^\@! and do { print; next; }; 17 | m!\t(?:XT:A:U|NM:i:0)(?:\t|$)! or next; 18 | my @ta = split(/\t/, $_); 19 | $ta[4] >= 1 or next; 20 | print ; 21 | } 22 | 23 | 24 | 25 | -------------------------------------------------------------------------------- /temp/reformat_tabHit.pl: -------------------------------------------------------------------------------- 1 | #!/usr/bin/perl 2 | use strict; 3 | use warnings; 4 | 5 | while (<>) { 6 | chomp; 7 | my @ta = split(/\t/, $_); 8 | $ta[2] =~ s!^(\S+) !! or die "$_\n"; 9 | $ta[1] = $1; 10 | print STDOUT join("\t", @ta)."\n"; 11 | } 12 | -------------------------------------------------------------------------------- /temp/replace_unicode.pl: -------------------------------------------------------------------------------- 1 | #!/usr/bin/perl 2 | # [5/10/2022] Replace unicode characters to space. 
At first I need to provide a list of good characters by "perl -e 'while (<>) { split(//, $_);}'" 3 | use strict; 4 | use warnings; 5 | 6 | !@ARGV and die "perl $0 good_char.list input.file > input.replaced\n"; 7 | 8 | my $f1 = shift; 9 | open F1,'<',"$f1" or die; 10 | my %h; 11 | while () { 12 | chomp; 13 | $_ = "AA${_}AA"; 14 | my @ta=split(/\t/, $_); 15 | $ta[0] =~ s!^AA!!; 16 | $ta[-1] =~ s!AA$!!; 17 | $h{$ta[0]} = $ta[0]; 18 | } 19 | close F1; 20 | $h{"\t"} = "\t"; 21 | 22 | while (<>) { 23 | chomp; 24 | my @ta=split(//, $_); 25 | for my $tb (@ta) { 26 | defined $h{$tb} or $tb = " "; 27 | } 28 | print join("", @ta)."\n"; 29 | } 30 | 31 | -------------------------------------------------------------------------------- /temp/rm_gff_byLis.pl: -------------------------------------------------------------------------------- 1 | #!/usr/bin/perl 2 | use strict; 3 | use warnings; 4 | 5 | !@ARGV and die "perl $0 rm_list in_gff\n"; 6 | 7 | my $lisF =shift; 8 | my $gffF = shift; 9 | open LF,'<',"$lisF" or die; 10 | my %rmid; 11 | while () { 12 | chomp; 13 | m/^\s*(#|$)/ and next; 14 | my @ta = split(/\t/, $_); 15 | $rmid{$ta[0]} = 1; 16 | } 17 | close LF; 18 | open GF,'<',"$gffF" or die; 19 | while () { 20 | chomp; 21 | if ( m/^\s*(#|$)/ ) { 22 | print "$_\n"; 23 | next; 24 | } 25 | my @ta = split(/\t/, $_); 26 | if ($ta[8] =~ m/(?:^|;|\s)ID=([^\s;]+)/i) { 27 | defined $rmid{$1} and next; 28 | } 29 | if ($ta[8] =~ m/(?:^|;|\s)Parent=([^\s;]+)/i) { 30 | defined $rmid{$1} and next; 31 | } 32 | print "$_\n"; 33 | } 34 | close GF; 35 | -------------------------------------------------------------------------------- /temp/scripts/cnvt_pdf_to_tiff.pl: -------------------------------------------------------------------------------- 1 | #!/usr/bin/perl 2 | use strict; 3 | use warnings; 4 | use LogInforSunhh; 5 | use Getopt::Long; 6 | my %opts; 7 | GetOptions(\%opts, 8 | "out_format:s", # tiff. 9 | "out_dpi:i", # 300; 10 | "help!" 
11 | ); 12 | 13 | $opts{'out_format'} //= 'tiff'; 14 | $opts{'out_dpi'} //= 300; 15 | 16 | my $help_txt = <) { 12 | chomp; 13 | m/^\s*(#|$)/ and next; 14 | my @ta = split(/\t/, $_); 15 | $rmid{$ta[0]} = 1; 16 | } 17 | close LF; 18 | open GF,'<',"$gffF" or die; 19 | while () { 20 | chomp; 21 | if ( m/^\s*(#|$)/ ) { 22 | print "$_\n"; 23 | next; 24 | } 25 | my @ta = split(/\t/, $_); 26 | my $is_o = 0; 27 | if ($ta[8] =~ m/(?:^|;|\s)ID=([^\s;]+)/i) { 28 | defined $rmid{$1} and $is_o = 1; 29 | } 30 | if ($ta[8] =~ m/(?:^|;|\s)Parent=([^\s;]+)/i) { 31 | defined $rmid{$1} and $is_o = 1; 32 | } 33 | $is_o == 1 and print "$_\n"; 34 | } 35 | close GF; 36 | -------------------------------------------------------------------------------- /temp/temp_fix_gff3/Grif_1614.fix.gff3.gz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Sunhh/NGS_data_processing/7c202e9c150708c32c2d2a852bb7925adceb70f7/temp/temp_fix_gff3/Grif_1614.fix.gff3.gz -------------------------------------------------------------------------------- /temp/temp_fix_gff3/Grif_1614.gff3.gz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Sunhh/NGS_data_processing/7c202e9c150708c32c2d2a852bb7925adceb70f7/temp/temp_fix_gff3/Grif_1614.gff3.gz -------------------------------------------------------------------------------- /temp/temp_process_ONT/cdna_classifier_report.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Sunhh/NGS_data_processing/7c202e9c150708c32c2d2a852bb7925adceb70f7/temp/temp_process_ONT/cdna_classifier_report.pdf -------------------------------------------------------------------------------- /temp/temp_process_ONT/cmd_list: -------------------------------------------------------------------------------- 1 | # https://github.com/ksahlin/isONcorrect#Using-conda 2 | conda create -n isoncorrect python=3 
pip 3 | conda activate isoncorrect 4 | pip install isONcorrect 5 | conda install -c bioconda spoa 6 | # isONcorrect --help 7 | pip install isONclust 8 | conda install -c bioconda "hmmer>=3.0" 9 | conda install -c bioconda "pychopper>=2.0" 10 | 11 | # 12 | nohup bash correction_pipeline.sh /data/Sunhh/temp/ont/reads/ERR3588903_1.fastq /data/Sunhh/temp/ont/out_SIRV/ 60 > scrn.SIRV_test 13 | 14 | nohup bash correction_pipeline.sh /data/wushan/Cnr_RNA_ONT/01.err_correction/01.raw_reads/cnr_breaker.fastq /data/Sunhh/temp/ont/o_cnr_breaker/ 60 > scrn.o_cnr_breaker 15 | 16 | 17 | 18 | ## 19 | run_isoncorrect --t 40 --fastq_folder ./01.raw/ --outfolder ./02.corrected/ --split_wrt_batches 20 | 21 | 22 | -------------------------------------------------------------------------------- /temp/temp_process_ONT/raw_rdN.tbl: -------------------------------------------------------------------------------- 1 | InFile Total_size Total_Rd_num Mean_Rd_size Range_Rd_size PhredCut Time 2 | reads/ERR3588903_1.fastq.gz 1418512089 1680000 844.352433928571 83-5919 Phred33 Mon Dec 19 11:47:05 2022 3 | -------------------------------------------------------------------------------- /temp/temp_process_ONT/test_SIRV/cdna_classifier_report.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Sunhh/NGS_data_processing/7c202e9c150708c32c2d2a852bb7925adceb70f7/temp/temp_process_ONT/test_SIRV/cdna_classifier_report.pdf -------------------------------------------------------------------------------- /temp/temp_process_ONT/test_SIRV/cmd_list: -------------------------------------------------------------------------------- 1 | # https://github.com/ksahlin/isONcorrect#Using-conda 2 | conda create -n isoncorrect python=3 pip 3 | conda activate isoncorrect 4 | pip install isONcorrect 5 | conda install -c bioconda spoa 6 | # isONcorrect --help 7 | pip install isONclust 8 | conda install -c bioconda "hmmer>=3.0" 9 | conda install -c bioconda 
"pychopper>=2.0" 10 | 11 | 12 | # wget https://raw.githubusercontent.com/ksahlin/isONcorrect/master/test_data/isoncorrect/0.fastq 13 | # wget https://raw.githubusercontent.com/ksahlin/isONcorrect/master/scripts/correction_pipeline.sh 14 | 15 | # 16 | nohup bash correction_pipeline.sh /data/Sunhh/temp/ont/reads/ERR3588903_1.fastq /data/Sunhh/temp/ont/out_SIRV/ 60 > scrn.SIRV_test 17 | 18 | nohup bash correction_pipeline.sh /data/wushan/Cnr_RNA_ONT/01.err_correction/01.raw_reads/cnr_breaker.fastq /data/Sunhh/temp/ont/o_cnr_breaker/ 60 > scrn.o_cnr_breaker 19 | 20 | 21 | 22 | ## 23 | run_isoncorrect --t 40 --fastq_folder ./01.raw/ --outfolder ./02.corrected/ --split_wrt_batches 24 | 25 | 26 | -------------------------------------------------------------------------------- /temp/temp_process_ONT/test_SIRV/scrn.SIRV_test.gz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Sunhh/NGS_data_processing/7c202e9c150708c32c2d2a852bb7925adceb70f7/temp/temp_process_ONT/test_SIRV/scrn.SIRV_test.gz -------------------------------------------------------------------------------- /temp/temp_process_ONT/test_SIRV/scrn.corr: -------------------------------------------------------------------------------- 1 | Usage: correction_pipeline.sh 2 | -------------------------------------------------------------------------------- /temp/temp_process_ONT/test_small/cmd_list: -------------------------------------------------------------------------------- 1 | isONcorrect --fastq 0.fastq --outfolder out 2 | --------------------------------------------------------------------------------