├── test_public ├── check_uTR │ ├── test.sh │ ├── acc │ ├── main.o │ ├── Makefile │ └── main.c ├── gendata │ ├── gen │ ├── main.o │ ├── Makefile │ ├── main.c │ └── MT.h ├── checkTRF │ ├── main.o │ ├── checkTRF │ ├── Makefile │ └── main.c ├── tmp │ ├── accuracy_time.xlsx │ ├── time_RM.txt │ ├── time_TRF.txt │ ├── accuracy_TRF_allowance0.txt │ ├── accuracy_RM_allowance0.txt │ ├── accuracy_TRF_allowance0.01.txt │ ├── accuracy_TRF_allowance0.02.txt │ ├── accuracy_TRF_allowance0.03.txt │ ├── accuracy_RM_allowance0.01.txt │ ├── accuracy_RM_allowance0.02.txt │ ├── accuracy_RM_allowance0.03.txt │ ├── accuracy_uTR.txt │ ├── accuracy_uTR_allowance0.txt │ ├── accuracy_uTR_allowance0.01.txt │ ├── accuracy_uTR_allowance0.02.txt │ ├── accuracy_uTR_allowance0.03.txt │ └── time_uTR.txt ├── check_RepeatMasker │ ├── checkRM │ ├── main.o │ ├── Makefile │ └── main.c ├── parse_RepeatMasker │ ├── main.o │ ├── parseRM │ ├── Makefile │ └── main.c ├── make.sh ├── test.sh ├── test_allowance.sh ├── README.md └── test_gendata_decompose.sh ├── nsop_test ├── gendata │ ├── gen │ ├── main.o │ ├── Makefile │ ├── main.c │ └── MT.h ├── nsop_compression_ratio.xlsx ├── README.md ├── uTR │ ├── Makefile │ ├── main.c │ ├── uTR.h │ ├── MT.h │ ├── SAIS.c │ ├── coverage_by_units.c │ └── handle_one_file.c ├── test.sh └── nsop_compression.csv ├── realdata ├── test.sh ├── README.md ├── realdata.fasta └── realdata_result.fasta ├── Makefile ├── LICENSE.txt ├── smooth.c ├── Kawahara_nsop_Z.cpp ├── wrap_around_DP.c ├── MT.h ├── SAIS.c ├── handle_one_file.c ├── uTR.h ├── README.md └── coverage_by_long_units_nsop_Z.cpp /test_public/check_uTR/test.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | ./acc sample.fasta 4 | -------------------------------------------------------------------------------- /nsop_test/gendata/gen: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/morisUtokyo/uTR/HEAD/nsop_test/gendata/gen -------------------------------------------------------------------------------- /test_public/gendata/gen: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/morisUtokyo/uTR/HEAD/test_public/gendata/gen -------------------------------------------------------------------------------- /nsop_test/gendata/main.o: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/morisUtokyo/uTR/HEAD/nsop_test/gendata/main.o -------------------------------------------------------------------------------- /test_public/check_uTR/acc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/morisUtokyo/uTR/HEAD/test_public/check_uTR/acc -------------------------------------------------------------------------------- /test_public/gendata/main.o: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/morisUtokyo/uTR/HEAD/test_public/gendata/main.o -------------------------------------------------------------------------------- /test_public/checkTRF/main.o: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/morisUtokyo/uTR/HEAD/test_public/checkTRF/main.o -------------------------------------------------------------------------------- /test_public/check_uTR/main.o: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/morisUtokyo/uTR/HEAD/test_public/check_uTR/main.o -------------------------------------------------------------------------------- /realdata/test.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | ../uTR -f realdata.fasta -o realdata_result.fasta -sda 4 | 5 | exit 0 6 | 
-------------------------------------------------------------------------------- /test_public/checkTRF/checkTRF: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/morisUtokyo/uTR/HEAD/test_public/checkTRF/checkTRF -------------------------------------------------------------------------------- /test_public/tmp/accuracy_time.xlsx: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/morisUtokyo/uTR/HEAD/test_public/tmp/accuracy_time.xlsx -------------------------------------------------------------------------------- /nsop_test/nsop_compression_ratio.xlsx: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/morisUtokyo/uTR/HEAD/nsop_test/nsop_compression_ratio.xlsx -------------------------------------------------------------------------------- /test_public/check_RepeatMasker/checkRM: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/morisUtokyo/uTR/HEAD/test_public/check_RepeatMasker/checkRM -------------------------------------------------------------------------------- /test_public/check_RepeatMasker/main.o: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/morisUtokyo/uTR/HEAD/test_public/check_RepeatMasker/main.o -------------------------------------------------------------------------------- /test_public/parse_RepeatMasker/main.o: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/morisUtokyo/uTR/HEAD/test_public/parse_RepeatMasker/main.o -------------------------------------------------------------------------------- /test_public/parse_RepeatMasker/parseRM: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/morisUtokyo/uTR/HEAD/test_public/parse_RepeatMasker/parseRM -------------------------------------------------------------------------------- /test_public/make.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | cd check_uTR; make clean; make; 4 | cd ../gendata; make clean; make; 5 | cd ../parse_RepeatMasker; make clean; make; 6 | cd ../check_RepeatMasker; make clean; make; 7 | cd ../checkTRF; make clean; make; 8 | cd .. 9 | 10 | exit 0 11 | -------------------------------------------------------------------------------- /nsop_test/README.md: -------------------------------------------------------------------------------- 1 | ## Introduction 2 | 3 | To calculate the compression ratios of typical fasta files with 1000 mosaic tandem repeats, issue: 4 | 5 | bash test.sh 6 | 7 | It generates two executable modules in the directories named uTR and gendata. Afterwards, it starts computing the compression ratios of fasta files. 
8 | 9 | 10 | -------------------------------------------------------------------------------- /nsop_test/gendata/Makefile: -------------------------------------------------------------------------------- 1 | PROGRAM = gen 2 | OBJS = main.o 3 | CC = gcc 4 | CPP = g++ 5 | CFLAGS = -std=c99 -fPIC -fcommon 6 | 7 | .cpp.o: 8 | $(CPP) -c $< 9 | .c.o: 10 | $(CC) $(CFLAGS) -c $< 11 | 12 | # g++ must be used to link libraries required 13 | $(PROGRAM): $(OBJS) 14 | $(CPP) $(OBJS) -o $(PROGRAM) 15 | clean: 16 | rm $(PROGRAM) $(OBJS) 17 | -------------------------------------------------------------------------------- /test_public/gendata/Makefile: -------------------------------------------------------------------------------- 1 | PROGRAM = gen 2 | OBJS = main.o 3 | CC = gcc 4 | CPP = g++ 5 | CFLAGS = -std=c99 -fPIC -fcommon 6 | 7 | .cpp.o: 8 | $(CPP) -c $< 9 | .c.o: 10 | $(CC) $(CFLAGS) -c $< 11 | 12 | # g++ must be used to link libraries required 13 | $(PROGRAM): $(OBJS) 14 | $(CPP) $(OBJS) -o $(PROGRAM) 15 | clean: 16 | rm $(PROGRAM) $(OBJS) 17 | -------------------------------------------------------------------------------- /test_public/check_uTR/Makefile: -------------------------------------------------------------------------------- 1 | PROGRAM = acc 2 | OBJS = main.o 3 | CC = gcc 4 | CPP = g++ 5 | CFLAGS = -std=c99 -fPIC -fcommon 6 | 7 | .cpp.o: 8 | $(CPP) -c $< 9 | .c.o: 10 | $(CC) $(CFLAGS) -c $< 11 | 12 | # g++ must be used to link libraries required 13 | $(PROGRAM): $(OBJS) 14 | $(CPP) $(OBJS) -o $(PROGRAM) 15 | clean: 16 | rm $(PROGRAM) $(OBJS) 17 | -------------------------------------------------------------------------------- /test_public/checkTRF/Makefile: -------------------------------------------------------------------------------- 1 | PROGRAM = checkTRF 2 | OBJS = main.o 3 | CC = gcc 4 | CPP = g++ 5 | CFLAGS = -std=c99 -fPIC -fcommon 6 | 7 | .cpp.o: 8 | $(CPP) -c $< 9 | .c.o: 10 | $(CC) $(CFLAGS) -c $< 11 | 12 | # g++ must be used to link libraries 
required 13 | $(PROGRAM): $(OBJS) 14 | $(CPP) $(OBJS) -o $(PROGRAM) 15 | clean: 16 | rm $(PROGRAM) $(OBJS) 17 | -------------------------------------------------------------------------------- /test_public/check_RepeatMasker/Makefile: -------------------------------------------------------------------------------- 1 | PROGRAM = checkRM 2 | OBJS = main.o 3 | CC = gcc 4 | CPP = g++ 5 | CFLAGS = -std=c99 -fPIC -fcommon 6 | 7 | .cpp.o: 8 | $(CPP) -c $< 9 | .c.o: 10 | $(CC) $(CFLAGS) -c $< 11 | 12 | # g++ must be used to link libraries required 13 | $(PROGRAM): $(OBJS) 14 | $(CPP) $(OBJS) -o $(PROGRAM) 15 | clean: 16 | rm $(PROGRAM) $(OBJS) 17 | -------------------------------------------------------------------------------- /test_public/parse_RepeatMasker/Makefile: -------------------------------------------------------------------------------- 1 | PROGRAM = parseRM 2 | OBJS = main.o 3 | CC = gcc 4 | CPP = g++ 5 | CFLAGS = -std=c99 -fPIC -fcommon 6 | 7 | .cpp.o: 8 | $(CPP) -c $< 9 | .c.o: 10 | $(CC) $(CFLAGS) -c $< 11 | 12 | # g++ must be used to link libraries required 13 | $(PROGRAM): $(OBJS) 14 | $(CPP) $(OBJS) -o $(PROGRAM) 15 | clean: 16 | rm $(PROGRAM) $(OBJS) 17 | -------------------------------------------------------------------------------- /test_public/test.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # generate benchmark data and decompose them into mosaic tandem repeats 4 | bash test_gendata_decompose.sh 5 | 6 | # Compute the accuracy of predicting mosaic tandem repeats, say U_i V_j W_k with allowing the values of i, j, and k can differ from the true values at most 1%, 2%, and 3%. 
7 | bash test_allowance.sh 8 | 9 | exit 0 10 | -------------------------------------------------------------------------------- /nsop_test/uTR/Makefile: -------------------------------------------------------------------------------- 1 | PROGRAM = uTR 2 | OBJS = main.o handle_one_file.o SAIS.o coverage_by_units.o units.o coverage_by_long_units_nsop_Z.o 3 | CC = gcc 4 | CPP = g++ 5 | CFLAGS = -std=c99 -fPIC -fcommon 6 | 7 | .cpp.o: 8 | $(CPP) -c $< 9 | .c.o: 10 | $(CC) $(CFLAGS) -c $< 11 | 12 | # g++ must be used to link libraries required 13 | $(PROGRAM): $(OBJS) 14 | $(CPP) $(OBJS) -o $(PROGRAM) 15 | clean: 16 | rm $(PROGRAM) $(OBJS) 17 | -------------------------------------------------------------------------------- /Makefile: -------------------------------------------------------------------------------- 1 | PROGRAM = uTR 2 | OBJS = main.o handle_one_file.o SAIS.o coverage_by_units.o units.o coverage_by_long_units_nsop_Z.o string_decomposer.o smooth.o wrap_around_DP.o 3 | CC = gcc 4 | CPP = g++ 5 | CFLAGS = -std=c99 -fPIC -fcommon 6 | 7 | .cpp.o: 8 | $(CPP) -c $< 9 | .c.o: 10 | $(CC) $(CFLAGS) -c $< 11 | 12 | # g++ must be used to link libraries required 13 | $(PROGRAM): $(OBJS) 14 | $(CPP) $(OBJS) -o $(PROGRAM) 15 | clean: 16 | rm $(PROGRAM) $(OBJS) 17 | -------------------------------------------------------------------------------- /LICENSE.txt: -------------------------------------------------------------------------------- 1 | GNU General Public License version 3 2 | Copyright (C) 2021- Shinichi Morishita, University of Tokyo 3 | 4 | This program is free software: you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software Foundation, either version 3 of the License, or (at your option) any later version. 5 | 6 | This program is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. 
See the GNU General Public License for more details. 7 | 8 | You should have received a copy of the GNU General Public License along with this program. If not, see https://www.gnu.org/licenses/. 9 | -------------------------------------------------------------------------------- /realdata/README.md: -------------------------------------------------------------------------------- 1 | ## Real data 2 | 3 | The fasta file includes the following DNA sequences: 4 | 5 | - SAND12(control,BAFME) hg38_dna range=chr8:118366813-118366928 in the 4th intron of SAMD12, pattern=(AAAAT)23 6 | 7 | - SAND12(case,BAFME) Tandem repeat in the 4th intron of SAMD12 found in Patient II-1 in family F6115 (Supplementary Figure 6, Ishiura, H. et al. Expansions of intronic TTTCA and TTTTA repeats in benign adult familial myoclonic epilepsy. Nat Genet 50, 581–590 (2018)) pattern=(ATTTT)221(ATTTC)221(ATTTT)82 8 | 9 | - SAND12(case,BAFME) Tandem repeat in the 4th intron of SAMD12 found in (Supplementary Figure 6, Ishiura, H. et al. Expansions of intronic TTTCA and TTTTA repeats in benign adult familial myoclonic epilepsy. Nat Genet 50, 581–590 (2018)) pattern=(ATTTT)613(ATTTC)320(ATTTT)5(ATTTC)130 10 | 11 | - RFC1(control,CANVAS) hg38_dna range=chr4:39348425-39348483 pattern=(AAAAG)11 12 | 13 | - KAZN(control) hg38_dna range=chr1:14883297-14883426 pattern=(AAAG)6(AG)11(AAAG)20 14 | 15 | - ZNF37A(control) hg38_dna range=chr10:38112731-38112826 pattern=(CTTTT)12(CTTGT)3(CTTTT)2 16 | 17 | To apply uTR, use 18 | 19 | bash test.sh 20 | -------------------------------------------------------------------------------- /nsop_test/test.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | echo "----------------------------" 4 | echo "Compile gendata" 5 | cd gendata 6 | make clean; make 7 | 8 | echo "----------------------------" 9 | echo "Compile uTR" 10 | cd ../uTR 11 | make clean; make 12 | cd .. 
13 | 14 | echo "----------------------------" 15 | echo "Calculate compression ratio" 16 | gen_units=gendata/gen 17 | uTR=uTR/uTR 18 | 19 | num_TRs=1000 20 | min_unit_occ=10 21 | max_unit_occ=200 22 | 23 | result="nsop_compression.csv" 24 | rm $result 25 | 26 | 27 | listError=(0.0 0.01 0.03 0.05 0.1 0.15) 28 | listPattern=("AC_AG" "ACC_GTT" "AAG_AG" "AAG_AGG" "AAAG_AG" "AAAG_AG_AAAG" "AAAG_AG_AGGG_AG_AAAG" "AGGGG_AAAAGAAAGAGAGGG_AGGGG") 29 | 30 | for error_ratio in ${listError[@]} 31 | do 32 | 33 | for units_name in ${listPattern[@]} 34 | do 35 | units=${units_name//\_/ } 36 | run_name=$units_name"_"$min_unit_occ"_"$max_unit_occ"_"$error_ratio 37 | 38 | # Generate data of tandem repeats for the given pattern 39 | TR_file=$run_name".fasta" 40 | res="${units_name//[^_]}" 41 | numUnits=$(( ${#res}+1 )) 42 | 43 | $gen_units -k $min_unit_occ -l $max_unit_occ -n $num_TRs -e $error_ratio -m $numUnits $units > $TR_file 44 | 45 | echo -n $run_name >> $result 46 | $uTR -f $TR_file >> $result 47 | rm $TR_file 48 | 49 | done 50 | done 51 | 52 | exit 0 53 | -------------------------------------------------------------------------------- /test_public/tmp/time_RM.txt: -------------------------------------------------------------------------------- 1 | AC_AG_10_200_0.0 23 2 | ACC_GTT_10_200_0.0 19 3 | AAG_AG_10_200_0.0 19 4 | AAG_AGG_10_200_0.0 19 5 | AAAG_AG_10_200_0.0 21 6 | AAAG_AG_AAAG_10_200_0.0 31 7 | AAAG_AG_AGGG_AG_AAAG_10_200_0.0 89 8 | AGGGG_AAAAGAAAGAGAGGG_AGGGG_10_200_0.0 79 9 | AC_AG_10_200_0.01 24 10 | ACC_GTT_10_200_0.01 25 11 | AAG_AG_10_200_0.01 25 12 | AAG_AGG_10_200_0.01 25 13 | AAAG_AG_10_200_0.01 28 14 | AAAG_AG_AAAG_10_200_0.01 53 15 | AAAG_AG_AGGG_AG_AAAG_10_200_0.01 106 16 | AGGGG_AAAAGAAAGAGAGGG_AGGGG_10_200_0.01 159 17 | AC_AG_10_200_0.03 25 18 | ACC_GTT_10_200_0.03 26 19 | AAG_AG_10_200_0.03 28 20 | AAG_AGG_10_200_0.03 31 21 | AAAG_AG_10_200_0.03 32 22 | AAAG_AG_AAAG_10_200_0.03 66 23 | AAAG_AG_AGGG_AG_AAAG_10_200_0.03 122 24 | 
AGGGG_AAAAGAAAGAGAGGG_AGGGG_10_200_0.03 190 25 | AC_AG_10_200_0.05 26 26 | ACC_GTT_10_200_0.05 31 27 | AAG_AG_10_200_0.05 37 28 | AAG_AGG_10_200_0.05 37 29 | AAAG_AG_10_200_0.05 40 30 | AAAG_AG_AAAG_10_200_0.05 74 31 | AAAG_AG_AGGG_AG_AAAG_10_200_0.05 132 32 | AGGGG_AAAAGAAAGAGAGGG_AGGGG_10_200_0.05 215 33 | AC_AG_10_200_0.1 30 34 | ACC_GTT_10_200_0.1 25 35 | AAG_AG_10_200_0.1 51 36 | AAG_AGG_10_200_0.1 60 37 | AAAG_AG_10_200_0.1 60 38 | AAAG_AG_AAAG_10_200_0.1 104 39 | AAAG_AG_AGGG_AG_AAAG_10_200_0.1 186 40 | AGGGG_AAAAGAAAGAGAGGG_AGGGG_10_200_0.1 305 41 | AC_AG_10_200_0.15 29 42 | ACC_GTT_10_200_0.15 24 43 | AAG_AG_10_200_0.15 44 44 | AAG_AGG_10_200_0.15 56 45 | AAAG_AG_10_200_0.15 57 46 | AAAG_AG_AAAG_10_200_0.15 112 47 | AAAG_AG_AGGG_AG_AAAG_10_200_0.15 178 48 | AGGGG_AAAAGAAAGAGAGGG_AGGGG_10_200_0.15 329 49 | -------------------------------------------------------------------------------- /test_public/tmp/time_TRF.txt: -------------------------------------------------------------------------------- 1 | AC_AG_10_200_0.0 14 2 | ACC_GTT_10_200_0.0 18 3 | AAG_AG_10_200_0.0 18 4 | AAG_AGG_10_200_0.0 21 5 | AAAG_AG_10_200_0.0 20 6 | AAAG_AG_AAAG_10_200_0.0 55 7 | AAAG_AG_AGGG_AG_AAAG_10_200_0.0 201 8 | AGGGG_AAAAGAAAGAGAGGG_AGGGG_10_200_0.0 147 9 | AC_AG_10_200_0.01 19 10 | ACC_GTT_10_200_0.01 34 11 | AAG_AG_10_200_0.01 32 12 | AAG_AGG_10_200_0.01 38 13 | AAAG_AG_10_200_0.01 41 14 | AAAG_AG_AAAG_10_200_0.01 124 15 | AAAG_AG_AGGG_AG_AAAG_10_200_0.01 197 16 | AGGGG_AAAAGAAAGAGAGGG_AGGGG_10_200_0.01 313 17 | AC_AG_10_200_0.03 23 18 | ACC_GTT_10_200_0.03 45 19 | AAG_AG_10_200_0.03 42 20 | AAG_AGG_10_200_0.03 52 21 | AAAG_AG_10_200_0.03 54 22 | AAAG_AG_AAAG_10_200_0.03 146 23 | AAAG_AG_AGGG_AG_AAAG_10_200_0.03 222 24 | AGGGG_AAAAGAAAGAGAGGG_AGGGG_10_200_0.03 411 25 | AC_AG_10_200_0.05 25 26 | ACC_GTT_10_200_0.05 50 27 | AAG_AG_10_200_0.05 50 28 | AAG_AGG_10_200_0.05 56 29 | AAAG_AG_10_200_0.05 61 30 | AAAG_AG_AAAG_10_200_0.05 160 31 | AAAG_AG_AGGG_AG_AAAG_10_200_0.05 240 
32 | AGGGG_AAAAGAAAGAGAGGG_AGGGG_10_200_0.05 452 33 | AC_AG_10_200_0.1 24 34 | ACC_GTT_10_200_0.1 42 35 | AAG_AG_10_200_0.1 49 36 | AAG_AGG_10_200_0.1 54 37 | AAAG_AG_10_200_0.1 60 38 | AAAG_AG_AAAG_10_200_0.1 154 39 | AAAG_AG_AGGG_AG_AAAG_10_200_0.1 226 40 | AGGGG_AAAAGAAAGAGAGGG_AGGGG_10_200_0.1 516 41 | AC_AG_10_200_0.15 21 42 | ACC_GTT_10_200_0.15 38 43 | AAG_AG_10_200_0.15 43 44 | AAG_AGG_10_200_0.15 49 45 | AAAG_AG_10_200_0.15 55 46 | AAAG_AG_AAAG_10_200_0.15 133 47 | AAAG_AG_AGGG_AG_AAAG_10_200_0.15 203 48 | AGGGG_AAAAGAAAGAGAGGG_AGGGG_10_200_0.15 505 49 | -------------------------------------------------------------------------------- /test_public/tmp/accuracy_TRF_allowance0.txt: -------------------------------------------------------------------------------- 1 | AC_AG_10_200_0.0 1000 1000 2 | ACC_GTT_10_200_0.0 1000 1000 3 | AAG_AG_10_200_0.0 1000 1000 4 | AAG_AGG_10_200_0.0 1000 1000 5 | AAAG_AG_10_200_0.0 1000 1000 6 | AAAG_AG_AAAG_10_200_0.0 2 1000 7 | AAAG_AG_AGGG_AG_AAAG_10_200_0.0 0 1000 8 | AGGGG_AAAAGAAAGAGAGGG_AGGGG_10_200_0.0 0 1000 9 | AC_AG_10_200_0.01 679 1000 10 | ACC_GTT_10_200_0.01 880 1000 11 | AAG_AG_10_200_0.01 508 1000 12 | AAG_AGG_10_200_0.01 895 1000 13 | AAAG_AG_10_200_0.01 464 1000 14 | AAAG_AG_AAAG_10_200_0.01 1 1000 15 | AAAG_AG_AGGG_AG_AAAG_10_200_0.01 2 1000 16 | AGGGG_AAAAGAAAGAGAGGG_AGGGG_10_200_0.01 0 1000 17 | AC_AG_10_200_0.03 258 1000 18 | ACC_GTT_10_200_0.03 528 1000 19 | AAG_AG_10_200_0.03 178 1000 20 | AAG_AGG_10_200_0.03 549 1000 21 | AAAG_AG_10_200_0.03 125 1000 22 | AAAG_AG_AAAG_10_200_0.03 1 1000 23 | AAAG_AG_AGGG_AG_AAAG_10_200_0.03 0 1000 24 | AGGGG_AAAAGAAAGAGAGGG_AGGGG_10_200_0.03 0 1000 25 | AC_AG_10_200_0.05 111 1000 26 | ACC_GTT_10_200_0.05 208 1000 27 | AAG_AG_10_200_0.05 78 1000 28 | AAG_AGG_10_200_0.05 294 1000 29 | AAAG_AG_10_200_0.05 51 1000 30 | AAAG_AG_AAAG_10_200_0.05 0 1000 31 | AAAG_AG_AGGG_AG_AAAG_10_200_0.05 1 1000 32 | AGGGG_AAAAGAAAGAGAGGG_AGGGG_10_200_0.05 0 1000 33 | AC_AG_10_200_0.1 20 1000 
34 | ACC_GTT_10_200_0.1 30 1000 35 | AAG_AG_10_200_0.1 4 1000 36 | AAG_AGG_10_200_0.1 41 1000 37 | AAAG_AG_10_200_0.1 7 1000 38 | AAAG_AG_AAAG_10_200_0.1 1 1000 39 | AAAG_AG_AGGG_AG_AAAG_10_200_0.1 0 1000 40 | AGGGG_AAAAGAAAGAGAGGG_AGGGG_10_200_0.1 0 1000 41 | AC_AG_10_200_0.15 4 1000 42 | ACC_GTT_10_200_0.15 0 1000 43 | AAG_AG_10_200_0.15 1 1000 44 | AAG_AGG_10_200_0.15 1 1000 45 | AAAG_AG_10_200_0.15 1 1000 46 | AAAG_AG_AAAG_10_200_0.15 0 1000 47 | AAAG_AG_AGGG_AG_AAAG_10_200_0.15 0 1000 48 | AGGGG_AAAAGAAAGAGAGGG_AGGGG_10_200_0.15 0 1000 49 | -------------------------------------------------------------------------------- /test_public/tmp/accuracy_RM_allowance0.txt: -------------------------------------------------------------------------------- 1 | AC_AG_10_200_0.0 1000/1000 2 | ACC_GTT_10_200_0.0 1000/1000 3 | AAG_AG_10_200_0.0 1000/1000 4 | AAG_AGG_10_200_0.0 1000/1000 5 | AAAG_AG_10_200_0.0 1000/1000 6 | AAAG_AG_AAAG_10_200_0.0 47/1000 7 | AAAG_AG_AGGG_AG_AAAG_10_200_0.0 0/1000 8 | AGGGG_AAAAGAAAGAGAGGG_AGGGG_10_200_0.0 0/916 9 | AC_AG_10_200_0.01 870/1000 10 | ACC_GTT_10_200_0.01 925/1000 11 | AAG_AG_10_200_0.01 816/1000 12 | AAG_AGG_10_200_0.01 940/1000 13 | AAAG_AG_10_200_0.01 863/1000 14 | AAAG_AG_AAAG_10_200_0.01 28/1000 15 | AAAG_AG_AGGG_AG_AAAG_10_200_0.01 24/1000 16 | AGGGG_AAAAGAAAGAGAGGG_AGGGG_10_200_0.01 0/962 17 | AC_AG_10_200_0.03 596/1000 18 | ACC_GTT_10_200_0.03 706/1000 19 | AAG_AG_10_200_0.03 535/999 20 | AAG_AGG_10_200_0.03 705/1000 21 | AAAG_AG_10_200_0.03 634/1000 22 | AAAG_AG_AAAG_10_200_0.03 34/1000 23 | AAAG_AG_AGGG_AG_AAAG_10_200_0.03 22/1000 24 | AGGGG_AAAAGAAAGAGAGGG_AGGGG_10_200_0.03 0/1000 25 | AC_AG_10_200_0.05 455/1000 26 | ACC_GTT_10_200_0.05 504/1000 27 | AAG_AG_10_200_0.05 296/996 28 | AAG_AGG_10_200_0.05 401/1000 29 | AAAG_AG_10_200_0.05 375/1000 30 | AAAG_AG_AAAG_10_200_0.05 22/1000 31 | AAAG_AG_AGGG_AG_AAAG_10_200_0.05 6/1000 32 | AGGGG_AAAAGAAAGAGAGGG_AGGGG_10_200_0.05 0/989 33 | AC_AG_10_200_0.1 283/1000 34 | 
ACC_GTT_10_200_0.1 284/1000 35 | AAG_AG_10_200_0.1 10/998 36 | AAG_AGG_10_200_0.1 10/1000 37 | AAAG_AG_10_200_0.1 24/1000 38 | AAAG_AG_AAAG_10_200_0.1 1/1000 39 | AAAG_AG_AGGG_AG_AAAG_10_200_0.1 0/1000 40 | AGGGG_AAAAGAAAGAGAGGG_AGGGG_10_200_0.1 0/878 41 | AC_AG_10_200_0.15 123/1000 42 | ACC_GTT_10_200_0.15 158/1000 43 | AAG_AG_10_200_0.15 3/997 44 | AAG_AGG_10_200_0.15 1/1000 45 | AAAG_AG_10_200_0.15 2/1000 46 | AAAG_AG_AAAG_10_200_0.15 0/1000 47 | AAAG_AG_AGGG_AG_AAAG_10_200_0.15 0/1000 48 | AGGGG_AAAAGAAAGAGAGGG_AGGGG_10_200_0.15 0/931 49 | -------------------------------------------------------------------------------- /test_public/tmp/accuracy_TRF_allowance0.01.txt: -------------------------------------------------------------------------------- 1 | AC_AG_10_200_0.0 1000 1000 2 | ACC_GTT_10_200_0.0 1000 1000 3 | AAG_AG_10_200_0.0 1000 1000 4 | AAG_AGG_10_200_0.0 1000 1000 5 | AAAG_AG_10_200_0.0 1000 1000 6 | AAAG_AG_AAAG_10_200_0.0 2 1000 7 | AAAG_AG_AGGG_AG_AAAG_10_200_0.0 159 1000 8 | AGGGG_AAAAGAAAGAGAGGG_AGGGG_10_200_0.0 0 1000 9 | AC_AG_10_200_0.01 952 1000 10 | ACC_GTT_10_200_0.01 915 1000 11 | AAG_AG_10_200_0.01 831 1000 12 | AAG_AGG_10_200_0.01 909 1000 13 | AAAG_AG_10_200_0.01 748 1000 14 | AAAG_AG_AAAG_10_200_0.01 5 1000 15 | AAAG_AG_AGGG_AG_AAAG_10_200_0.01 63 1000 16 | AGGGG_AAAAGAAAGAGAGGG_AGGGG_10_200_0.01 0 1000 17 | AC_AG_10_200_0.03 589 1000 18 | ACC_GTT_10_200_0.03 601 1000 19 | AAG_AG_10_200_0.03 408 1000 20 | AAG_AGG_10_200_0.03 583 1000 21 | AAAG_AG_10_200_0.03 276 1000 22 | AAAG_AG_AAAG_10_200_0.03 3 1000 23 | AAAG_AG_AGGG_AG_AAAG_10_200_0.03 3 1000 24 | AGGGG_AAAAGAAAGAGAGGG_AGGGG_10_200_0.03 0 1000 25 | AC_AG_10_200_0.05 300 1000 26 | ACC_GTT_10_200_0.05 299 1000 27 | AAG_AG_10_200_0.05 173 1000 28 | AAG_AGG_10_200_0.05 345 1000 29 | AAAG_AG_10_200_0.05 110 1000 30 | AAAG_AG_AAAG_10_200_0.05 1 1000 31 | AAAG_AG_AGGG_AG_AAAG_10_200_0.05 3 1000 32 | AGGGG_AAAAGAAAGAGAGGG_AGGGG_10_200_0.05 0 1000 33 | AC_AG_10_200_0.1 62 1000 34 | 
ACC_GTT_10_200_0.1 56 1000 35 | AAG_AG_10_200_0.1 18 1000 36 | AAG_AGG_10_200_0.1 61 1000 37 | AAAG_AG_10_200_0.1 18 1000 38 | AAAG_AG_AAAG_10_200_0.1 1 1000 39 | AAAG_AG_AGGG_AG_AAAG_10_200_0.1 0 1000 40 | AGGGG_AAAAGAAAGAGAGGG_AGGGG_10_200_0.1 0 1000 41 | AC_AG_10_200_0.15 15 1000 42 | ACC_GTT_10_200_0.15 3 1000 43 | AAG_AG_10_200_0.15 5 1000 44 | AAG_AGG_10_200_0.15 4 1000 45 | AAAG_AG_10_200_0.15 2 1000 46 | AAAG_AG_AAAG_10_200_0.15 0 1000 47 | AAAG_AG_AGGG_AG_AAAG_10_200_0.15 0 1000 48 | AGGGG_AAAAGAAAGAGAGGG_AGGGG_10_200_0.15 0 1000 49 | -------------------------------------------------------------------------------- /test_public/tmp/accuracy_TRF_allowance0.02.txt: -------------------------------------------------------------------------------- 1 | AC_AG_10_200_0.0 1000 1000 2 | ACC_GTT_10_200_0.0 1000 1000 3 | AAG_AG_10_200_0.0 1000 1000 4 | AAG_AGG_10_200_0.0 1000 1000 5 | AAAG_AG_10_200_0.0 1000 1000 6 | AAAG_AG_AAAG_10_200_0.0 2 1000 7 | AAAG_AG_AGGG_AG_AAAG_10_200_0.0 159 1000 8 | AGGGG_AAAAGAAAGAGAGGG_AGGGG_10_200_0.0 0 1000 9 | AC_AG_10_200_0.01 973 1000 10 | ACC_GTT_10_200_0.01 916 1000 11 | AAG_AG_10_200_0.01 888 1000 12 | AAG_AGG_10_200_0.01 912 1000 13 | AAAG_AG_10_200_0.01 786 1000 14 | AAAG_AG_AAAG_10_200_0.01 5 1000 15 | AAAG_AG_AGGG_AG_AAAG_10_200_0.01 71 1000 16 | AGGGG_AAAAGAAAGAGAGGG_AGGGG_10_200_0.01 0 1000 17 | AC_AG_10_200_0.03 776 1000 18 | ACC_GTT_10_200_0.03 605 1000 19 | AAG_AG_10_200_0.03 543 1000 20 | AAG_AGG_10_200_0.03 590 1000 21 | AAAG_AG_10_200_0.03 357 1000 22 | AAAG_AG_AAAG_10_200_0.03 7 1000 23 | AAAG_AG_AGGG_AG_AAAG_10_200_0.03 4 1000 24 | AGGGG_AAAAGAAAGAGAGGG_AGGGG_10_200_0.03 0 1000 25 | AC_AG_10_200_0.05 459 1000 26 | ACC_GTT_10_200_0.05 314 1000 27 | AAG_AG_10_200_0.05 251 1000 28 | AAG_AGG_10_200_0.05 353 1000 29 | AAAG_AG_10_200_0.05 151 1000 30 | AAAG_AG_AAAG_10_200_0.05 2 1000 31 | AAAG_AG_AGGG_AG_AAAG_10_200_0.05 4 1000 32 | AGGGG_AAAAGAAAGAGAGGG_AGGGG_10_200_0.05 0 1000 33 | AC_AG_10_200_0.1 100 1000 34 | 
ACC_GTT_10_200_0.1 64 1000 35 | AAG_AG_10_200_0.1 25 1000 36 | AAG_AGG_10_200_0.1 64 1000 37 | AAAG_AG_10_200_0.1 22 1000 38 | AAAG_AG_AAAG_10_200_0.1 1 1000 39 | AAAG_AG_AGGG_AG_AAAG_10_200_0.1 0 1000 40 | AGGGG_AAAAGAAAGAGAGGG_AGGGG_10_200_0.1 0 1000 41 | AC_AG_10_200_0.15 19 1000 42 | ACC_GTT_10_200_0.15 4 1000 43 | AAG_AG_10_200_0.15 5 1000 44 | AAG_AGG_10_200_0.15 5 1000 45 | AAAG_AG_10_200_0.15 2 1000 46 | AAAG_AG_AAAG_10_200_0.15 0 1000 47 | AAAG_AG_AGGG_AG_AAAG_10_200_0.15 0 1000 48 | AGGGG_AAAAGAAAGAGAGGG_AGGGG_10_200_0.15 0 1000 49 | -------------------------------------------------------------------------------- /test_public/tmp/accuracy_TRF_allowance0.03.txt: -------------------------------------------------------------------------------- 1 | AC_AG_10_200_0.0 1000 1000 2 | ACC_GTT_10_200_0.0 1000 1000 3 | AAG_AG_10_200_0.0 1000 1000 4 | AAG_AGG_10_200_0.0 1000 1000 5 | AAAG_AG_10_200_0.0 1000 1000 6 | AAAG_AG_AAAG_10_200_0.0 2 1000 7 | AAAG_AG_AGGG_AG_AAAG_10_200_0.0 159 1000 8 | AGGGG_AAAAGAAAGAGAGGG_AGGGG_10_200_0.0 0 1000 9 | AC_AG_10_200_0.01 977 1000 10 | ACC_GTT_10_200_0.01 916 1000 11 | AAG_AG_10_200_0.01 902 1000 12 | AAG_AGG_10_200_0.01 912 1000 13 | AAAG_AG_10_200_0.01 795 1000 14 | AAAG_AG_AAAG_10_200_0.01 5 1000 15 | AAAG_AG_AGGG_AG_AAAG_10_200_0.01 76 1000 16 | AGGGG_AAAAGAAAGAGAGGG_AGGGG_10_200_0.01 0 1000 17 | AC_AG_10_200_0.03 844 1000 18 | ACC_GTT_10_200_0.03 605 1000 19 | AAG_AG_10_200_0.03 591 1000 20 | AAG_AGG_10_200_0.03 591 1000 21 | AAAG_AG_10_200_0.03 404 1000 22 | AAAG_AG_AAAG_10_200_0.03 8 1000 23 | AAAG_AG_AGGG_AG_AAAG_10_200_0.03 9 1000 24 | AGGGG_AAAAGAAAGAGAGGG_AGGGG_10_200_0.03 0 1000 25 | AC_AG_10_200_0.05 580 1000 26 | ACC_GTT_10_200_0.05 315 1000 27 | AAG_AG_10_200_0.05 294 1000 28 | AAG_AGG_10_200_0.05 356 1000 29 | AAAG_AG_10_200_0.05 195 1000 30 | AAAG_AG_AAAG_10_200_0.05 3 1000 31 | AAAG_AG_AGGG_AG_AAAG_10_200_0.05 6 1000 32 | AGGGG_AAAAGAAAGAGAGGG_AGGGG_10_200_0.05 0 1000 33 | AC_AG_10_200_0.1 144 1000 34 | 
ACC_GTT_10_200_0.1 68 1000 35 | AAG_AG_10_200_0.1 43 1000 36 | AAG_AGG_10_200_0.1 68 1000 37 | AAAG_AG_10_200_0.1 32 1000 38 | AAAG_AG_AAAG_10_200_0.1 1 1000 39 | AAAG_AG_AGGG_AG_AAAG_10_200_0.1 0 1000 40 | AGGGG_AAAAGAAAGAGAGGG_AGGGG_10_200_0.1 0 1000 41 | AC_AG_10_200_0.15 22 1000 42 | ACC_GTT_10_200_0.15 4 1000 43 | AAG_AG_10_200_0.15 6 1000 44 | AAG_AGG_10_200_0.15 5 1000 45 | AAAG_AG_10_200_0.15 2 1000 46 | AAAG_AG_AAAG_10_200_0.15 0 1000 47 | AAAG_AG_AGGG_AG_AAAG_10_200_0.15 0 1000 48 | AGGGG_AAAAGAAAGAGAGGG_AGGGG_10_200_0.15 0 1000 49 | -------------------------------------------------------------------------------- /test_public/tmp/accuracy_RM_allowance0.01.txt: -------------------------------------------------------------------------------- 1 | AC_AG_10_200_0.0 1000/1000 2 | ACC_GTT_10_200_0.0 1000/1000 3 | AAG_AG_10_200_0.0 1000/1000 4 | AAG_AGG_10_200_0.0 1000/1000 5 | AAAG_AG_10_200_0.0 1000/1000 6 | AAAG_AG_AAAG_10_200_0.0 48/1000 7 | AAAG_AG_AGGG_AG_AAAG_10_200_0.0 495/1000 8 | AGGGG_AAAAGAAAGAGAGGG_AGGGG_10_200_0.0 0/916 9 | AC_AG_10_200_0.01 992/1000 10 | ACC_GTT_10_200_0.01 999/1000 11 | AAG_AG_10_200_0.01 958/1000 12 | AAG_AGG_10_200_0.01 995/1000 13 | AAAG_AG_10_200_0.01 978/1000 14 | AAAG_AG_AAAG_10_200_0.01 46/1000 15 | AAAG_AG_AGGG_AG_AAAG_10_200_0.01 209/1000 16 | AGGGG_AAAAGAAAGAGAGGG_AGGGG_10_200_0.01 0/962 17 | AC_AG_10_200_0.03 947/1000 18 | ACC_GTT_10_200_0.03 967/1000 19 | AAG_AG_10_200_0.03 838/999 20 | AAG_AGG_10_200_0.03 965/1000 21 | AAAG_AG_10_200_0.03 909/1000 22 | AAAG_AG_AAAG_10_200_0.03 55/1000 23 | AAAG_AG_AGGG_AG_AAAG_10_200_0.03 158/1000 24 | AGGGG_AAAAGAAAGAGAGGG_AGGGG_10_200_0.03 0/1000 25 | AC_AG_10_200_0.05 872/1000 26 | ACC_GTT_10_200_0.05 904/1000 27 | AAG_AG_10_200_0.05 539/996 28 | AAG_AGG_10_200_0.05 700/1000 29 | AAAG_AG_10_200_0.05 685/1000 30 | AAAG_AG_AAAG_10_200_0.05 47/1000 31 | AAAG_AG_AGGG_AG_AAAG_10_200_0.05 89/1000 32 | AGGGG_AAAAGAAAGAGAGGG_AGGGG_10_200_0.05 0/989 33 | AC_AG_10_200_0.1 617/1000 34 | 
ACC_GTT_10_200_0.1 662/1000 35 | AAG_AG_10_200_0.1 15/998 36 | AAG_AGG_10_200_0.1 35/1000 37 | AAAG_AG_10_200_0.1 44/1000 38 | AAAG_AG_AAAG_10_200_0.1 1/1000 39 | AAAG_AG_AGGG_AG_AAAG_10_200_0.1 0/1000 40 | AGGGG_AAAAGAAAGAGAGGG_AGGGG_10_200_0.1 0/878 41 | AC_AG_10_200_0.15 365/1000 42 | ACC_GTT_10_200_0.15 464/1000 43 | AAG_AG_10_200_0.15 5/997 44 | AAG_AGG_10_200_0.15 5/1000 45 | AAAG_AG_10_200_0.15 3/1000 46 | AAAG_AG_AAAG_10_200_0.15 0/1000 47 | AAAG_AG_AGGG_AG_AAAG_10_200_0.15 0/1000 48 | AGGGG_AAAAGAAAGAGAGGG_AGGGG_10_200_0.15 0/931 49 | -------------------------------------------------------------------------------- /test_public/tmp/accuracy_RM_allowance0.02.txt: -------------------------------------------------------------------------------- 1 | AC_AG_10_200_0.0 1000/1000 2 | ACC_GTT_10_200_0.0 1000/1000 3 | AAG_AG_10_200_0.0 1000/1000 4 | AAG_AGG_10_200_0.0 1000/1000 5 | AAAG_AG_10_200_0.0 1000/1000 6 | AAAG_AG_AAAG_10_200_0.0 48/1000 7 | AAAG_AG_AGGG_AG_AAAG_10_200_0.0 495/1000 8 | AGGGG_AAAAGAAAGAGAGGG_AGGGG_10_200_0.0 0/916 9 | AC_AG_10_200_0.01 998/1000 10 | ACC_GTT_10_200_0.01 999/1000 11 | AAG_AG_10_200_0.01 963/1000 12 | AAG_AGG_10_200_0.01 1000/1000 13 | AAAG_AG_10_200_0.01 988/1000 14 | AAAG_AG_AAAG_10_200_0.01 47/1000 15 | AAAG_AG_AGGG_AG_AAAG_10_200_0.01 221/1000 16 | AGGGG_AAAAGAAAGAGAGGG_AGGGG_10_200_0.01 0/962 17 | AC_AG_10_200_0.03 979/1000 18 | ACC_GTT_10_200_0.03 991/1000 19 | AAG_AG_10_200_0.03 866/999 20 | AAG_AGG_10_200_0.03 982/1000 21 | AAAG_AG_10_200_0.03 937/1000 22 | AAAG_AG_AAAG_10_200_0.03 64/1000 23 | AAAG_AG_AGGG_AG_AAAG_10_200_0.03 180/1000 24 | AGGGG_AAAAGAAAGAGAGGG_AGGGG_10_200_0.03 0/1000 25 | AC_AG_10_200_0.05 936/1000 26 | ACC_GTT_10_200_0.05 962/1000 27 | AAG_AG_10_200_0.05 594/996 28 | AAG_AGG_10_200_0.05 738/1000 29 | AAAG_AG_10_200_0.05 736/1000 30 | AAAG_AG_AAAG_10_200_0.05 52/1000 31 | AAAG_AG_AGGG_AG_AAAG_10_200_0.05 115/1000 32 | AGGGG_AAAAGAAAGAGAGGG_AGGGG_10_200_0.05 0/989 33 | AC_AG_10_200_0.1 782/1000 34 | 
ACC_GTT_10_200_0.1 798/1000 35 | AAG_AG_10_200_0.1 17/998 36 | AAG_AGG_10_200_0.1 37/1000 37 | AAAG_AG_10_200_0.1 50/1000 38 | AAAG_AG_AAAG_10_200_0.1 1/1000 39 | AAAG_AG_AGGG_AG_AAAG_10_200_0.1 0/1000 40 | AGGGG_AAAAGAAAGAGAGGG_AGGGG_10_200_0.1 0/878 41 | AC_AG_10_200_0.15 495/1000 42 | ACC_GTT_10_200_0.15 618/1000 43 | AAG_AG_10_200_0.15 5/997 44 | AAG_AGG_10_200_0.15 5/1000 45 | AAAG_AG_10_200_0.15 3/1000 46 | AAAG_AG_AAAG_10_200_0.15 0/1000 47 | AAAG_AG_AGGG_AG_AAAG_10_200_0.15 0/1000 48 | AGGGG_AAAAGAAAGAGAGGG_AGGGG_10_200_0.15 0/931 49 | -------------------------------------------------------------------------------- /test_public/tmp/accuracy_RM_allowance0.03.txt: -------------------------------------------------------------------------------- 1 | AC_AG_10_200_0.0 1000/1000 2 | ACC_GTT_10_200_0.0 1000/1000 3 | AAG_AG_10_200_0.0 1000/1000 4 | AAG_AGG_10_200_0.0 1000/1000 5 | AAAG_AG_10_200_0.0 1000/1000 6 | AAAG_AG_AAAG_10_200_0.0 48/1000 7 | AAAG_AG_AGGG_AG_AAAG_10_200_0.0 495/1000 8 | AGGGG_AAAAGAAAGAGAGGG_AGGGG_10_200_0.0 0/916 9 | AC_AG_10_200_0.01 998/1000 10 | ACC_GTT_10_200_0.01 999/1000 11 | AAG_AG_10_200_0.01 967/1000 12 | AAG_AGG_10_200_0.01 1000/1000 13 | AAAG_AG_10_200_0.01 990/1000 14 | AAAG_AG_AAAG_10_200_0.01 47/1000 15 | AAAG_AG_AGGG_AG_AAAG_10_200_0.01 232/1000 16 | AGGGG_AAAAGAAAGAGAGGG_AGGGG_10_200_0.01 0/962 17 | AC_AG_10_200_0.03 991/1000 18 | ACC_GTT_10_200_0.03 992/1000 19 | AAG_AG_10_200_0.03 882/999 20 | AAG_AGG_10_200_0.03 987/1000 21 | AAAG_AG_10_200_0.03 946/1000 22 | AAAG_AG_AAAG_10_200_0.03 65/1000 23 | AAAG_AG_AGGG_AG_AAAG_10_200_0.03 206/1000 24 | AGGGG_AAAAGAAAGAGAGGG_AGGGG_10_200_0.03 0/1000 25 | AC_AG_10_200_0.05 965/1000 26 | ACC_GTT_10_200_0.05 973/1000 27 | AAG_AG_10_200_0.05 611/996 28 | AAG_AGG_10_200_0.05 748/1000 29 | AAAG_AG_10_200_0.05 753/1000 30 | AAAG_AG_AAAG_10_200_0.05 53/1000 31 | AAAG_AG_AGGG_AG_AAAG_10_200_0.05 126/1000 32 | AGGGG_AAAAGAAAGAGAGGG_AGGGG_10_200_0.05 0/989 33 | AC_AG_10_200_0.1 864/1000 34 | 
ACC_GTT_10_200_0.1 869/1000 35 | AAG_AG_10_200_0.1 22/998 36 | AAG_AGG_10_200_0.1 37/1000 37 | AAAG_AG_10_200_0.1 52/1000 38 | AAAG_AG_AAAG_10_200_0.1 1/1000 39 | AAAG_AG_AGGG_AG_AAAG_10_200_0.1 0/1000 40 | AGGGG_AAAAGAAAGAGAGGG_AGGGG_10_200_0.1 0/878 41 | AC_AG_10_200_0.15 581/1000 42 | ACC_GTT_10_200_0.15 725/1000 43 | AAG_AG_10_200_0.15 5/997 44 | AAG_AGG_10_200_0.15 5/1000 45 | AAAG_AG_10_200_0.15 4/1000 46 | AAAG_AG_AAAG_10_200_0.15 0/1000 47 | AAAG_AG_AGGG_AG_AAAG_10_200_0.15 0/1000 48 | AGGGG_AAAAGAAAGAGAGGG_AGGGG_10_200_0.15 0/931 49 | -------------------------------------------------------------------------------- /test_public/tmp/accuracy_uTR.txt: -------------------------------------------------------------------------------- 1 | AC_AG_10_200_0.0 992 1000 414 2 | ACC_GTT_10_200_0.0 998 1000 617 3 | AAG_AG_10_200_0.0 991 1000 524 4 | AAG_AGG_10_200_0.0 998 1000 617 5 | AAAG_AG_10_200_0.0 982 1000 624 6 | AAAG_AG_AAAG_10_200_0.0 922 1000 1040 7 | AAAG_AG_AGGG_AG_AAAG_10_200_0.0 957 1000 1674 8 | AGGGG_AAAAGAAAGAGAGGG_AGGGG_10_200_0.0 996 1000 2617 9 | AC_AG_10_200_0.01 996 1000 421 10 | ACC_GTT_10_200_0.01 1000 1000 636 11 | AAG_AG_10_200_0.01 992 1000 523 12 | AAG_AGG_10_200_0.01 999 1000 617 13 | AAAG_AG_10_200_0.01 990 1000 628 14 | AAAG_AG_AAAG_10_200_0.01 940 1000 1042 15 | AAAG_AG_AGGG_AG_AAAG_10_200_0.01 948 1000 1671 16 | AGGGG_AAAAGAAAGAGAGGG_AGGGG_10_200_0.01 999 1000 2564 17 | AC_AG_10_200_0.03 992 1000 413 18 | ACC_GTT_10_200_0.03 999 1000 627 19 | AAG_AG_10_200_0.03 983 1000 514 20 | AAG_AGG_10_200_0.03 999 1000 633 21 | AAAG_AG_10_200_0.03 987 1000 625 22 | AAAG_AG_AAAG_10_200_0.03 948 1000 1031 23 | AAAG_AG_AGGG_AG_AAAG_10_200_0.03 934 1000 1670 24 | AGGGG_AAAAGAAAGAGAGGG_AGGGG_10_200_0.03 958 1000 2608 25 | AC_AG_10_200_0.05 993 1000 416 26 | ACC_GTT_10_200_0.05 999 1000 650 27 | AAG_AG_10_200_0.05 971 1000 528 28 | AAG_AGG_10_200_0.05 998 1000 619 29 | AAAG_AG_10_200_0.05 973 1000 640 30 | AAAG_AG_AAAG_10_200_0.05 924 1000 1053 31 | 
AAAG_AG_AGGG_AG_AAAG_10_200_0.05 925 1000 1704 32 | AGGGG_AAAAGAAAGAGAGGG_AGGGG_10_200_0.05 306 1000 2599 33 | AC_AG_10_200_0.1 983 1000 415 34 | ACC_GTT_10_200_0.1 995 1000 612 35 | AAG_AG_10_200_0.1 885 1000 529 36 | AAG_AGG_10_200_0.1 984 1000 629 37 | AAAG_AG_10_200_0.1 943 1000 629 38 | AAAG_AG_AAAG_10_200_0.1 886 1000 1047 39 | AAAG_AG_AGGG_AG_AAAG_10_200_0.1 819 1000 1681 40 | AGGGG_AAAAGAAAGAGAGGG_AGGGG_10_200_0.1 1 1000 2629 41 | AC_AG_10_200_0.15 963 1000 419 42 | ACC_GTT_10_200_0.15 946 1000 614 43 | AAG_AG_10_200_0.15 660 1000 521 44 | AAG_AGG_10_200_0.15 908 1000 636 45 | AAAG_AG_10_200_0.15 793 1000 627 46 | AAAG_AG_AAAG_10_200_0.15 638 1000 1035 47 | AAAG_AG_AGGG_AG_AAAG_10_200_0.15 539 1000 1668 48 | AGGGG_AAAAGAAAGAGAGGG_AGGGG_10_200_0.15 0 1000 2625 49 | -------------------------------------------------------------------------------- /test_public/tmp/accuracy_uTR_allowance0.txt: -------------------------------------------------------------------------------- 1 | AC_AG_10_200_0.0 992 1000 414 2 | ACC_GTT_10_200_0.0 998 1000 617 3 | AAG_AG_10_200_0.0 991 1000 524 4 | AAG_AGG_10_200_0.0 998 1000 617 5 | AAAG_AG_10_200_0.0 982 1000 624 6 | AAAG_AG_AAAG_10_200_0.0 922 1000 1040 7 | AAAG_AG_AGGG_AG_AAAG_10_200_0.0 954 1000 1674 8 | AGGGG_AAAAGAAAGAGAGGG_AGGGG_10_200_0.0 996 1000 2617 9 | AC_AG_10_200_0.01 923 1000 421 10 | ACC_GTT_10_200_0.01 967 1000 636 11 | AAG_AG_10_200_0.01 849 1000 523 12 | AAG_AGG_10_200_0.01 926 1000 617 13 | AAAG_AG_10_200_0.01 878 1000 628 14 | AAAG_AG_AAAG_10_200_0.01 771 1000 1042 15 | AAAG_AG_AGGG_AG_AAAG_10_200_0.01 641 1000 1671 16 | AGGGG_AAAAGAAAGAGAGGG_AGGGG_10_200_0.01 986 1000 2564 17 | AC_AG_10_200_0.03 664 1000 413 18 | ACC_GTT_10_200_0.03 809 1000 627 19 | AAG_AG_10_200_0.03 556 1000 514 20 | AAG_AGG_10_200_0.03 705 1000 633 21 | AAAG_AG_10_200_0.03 633 1000 625 22 | AAAG_AG_AAAG_10_200_0.03 430 1000 1031 23 | AAAG_AG_AGGG_AG_AAAG_10_200_0.03 234 1000 1670 24 | AGGGG_AAAAGAAAGAGAGGG_AGGGG_10_200_0.03 854 1000 2608 
25 | AC_AG_10_200_0.05 502 1000 416 26 | ACC_GTT_10_200_0.05 659 1000 650 27 | AAG_AG_10_200_0.05 361 1000 528 28 | AAG_AGG_10_200_0.05 524 1000 619 29 | AAAG_AG_10_200_0.05 444 1000 640 30 | AAAG_AG_AAAG_10_200_0.05 268 1000 1053 31 | AAAG_AG_AGGG_AG_AAAG_10_200_0.05 104 1000 1704 32 | AGGGG_AAAAGAAAGAGAGGG_AGGGG_10_200_0.05 223 1000 2599 33 | AC_AG_10_200_0.1 286 1000 415 34 | ACC_GTT_10_200_0.1 515 1000 612 35 | AAG_AG_10_200_0.1 191 1000 529 36 | AAG_AGG_10_200_0.1 330 1000 629 37 | AAAG_AG_10_200_0.1 266 1000 629 38 | AAAG_AG_AAAG_10_200_0.1 121 1000 1047 39 | AAAG_AG_AGGG_AG_AAAG_10_200_0.1 27 1000 1681 40 | AGGGG_AAAAGAAAGAGAGGG_AGGGG_10_200_0.1 0 1000 2629 41 | AC_AG_10_200_0.15 219 1000 419 42 | ACC_GTT_10_200_0.15 327 1000 614 43 | AAG_AG_10_200_0.15 90 1000 521 44 | AAG_AGG_10_200_0.15 223 1000 636 45 | AAAG_AG_10_200_0.15 176 1000 627 46 | AAAG_AG_AAAG_10_200_0.15 67 1000 1035 47 | AAAG_AG_AGGG_AG_AAAG_10_200_0.15 4 1000 1668 48 | AGGGG_AAAAGAAAGAGAGGG_AGGGG_10_200_0.15 0 1000 2625 49 | -------------------------------------------------------------------------------- /test_public/tmp/accuracy_uTR_allowance0.01.txt: -------------------------------------------------------------------------------- 1 | AC_AG_10_200_0.0 992 1000 414 2 | ACC_GTT_10_200_0.0 998 1000 617 3 | AAG_AG_10_200_0.0 991 1000 524 4 | AAG_AGG_10_200_0.0 998 1000 617 5 | AAAG_AG_10_200_0.0 982 1000 624 6 | AAAG_AG_AAAG_10_200_0.0 922 1000 1040 7 | AAAG_AG_AGGG_AG_AAAG_10_200_0.0 957 1000 1674 8 | AGGGG_AAAAGAAAGAGAGGG_AGGGG_10_200_0.0 996 1000 2617 9 | AC_AG_10_200_0.01 994 1000 421 10 | ACC_GTT_10_200_0.01 1000 1000 636 11 | AAG_AG_10_200_0.01 980 1000 523 12 | AAG_AGG_10_200_0.01 998 1000 617 13 | AAAG_AG_10_200_0.01 978 1000 628 14 | AAAG_AG_AAAG_10_200_0.01 928 1000 1042 15 | AAAG_AG_AGGG_AG_AAAG_10_200_0.01 910 1000 1671 16 | AGGGG_AAAAGAAAGAGAGGG_AGGGG_10_200_0.01 999 1000 2564 17 | AC_AG_10_200_0.03 964 1000 413 18 | ACC_GTT_10_200_0.03 995 1000 627 19 | AAG_AG_10_200_0.03 876 1000 
514 20 | AAG_AGG_10_200_0.03 981 1000 633 21 | AAAG_AG_10_200_0.03 941 1000 625 22 | AAAG_AG_AAAG_10_200_0.03 816 1000 1031 23 | AAAG_AG_AGGG_AG_AAAG_10_200_0.03 680 1000 1670 24 | AGGGG_AAAAGAAAGAGAGGG_AGGGG_10_200_0.03 956 1000 2608 25 | AC_AG_10_200_0.05 910 1000 416 26 | ACC_GTT_10_200_0.05 978 1000 650 27 | AAG_AG_10_200_0.05 753 1000 528 28 | AAG_AGG_10_200_0.05 893 1000 619 29 | AAAG_AG_10_200_0.05 840 1000 640 30 | AAAG_AG_AAAG_10_200_0.05 714 1000 1053 31 | AAAG_AG_AGGG_AG_AAAG_10_200_0.05 518 1000 1704 32 | AGGGG_AAAAGAAAGAGAGGG_AGGGG_10_200_0.05 302 1000 2599 33 | AC_AG_10_200_0.1 707 1000 415 34 | ACC_GTT_10_200_0.1 894 1000 612 35 | AAG_AG_10_200_0.1 465 1000 529 36 | AAG_AGG_10_200_0.1 719 1000 629 37 | AAAG_AG_10_200_0.1 619 1000 629 38 | AAAG_AG_AAAG_10_200_0.1 478 1000 1047 39 | AAAG_AG_AGGG_AG_AAAG_10_200_0.1 222 1000 1681 40 | AGGGG_AAAAGAAAGAGAGGG_AGGGG_10_200_0.1 1 1000 2629 41 | AC_AG_10_200_0.15 563 1000 419 42 | ACC_GTT_10_200_0.15 747 1000 614 43 | AAG_AG_10_200_0.15 260 1000 521 44 | AAG_AGG_10_200_0.15 539 1000 636 45 | AAAG_AG_10_200_0.15 433 1000 627 46 | AAAG_AG_AAAG_10_200_0.15 234 1000 1035 47 | AAAG_AG_AGGG_AG_AAAG_10_200_0.15 77 1000 1668 48 | AGGGG_AAAAGAAAGAGAGGG_AGGGG_10_200_0.15 0 1000 2625 49 | -------------------------------------------------------------------------------- /test_public/tmp/accuracy_uTR_allowance0.02.txt: -------------------------------------------------------------------------------- 1 | AC_AG_10_200_0.0 992 1000 414 2 | ACC_GTT_10_200_0.0 998 1000 617 3 | AAG_AG_10_200_0.0 991 1000 524 4 | AAG_AGG_10_200_0.0 998 1000 617 5 | AAAG_AG_10_200_0.0 982 1000 624 6 | AAAG_AG_AAAG_10_200_0.0 922 1000 1040 7 | AAAG_AG_AGGG_AG_AAAG_10_200_0.0 957 1000 1674 8 | AGGGG_AAAAGAAAGAGAGGG_AGGGG_10_200_0.0 996 1000 2617 9 | AC_AG_10_200_0.01 996 1000 421 10 | ACC_GTT_10_200_0.01 1000 1000 636 11 | AAG_AG_10_200_0.01 990 1000 523 12 | AAG_AGG_10_200_0.01 998 1000 617 13 | AAAG_AG_10_200_0.01 987 1000 628 14 | 
AAAG_AG_AAAG_10_200_0.01 934 1000 1042 15 | AAAG_AG_AGGG_AG_AAAG_10_200_0.01 932 1000 1671 16 | AGGGG_AAAAGAAAGAGAGGG_AGGGG_10_200_0.01 999 1000 2564 17 | AC_AG_10_200_0.03 989 1000 413 18 | ACC_GTT_10_200_0.03 999 1000 627 19 | AAG_AG_10_200_0.03 936 1000 514 20 | AAG_AGG_10_200_0.03 993 1000 633 21 | AAAG_AG_10_200_0.03 965 1000 625 22 | AAAG_AG_AAAG_10_200_0.03 884 1000 1031 23 | AAAG_AG_AGGG_AG_AAAG_10_200_0.03 799 1000 1670 24 | AGGGG_AAAAGAAAGAGAGGG_AGGGG_10_200_0.03 957 1000 2608 25 | AC_AG_10_200_0.05 967 1000 416 26 | ACC_GTT_10_200_0.05 998 1000 650 27 | AAG_AG_10_200_0.05 848 1000 528 28 | AAG_AGG_10_200_0.05 962 1000 619 29 | AAAG_AG_10_200_0.05 921 1000 640 30 | AAAG_AG_AAAG_10_200_0.05 816 1000 1053 31 | AAAG_AG_AGGG_AG_AAAG_10_200_0.05 701 1000 1704 32 | AGGGG_AAAAGAAAGAGAGGG_AGGGG_10_200_0.05 304 1000 2599 33 | AC_AG_10_200_0.1 852 1000 415 34 | ACC_GTT_10_200_0.1 977 1000 612 35 | AAG_AG_10_200_0.1 611 1000 529 36 | AAG_AGG_10_200_0.1 862 1000 629 37 | AAAG_AG_10_200_0.1 759 1000 629 38 | AAAG_AG_AAAG_10_200_0.1 616 1000 1047 39 | AAAG_AG_AGGG_AG_AAAG_10_200_0.1 390 1000 1681 40 | AGGGG_AAAAGAAAGAGAGGG_AGGGG_10_200_0.1 1 1000 2629 41 | AC_AG_10_200_0.15 736 1000 419 42 | ACC_GTT_10_200_0.15 875 1000 614 43 | AAG_AG_10_200_0.15 374 1000 521 44 | AAG_AGG_10_200_0.15 702 1000 636 45 | AAAG_AG_10_200_0.15 564 1000 627 46 | AAAG_AG_AAAG_10_200_0.15 350 1000 1035 47 | AAAG_AG_AGGG_AG_AAAG_10_200_0.15 151 1000 1668 48 | AGGGG_AAAAGAAAGAGAGGG_AGGGG_10_200_0.15 0 1000 2625 49 | -------------------------------------------------------------------------------- /test_public/tmp/accuracy_uTR_allowance0.03.txt: -------------------------------------------------------------------------------- 1 | AC_AG_10_200_0.0 992 1000 414 2 | ACC_GTT_10_200_0.0 998 1000 617 3 | AAG_AG_10_200_0.0 991 1000 524 4 | AAG_AGG_10_200_0.0 998 1000 617 5 | AAAG_AG_10_200_0.0 982 1000 624 6 | AAAG_AG_AAAG_10_200_0.0 922 1000 1040 7 | AAAG_AG_AGGG_AG_AAAG_10_200_0.0 957 1000 1674 8 | 
AGGGG_AAAAGAAAGAGAGGG_AGGGG_10_200_0.0 996 1000 2617 9 | AC_AG_10_200_0.01 996 1000 421 10 | ACC_GTT_10_200_0.01 1000 1000 636 11 | AAG_AG_10_200_0.01 991 1000 523 12 | AAG_AGG_10_200_0.01 999 1000 617 13 | AAAG_AG_10_200_0.01 988 1000 628 14 | AAAG_AG_AAAG_10_200_0.01 937 1000 1042 15 | AAAG_AG_AGGG_AG_AAAG_10_200_0.01 939 1000 1671 16 | AGGGG_AAAAGAAAGAGAGGG_AGGGG_10_200_0.01 999 1000 2564 17 | AC_AG_10_200_0.03 989 1000 413 18 | ACC_GTT_10_200_0.03 999 1000 627 19 | AAG_AG_10_200_0.03 953 1000 514 20 | AAG_AGG_10_200_0.03 996 1000 633 21 | AAAG_AG_10_200_0.03 974 1000 625 22 | AAAG_AG_AAAG_10_200_0.03 920 1000 1031 23 | AAAG_AG_AGGG_AG_AAAG_10_200_0.03 862 1000 1670 24 | AGGGG_AAAAGAAAGAGAGGG_AGGGG_10_200_0.03 958 1000 2608 25 | AC_AG_10_200_0.05 984 1000 416 26 | ACC_GTT_10_200_0.05 999 1000 650 27 | AAG_AG_10_200_0.05 901 1000 528 28 | AAG_AGG_10_200_0.05 980 1000 619 29 | AAAG_AG_10_200_0.05 949 1000 640 30 | AAAG_AG_AAAG_10_200_0.05 864 1000 1053 31 | AAAG_AG_AGGG_AG_AAAG_10_200_0.05 789 1000 1704 32 | AGGGG_AAAAGAAAGAGAGGG_AGGGG_10_200_0.05 304 1000 2599 33 | AC_AG_10_200_0.1 914 1000 415 34 | ACC_GTT_10_200_0.1 992 1000 612 35 | AAG_AG_10_200_0.1 712 1000 529 36 | AAG_AGG_10_200_0.1 922 1000 629 37 | AAAG_AG_10_200_0.1 833 1000 629 38 | AAAG_AG_AAAG_10_200_0.1 718 1000 1047 39 | AAAG_AG_AGGG_AG_AAAG_10_200_0.1 531 1000 1681 40 | AGGGG_AAAAGAAAGAGAGGG_AGGGG_10_200_0.1 1 1000 2629 41 | AC_AG_10_200_0.15 836 1000 419 42 | ACC_GTT_10_200_0.15 914 1000 614 43 | AAG_AG_10_200_0.15 450 1000 521 44 | AAG_AGG_10_200_0.15 782 1000 636 45 | AAAG_AG_10_200_0.15 639 1000 627 46 | AAAG_AG_AAAG_10_200_0.15 414 1000 1035 47 | AAAG_AG_AGGG_AG_AAAG_10_200_0.15 229 1000 1668 48 | AGGGG_AAAAGAAAGAGAGGG_AGGGG_10_200_0.15 0 1000 2625 49 | -------------------------------------------------------------------------------- /test_public/parse_RepeatMasker/main.c: -------------------------------------------------------------------------------- 1 | // Retrieve TRs from a RepeatMasker 
output file 2 | 3 | #include 4 | #include 5 | #include 6 | #include 7 | #include 8 | #include 9 | #include 10 | 11 | #define BLK 4096 12 | 13 | int main(int argc, char *argv[]) 14 | { 15 | 16 | int opt; 17 | int print_homopolymer = 0; 18 | char inputFile[BLK], outputFile[BLK]; 19 | while ((opt = getopt(argc, argv, "i:o:h")) != -1) { 20 | switch(opt){ 21 | case 'i': 22 | strcpy(inputFile,optarg); 23 | break; 24 | case 'o': 25 | strcpy(outputFile,optarg); 26 | break; 27 | case 'h': // Print homopolymer 28 | print_homopolymer = 1; 29 | break; 30 | default: 31 | exit(EXIT_FAILURE); 32 | } 33 | } 34 | 35 | // Count the frequency of each SNV 36 | FILE *fp_in = fopen(inputFile, "r"); 37 | FILE *fp_out = fopen(outputFile, "w"); 38 | 39 | char *s = (char *)malloc(sizeof(char)*BLK); 40 | char a1[BLK], a2[BLK], a3[BLK], a4[BLK], ID[BLK], strand[BLK], a_repeat[BLK], repeat_unit[BLK], repeat_class[BLK], others[BLK]; 41 | int beginTR, endTR, a_left; 42 | 43 | int i=0; 44 | while (fgets(s, BLK, fp_in) != NULL) { 45 | if(i++ > 2){ // Skip the first three lines 46 | //printf("%s", s); 47 | sscanf(s, "%s %s %s %s %s %d %d (%d) %s %s %s %[^\n]", 48 | a1,a2,a3,a4,ID,&beginTR,&endTR,&a_left,strand,a_repeat,repeat_class,others); 49 | if(strcmp(repeat_class,"Simple_repeat")==0){ 50 | sscanf(a_repeat,"(%[^)])n",repeat_unit); 51 | strcpy(a_repeat,repeat_unit); 52 | } 53 | if(1 < strlen(a_repeat) || print_homopolymer == 1){ 54 | fprintf(fp_out, "%s\t%d\t%d\t%d\t%d\t%s\t%s\n", ID,(endTR+a_left),beginTR,endTR,(endTR-beginTR+1),a_repeat,repeat_class); 55 | } 56 | } 57 | } 58 | fclose(fp_in); 59 | fclose(fp_out); 60 | 61 | free(s); 62 | 63 | return EXIT_SUCCESS; 64 | } 65 | -------------------------------------------------------------------------------- /nsop_test/nsop_compression.csv: -------------------------------------------------------------------------------- 1 | AC_AG_10_200_0.0 0.011494 2 174 2 | ACC_GTT_10_200_0.0 0.017021 4 235 3 | AAG_AG_10_200_0.0 0.018182 2 110 4 | 
AAG_AGG_10_200_0.0 0.022059 3 136 5 | AAAG_AG_10_200_0.0 0.026087 3 115 6 | AAAG_AG_AAAG_10_200_0.0 0.019737 3 152 7 | AAAG_AG_AGGG_AG_AAAG_10_200_0.0 0.018182 5 275 8 | AGGGG_AAAAGAAAGAGAGGG_AGGGG_10_200_0.0 0.202290 53 262 9 | AC_AG_10_200_0.01 0.018305 208 11363 10 | ACC_GTT_10_200_0.01 0.019250 351 18234 11 | AAG_AG_10_200_0.01 0.027468 278 10121 12 | AAG_AGG_10_200_0.01 0.028037 346 12341 13 | AAAG_AG_10_200_0.01 0.035021 366 10451 14 | AAAG_AG_AAAG_10_200_0.01 0.027636 373 13497 15 | AAAG_AG_AGGG_AG_AAAG_10_200_0.01 0.030392 652 21453 16 | AGGGG_AAAAGAAAGAGAGGG_AGGGG_10_200_0.01 0.058439 1993 34104 17 | AC_AG_10_200_0.03 0.011574 362 31278 18 | ACC_GTT_10_200_0.03 0.012996 600 46168 19 | AAG_AG_10_200_0.03 0.015381 433 28152 20 | AAG_AGG_10_200_0.03 0.014526 502 34558 21 | AAAG_AG_10_200_0.03 0.020507 600 29259 22 | AAAG_AG_AAAG_10_200_0.03 0.020685 770 37225 23 | AAAG_AG_AGGG_AG_AAAG_10_200_0.03 0.021847 1331 60923 24 | AGGGG_AAAAGAAAGAGAGGG_AGGGG_10_200_0.03 0.030139 2928 97149 25 | AC_AG_10_200_0.05 0.012278 704 57340 26 | ACC_GTT_10_200_0.05 0.012714 1050 82586 27 | AAG_AG_10_200_0.05 0.015096 781 51734 28 | AAG_AGG_10_200_0.05 0.017897 1117 62414 29 | AAAG_AG_10_200_0.05 0.019228 1050 54607 30 | AAAG_AG_AAAG_10_200_0.05 0.023374 1557 66612 31 | AAAG_AG_AGGG_AG_AAAG_10_200_0.05 0.022826 2537 111143 32 | AGGGG_AAAAGAAAGAGAGGG_AGGGG_10_200_0.05 0.029978 5146 171657 33 | AC_AG_10_200_0.1 0.011350 1643 144763 34 | ACC_GTT_10_200_0.1 0.011454 2348 204987 35 | AAG_AG_10_200_0.1 0.013429 1821 135607 36 | AAG_AGG_10_200_0.1 0.013660 2242 164131 37 | AAAG_AG_10_200_0.1 0.015586 2122 136146 38 | AAAG_AG_AAAG_10_200_0.1 0.017713 3037 171459 39 | AAAG_AG_AGGG_AG_AAAG_10_200_0.1 0.017846 5037 282244 40 | AGGGG_AAAAGAAAGAGAGGG_AGGGG_10_200_0.1 0.023757 10297 433435 41 | AC_AG_10_200_0.15 0.008204 2179 265609 42 | ACC_GTT_10_200_0.15 0.009086 3317 365081 43 | AAG_AG_10_200_0.15 0.010143 2494 245883 44 | AAG_AGG_10_200_0.15 0.011134 3344 300334 45 | AAAG_AG_10_200_0.15 
0.012219 3004 245846 46 | AAAG_AG_AAAG_10_200_0.15 0.014500 4542 313242 47 | AAAG_AG_AGGG_AG_AAAG_10_200_0.15 0.014307 7303 510446 48 | AGGGG_AAAAGAAAGAGAGGG_AGGGG_10_200_0.15 0.018779 14522 773303 49 | -------------------------------------------------------------------------------- /test_public/tmp/time_uTR.txt: -------------------------------------------------------------------------------- 1 | AC_AG_10_200_0.0.fasta 21.147736 sec 2 | ACC_GTT_10_200_0.0.fasta 21.771969 sec 3 | AAG_AG_10_200_0.0.fasta 21.427761 sec 4 | AAG_AGG_10_200_0.0.fasta 20.788939 sec 5 | AAAG_AG_10_200_0.0.fasta 19.791121 sec 6 | AAAG_AG_AAAG_10_200_0.0.fasta 21.229429 sec 7 | AAAG_AG_AGGG_AG_AAAG_10_200_0.0.fasta 23.324635 sec 8 | AGGGG_AAAAGAAAGAGAGGG_AGGGG_10_200_0.0.fasta 27.386393 sec 9 | AC_AG_10_200_0.01.fasta 21.699499 sec 10 | ACC_GTT_10_200_0.01.fasta 22.978775 sec 11 | AAG_AG_10_200_0.01.fasta 22.058565 sec 12 | AAG_AGG_10_200_0.01.fasta 22.848272 sec 13 | AAAG_AG_10_200_0.01.fasta 22.830925 sec 14 | AAAG_AG_AAAG_10_200_0.01.fasta 24.864824 sec 15 | AAAG_AG_AGGG_AG_AAAG_10_200_0.01.fasta 30.533117 sec 16 | AGGGG_AAAAGAAAGAGAGGG_AGGGG_10_200_0.01.fasta 51.646774 sec 17 | AC_AG_10_200_0.03.fasta 22.234692 sec 18 | ACC_GTT_10_200_0.03.fasta 23.793291 sec 19 | AAG_AG_10_200_0.03.fasta 27.605684 sec 20 | AAG_AGG_10_200_0.03.fasta 23.725142 sec 21 | AAAG_AG_10_200_0.03.fasta 23.259960 sec 22 | AAAG_AG_AAAG_10_200_0.03.fasta 26.080307 sec 23 | AAAG_AG_AGGG_AG_AAAG_10_200_0.03.fasta 34.167931 sec 24 | AGGGG_AAAAGAAAGAGAGGG_AGGGG_10_200_0.03.fasta 61.479633 sec 25 | AC_AG_10_200_0.05.fasta 22.052540 sec 26 | ACC_GTT_10_200_0.05.fasta 24.236048 sec 27 | AAG_AG_10_200_0.05.fasta 22.805019 sec 28 | AAG_AGG_10_200_0.05.fasta 24.181370 sec 29 | AAAG_AG_10_200_0.05.fasta 24.580435 sec 30 | AAAG_AG_AAAG_10_200_0.05.fasta 29.060202 sec 31 | AAAG_AG_AGGG_AG_AAAG_10_200_0.05.fasta 40.593407 sec 32 | AGGGG_AAAAGAAAGAGAGGG_AGGGG_10_200_0.05.fasta 79.609673 sec 33 | AC_AG_10_200_0.1.fasta 23.727283 
sec 34 | ACC_GTT_10_200_0.1.fasta 27.058134 sec 35 | AAG_AG_10_200_0.1.fasta 25.016262 sec 36 | AAG_AGG_10_200_0.1.fasta 30.091669 sec 37 | AAAG_AG_10_200_0.1.fasta 26.549988 sec 38 | AAAG_AG_AAAG_10_200_0.1.fasta 34.315407 sec 39 | AAAG_AG_AGGG_AG_AAAG_10_200_0.1.fasta 54.932568 sec 40 | AGGGG_AAAAGAAAGAGAGGG_AGGGG_10_200_0.1.fasta 125.210846 sec 41 | AC_AG_10_200_0.15.fasta 24.514071 sec 42 | ACC_GTT_10_200_0.15.fasta 25.871542 sec 43 | AAG_AG_10_200_0.15.fasta 23.046970 sec 44 | AAG_AGG_10_200_0.15.fasta 28.966978 sec 45 | AAAG_AG_10_200_0.15.fasta 28.361736 sec 46 | AAAG_AG_AAAG_10_200_0.15.fasta 39.555313 sec 47 | AAAG_AG_AGGG_AG_AAAG_10_200_0.15.fasta 67.106201 sec 48 | AGGGG_AAAAGAAAGAGAGGG_AGGGG_10_200_0.15.fasta 164.649063 sec 49 | -------------------------------------------------------------------------------- /smooth.c: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | #include 4 | #include 5 | #include 6 | #include 7 | #include "uTR.h" 8 | 9 | int majority(int *blocks, int start, int end, int *unit_freq, int numKeyUnits){ 10 | for(int i=0; i> $acc_table_uTR #"Min Max num unit occ" 42 | $check_uTR -i $TR_uTR_decomp_file -a $allowance >> $acc_table_uTR 43 | 44 | # RepeatMasker 45 | parse_RM=$tmp"/"$run_name".fasta.result.txt" 46 | match_RM=$tmp"/"$run_name".fasta.match.txt" 47 | $checkRM -i $parse_RM -a $allowance -p > $match_RM 48 | echo -n -e $run_name" " >> $acc_table_RM 49 | tail -n1 $match_RM >> $acc_table_RM 50 | 51 | # TRF 52 | result_TRF=$tmp"/"$run_name"_TRF.txt" 53 | accuracy_TRF=$tmp"/"$run_name"_TRF_acc.txt" 54 | $checkTRF -i $result_TRF -a $allowance -o $accuracy_TRF 55 | echo -n -e $run_name" " >> $acc_table_TRF 56 | tail -n1 $accuracy_TRF >> $acc_table_TRF 57 | done 58 | done 59 | done 60 | done 61 | done 62 | 63 | exit 0 64 | -------------------------------------------------------------------------------- /Kawahara_nsop_Z.cpp: 
-------------------------------------------------------------------------------- 1 | // Riki Kawahara's algorithm that lists all non-self-overlapping substrings for a given string S in O(n^2)-time 2 | 3 | //#define DUMP_Kawahara_nsop_Z 4 | 5 | #ifndef DUMP_Kawahara_nsop_Z 6 | #include "uTR.h" 7 | #endif 8 | 9 | #include 10 | #include 11 | #include 12 | using namespace std; 13 | 14 | void dump_int_array(int *a, int len, string name){ 15 | cout << name << "\t"; 16 | for(int i=0; i 2 | #include 3 | #include 4 | #include 5 | #include 6 | #include 7 | #include 8 | #include "uTR.h" 9 | 10 | void put_qualified_read(Read *currentRead, int i){ 11 | int numReads, diameter, radius; 12 | // The annotation starts with a space " " !!! 13 | sscanf(currentRead->ID, " GroupSize = %d, Diameter = %d, RadiusFromCentroid = %d", &numReads, &diameter, &radius); 14 | Qreads[i].numReads = numReads; 15 | Qreads[i].len = currentRead->len; 16 | Qreads[i].numKeyUnits = currentRead->numKeyUnits; 17 | Qreads[i].mosaic_mode = currentRead->mosaic_mode; 18 | } 19 | 20 | int main(int argc, char *argv[]) 21 | { 22 | char inputFile[500]; // For the input file name 23 | char repUnit[1000]; // For storing a representative unit 24 | char outputFile[500]; // For the output file name 25 | int inputFile_given = 0; 26 | int repUnit_given = 0; 27 | int print_time = 0; 28 | int print_EDDC = 0; 29 | int opt; 30 | while ((opt = getopt(argc, argv, "f:u:o:t")) != -1) { 31 | switch(opt){ 32 | case 'f': 33 | strcpy(inputFile,optarg); inputFile_given = 1; break; 34 | case 'u': 35 | strcpy(repUnit, optarg); repUnit_given= 1; break; 36 | case 'o': 37 | strcpy(outputFile, optarg); print_EDDC = 1; break; 38 | case 't': 39 | print_time = 1; break; 40 | default: 41 | fprintf(stderr, "Usage: uTR -f (-u ) -o \n"); 42 | exit(EXIT_FAILURE); 43 | } 44 | } 45 | if(inputFile_given == 0){ 46 | fprintf(stderr, "Input file is not given.\n"); 47 | exit(EXIT_FAILURE); 48 | } 49 | 50 | struct timeval s, e; gettimeofday(&s, NULL); 51 | 
float time_get_non_self_overlapping_prefixes = time_coverage_by_units = time_set_cover_greedy = 0; 52 | 53 | // get non-self-overlapping units 54 | FILE *fp = init_handle_one_file(inputFile); 55 | Read *currentRead = malloc(sizeof(Read)); 56 | malloc_Units(); malloc_GlobalUnits(); 57 | for(int i=0;;i++){ 58 | return_one_read(fp, currentRead); 59 | if(currentRead->len == 0) break; 60 | // get non-self-overlapping units 61 | get_non_self_overlapping_prefixes(currentRead->string); 62 | } 63 | int nsop_unit_cnt = unit_cnt; 64 | fclose(fp); 65 | free_Units(); 66 | free_GlobalUnits(); 67 | 68 | // get all substrings of length <= MAX_UNIT_LENGTH 69 | fp = init_handle_one_file(inputFile); 70 | currentRead = malloc(sizeof(Read)); 71 | malloc_Units(); malloc_GlobalUnits(); 72 | for(int i=0;;i++){ 73 | return_one_read(fp, currentRead); 74 | if(currentRead->len == 0) break; 75 | // get all substrings of length <= MAX_UNIT_LENGTH 76 | put_all_substrings(currentRead->string); 77 | } 78 | int all_unit_cnt = unit_cnt; 79 | fclose(fp); 80 | free_Units(); 81 | free_GlobalUnits(); 82 | 83 | printf("\t%f\t%d\t%d\n", (float)nsop_unit_cnt/all_unit_cnt, nsop_unit_cnt, all_unit_cnt); 84 | 85 | return EXIT_SUCCESS; 86 | } 87 | -------------------------------------------------------------------------------- /test_public/README.md: -------------------------------------------------------------------------------- 1 | ## Introduction 2 | 3 | For comparing uTR, RepeatMasker, and TRF using synthetic datasets in terms of accuracy of predicting mosaic tandem repeat (TR) patterns, first compile all codes by executing: 4 | 5 | bash make.sh 6 | 7 | To compare uTR, RepeatMasker, and TRF, in terms of accuracy and wall clock time, execute: 8 | 9 | bash test.sh 10 | 11 | The above program creates a test mosaic TR data set consisting of widely different units that are typically present in the human genome: 12 | 13 | (AC)i(AG)j (ACC)i(GTT)j (AAG)i(AG)j (AAG)i(AGG)j (AAAG)i(AG)j 14 | (AAAG)i(AG)j(AAAG)k 
(AAAG)i(AG)j(AGGG)k(AG)l(AAAG)m 15 | (AGGGG)i(AAAAGAAAGAGAGGG)j(AGGGG)k 16 | 17 | The variable (i, j, k, l, or m) that follows each parenthesized unit represents the number of consecutive occurrences of that unit. Mosaic TRs are harder to decompose when the units are more similar to each other and when more units are present. 18 | To understand the hardness of the decomposition, we generate a variety of datasets of different average lengths for each mosaic TR pattern; i.e., the variables in each pattern are set to random values ranging from 10 to 200. 19 | The program in the following directory generates these datasets. 20 | 21 | gendata/ 22 | 23 | To see how sequencing errors affect the prediction of the original mosaic TR patterns, letters in the strings of each dataset are modified at random to simulate sequencing errors (substitutions, insertions, and deletions) at rates of 0%, 1%, 3%, 5%, 10%, and 15%. 24 | A prediction is counted as correct only when every series of units in the pattern is predicted nearly correctly. For example, when a mosaic TR has three units (U)i(V)j(W)k, all three series must be accurate, where a series of units (U)i is accurate if the predicted value of i differs from the true value by at most X% of the true value (e.g., X = 0%, 1%, 2%, or 3%); we call X the allowance. 25 | Accuracy increases when the allowance X is set to a larger value, and this relaxation is reasonable and necessary when dealing with two homologous units (e.g., AAG and AG) because the boundary between two similar units becomes ambiguous in the presence of sequencing errors. 26 | 27 | The program compares the prediction accuracy of uTR with that of TRF (version 4.09) and RepeatMasker (version open-4.0.7). 28 | The program uses RepeatMasker with default parameter settings (-e hmmer -noint -pa 4 -div 0 -xsmall) and TRF with default parameter settings except for lowering the minimum alignment score from 50 to 10 (i.e., 2 7 7 80 10 10 1000 -h -ngs) in order to detect small TRs with 10 or more units in our benchmark datasets.
29 | TRF sometimes returns a single most likely mosaic TR, but it often outputs a number of tandem repeats, some of which overlap each other. 30 | To find a mosaic TR, a series of non-overlapping TRs has to be selected, a task that RepeatMasker performs. 31 | Therefore, RepeatMasker seems to be better suited to detecting mosaic TRs than TRF. 32 | 33 | For each of eight mosaic TR patterns, the program considers six sequencing error rates, and creates a total of 48 ($=8 \times 6$) datasets, each with 1000 strings. 34 | Programs in the following subdirectories evaluate the accuracy of uTR, RepeatMasker, and TRF in terms of the given allowance. 35 | 36 | check_uTR/ 37 | parse_RepeatMasker/ 38 | check_RepeatMasker/ 39 | checkTRF/ 40 | 41 | When tool T (uTR, RM, or TRF) is used with allowance X (0%, 1%, 2%, or 3%, written as a fraction such as 0.02 in the file name), the accuracy table named 42 | 43 | tmp/accuracy_T_allowanceX.txt 44 | 45 | is generated in the directory tmp. 46 | For example, the top five lines of the file 47 | 48 | tmp/accuracy_uTR_allowance0.02.txt 49 | 50 | are: 51 | 52 | AC_AG_10_200_0.0 992 1000 414 53 | ACC_GTT_10_200_0.0 998 1000 617 54 | AAG_AG_10_200_0.0 991 1000 524 55 | AAG_AGG_10_200_0.0 998 1000 617 56 | AAAG_AG_10_200_0.0 982 1000 624 57 | 58 | The first row means that of the 1000 mosaic TRs with pattern (AC)i(AG)j, where the values of i and j are selected from 10 to 200 at random, 992 are predicted correctly by uTR when the allowance is set to 0.02 (2%), and the average length is 414. 59 | In the directory tmp, the Excel table named 60 | 61 | tmp/accuracy_time.xlsx 62 | 63 | summarizes the accuracy and wall clock time. 64 | In most cases, uTR outperformed RepeatMasker and TRF in terms of prediction accuracy, and this is especially true when mosaic TRs have three or more series of units. 65 | Prediction accuracy of uTR, RepeatMasker, and TRF tends to decrease as the sequencing error rate increases because sequencing errors obscure the original unit patterns and make prediction difficult.
66 | -------------------------------------------------------------------------------- /nsop_test/gendata/main.c: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | #include 4 | #include 5 | #include 6 | #include 7 | #include 8 | #include "MT.h" // Use the Mersenne Twister. 9 | 10 | #define NUM_MOSAIC_TRS 1000 11 | #define ERROR_RATE 0 // 0.05 12 | 13 | #define MAX_UNIT_OCC 20 14 | #define MIN_UNIT_OCC 10 15 | 16 | #define ST_LEN 100000 17 | 18 | //#define DEBUG_errors 19 | 20 | void print_units(char *st, char *unit, int n){ 21 | for(int i=0; i "); 121 | for (int i=0; i%d", argv[i + start_units], randNums[i]); 124 | printf("\n"); 125 | //printf(" discrepancy=%3.2f\n", discrepancy); 126 | printf("%s\n", st); 127 | j++; 128 | } 129 | 130 | free(randNums); 131 | free(st); 132 | return EXIT_SUCCESS; 133 | } 134 | -------------------------------------------------------------------------------- /test_public/gendata/main.c: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | #include 4 | #include 5 | #include 6 | #include 7 | #include 8 | #include "MT.h" // Use the Mersenne Twister. 
9 | 10 | #define NUM_MOSAIC_TRS 1000 11 | #define ERROR_RATE 0 // 0.05 12 | 13 | #define MAX_UNIT_OCC 20 14 | #define MIN_UNIT_OCC 10 15 | 16 | #define ST_LEN 100000 17 | 18 | //#define DEBUG_errors 19 | 20 | void print_units(char *st, char *unit, int n){ 21 | for(int i=0; i "); 121 | for (int i=0; i%d", argv[i + start_units], randNums[i]); 124 | printf("\n"); 125 | //printf(" discrepancy=%3.2f\n", discrepancy); 126 | printf("%s\n", st); 127 | j++; 128 | } 129 | 130 | free(randNums); 131 | free(st); 132 | return EXIT_SUCCESS; 133 | } 134 | -------------------------------------------------------------------------------- /test_public/test_gendata_decompose.sh: --------------------------------------------------------------------------------
#!/bin/bash
# For every mosaic TR pattern and sequencing error rate: generate a synthetic
# dataset, decompose it with uTR, RepeatMasker, and TRF, and append each tool's
# accuracy and wall clock time to per-tool tables.  All generated files and the
# tables are moved into $tmp.

# executable modules
uTR=../uTR
uTR_param="-stda"

gen_units=gendata/gen
check_uTR=check_uTR/acc
checkTRF=checkTRF/checkTRF
tmp=tmp
# -p: do not fail when the directory is left over from an earlier run
mkdir -p "$tmp"
num_TRs=1000

listError=(0.0 0.01 0.03 0.05 0.1 0.15)

listPattern=("AC_AG" "ACC_GTT" "AAG_AG" "AAG_AGG" "AAAG_AG" "AAAG_AG_AAAG" "AAAG_AG_AGGG_AG_AAAG" "AGGGG_AAAAGAAAGAGAGGG_AGGGG")

wallclock_uTR="time_uTR.txt"
wallclock_RM="time_RM.txt"
wallclock_TRF="time_TRF.txt"

acc_table_uTR="accuracy_uTR.txt"
acc_table_RM="accuracy_RM.txt"
acc_table_TRF="accuracy_TRF.txt"

# Start from empty tables; -f because they do not exist on a first run
rm -f "$wallclock_uTR" "$wallclock_RM" "$wallclock_TRF"
rm -f "$acc_table_uTR" "$acc_table_RM" "$acc_table_TRF"

for error_ratio in "${listError[@]}"
do
    for units_name in "${listPattern[@]}"
    do
        # Split e.g. "AAG_AG" into the separate unit arguments "AAG" "AG"
        IFS='_' read -r -a units <<< "$units_name"
        min_a=(10)
        max_a=(200)
        for min_unit_occ in "${min_a[@]}"
        do
            for max_unit_occ in "${max_a[@]}"
            do
                run_name=$units_name"_"$min_unit_occ"_"$max_unit_occ"_"$error_ratio
                echo "$run_name"

                # Generate data of tandem repeats for the given pattern
                TR_file=$run_name".fasta"
                numUnits=${#units[@]}
                "$gen_units" -k "$min_unit_occ" -l "$max_unit_occ" -n "$num_TRs" -e "$error_ratio" -m "$numUnits" "${units[@]}" > "$TR_file"

                # Run uTR
                TR_uTR_decomp_file=$run_name"_decomp_uTR.fasta"
                uTR_stat=$run_name"_unit_stat_uTR.txt"
                "$uTR" -f "$TR_file" "$uTR_param" -o "$TR_uTR_decomp_file" 1>> "$uTR_stat" 2>> "$wallclock_uTR"
                rm -f "$uTR_stat"
                # Compute the accuracy of uTR
                echo -n -e "$run_name " >> "$acc_table_uTR"
                "$check_uTR" -i "$TR_uTR_decomp_file" >> "$acc_table_uTR"

                # Run RepeatMasker
                parse_RM=$run_name".fasta.result.txt"
                match_RM=$run_name".fasta.match.txt"
                # Apply RepeatMasker to the string set
                SECONDS=0
                repeatmasker -e hmmer -noint -pa 4 -div 0 -xsmall "$TR_file" &> progress_report.txt
                run_time=$SECONDS
                echo -n -e "$run_name " >> "$wallclock_RM"
                echo "$run_time" >> "$wallclock_RM"
                #repeatmasker -e hmmer -noint -pa 4 -div 20 -xsmall $TR_file &> progress_report.txt
                # -e(ngine) [crossmatch|wublast|abblast|ncbi|rmblast|hmmer]
                # -noint Only masks low complex/simple repeats (no interspersed repeats)
                # -pa 4 The number of sequence batch jobs [50kb minimum] to run in parallel.
                # -div [number] Masks only those repeats < x percent diverged from consensus seq
                # -xsmall Returns repetitive regions in lowercase (rest capitals) rather than masked
                parse_RepeatMasker/parseRM -i "$TR_file.out" -o "$parse_RM"
                # Parse a repeatmasker .out file and output the result
                # Remove temporary files
                rm -f "$TR_file.cat"
                rm -f "$TR_file.tbl"
                rm -f "$TR_file.out"
                rm -f "$TR_file.masked"
                rm -f progress_report.txt
                # Parse the file to determine how RepeatMasker could identify mosaic tandem repeats
                check_RepeatMasker/checkRM -i "$parse_RM" -p > "$match_RM"
                # Print out the statistics
                echo -n -e "$run_name " >> "$acc_table_RM"
                tail -n1 "$match_RM" >> "$acc_table_RM"

                # Run TRF
                result_TRF=$run_name"_TRF.txt"
                accuracy_TRF=$run_name"_TRF_acc.txt"
                # Apply TRF to the string set
                SECONDS=0
                trf "$TR_file" 2 7 7 80 10 10 1000 -h -ngs > "$result_TRF"
                #trf $TR_file 2 7 7 80 10 50 500 -h -ngs > $result_TRF
                run_time=$SECONDS
                echo -n -e "$run_name " >> "$wallclock_TRF"
                echo "$run_time" >> "$wallclock_TRF"
                "$checkTRF" -i "$result_TRF" -o "$accuracy_TRF"
                # Print out the statistics
                echo -n -e "$run_name " >> "$acc_table_TRF"
                tail -n1 "$accuracy_TRF" >> "$acc_table_TRF"

                mv "$TR_file" "$tmp"
                mv "$TR_uTR_decomp_file" "$tmp"
                mv "$parse_RM" "$tmp"
                mv "$match_RM" "$tmp"
                mv "$result_TRF" "$tmp"
                mv "$accuracy_TRF" "$tmp"
            done
        done
    done
done

mv "$wallclock_uTR" "$tmp"
mv "$wallclock_RM" "$tmp"
mv "$wallclock_TRF" "$tmp"
mv "$acc_table_uTR" "$tmp"
mv "$acc_table_RM" "$tmp"
mv "$acc_table_TRF" "$tmp"

exit 0
120 | -------------------------------------------------------------------------------- /wrap_around_DP.c: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | #include 4 | #include 5 | #include 6 | #include 7 | #include "uTR.h" 8 | 9 | //#define DEBUG_wrap_around_DP 10 | int
wrap_around_DP(char *a_rep_unit, char *a_rep, int *t_mat, int *t_mis, int *t_ins, int *t_del){ // wraparound DP for handling a single unit 11 | 12 | int unit_len, rep_len; 13 | for(unit_len=0; a_rep_unit[unit_len] != '\0'; unit_len++){} 14 | for(rep_len=0; a_rep[rep_len] != '\0'; rep_len++){} 15 | 16 | // Move 0-origin to 1-origin 17 | char *rep_unit = (char *) malloc(sizeof(char) * (unit_len+2)); 18 | char *rep = (char *) malloc(sizeof(char) * (rep_len +2)); 19 | for(int i=0; i 1){ 46 | val_deletion = WrapDP[next*i + j-1] - INDEL_PENALTY; 47 | WrapDP[next*i + j] = MAX(0, MAX( MAX( val_mismatch, val_insertion), val_deletion)); 48 | }else{ 49 | WrapDP[next*i + j] = MAX(0, MAX( val_mismatch, val_insertion)); 50 | } 51 | } 52 | if(max_wrd < WrapDP[next*i + j]) 53 | { 54 | max_wrd = WrapDP[next*i + j]; 55 | max_i = i; 56 | max_j = j; 57 | } 58 | } 59 | // wrap around 60 | WrapDP[next*i + 0] = WrapDP[next*i + unit_len]; 61 | } 62 | #ifdef DEBUG_wrap_around_DP 63 | fprintf(stderr, "%s\t%s\n", rep_unit, rep); 64 | fprintf(stderr, "max_wrd=%d\tmax_i=%d\tmax_j=%d\n", max_wrd, max_i, max_j); 65 | #endif 66 | 67 | // trace back the optimal alignment while storing it in the data structure "alignment" 68 | int Num_matches = 0; 69 | int Num_mismatches = 0; 70 | int Num_insertions = 0; 71 | int Num_deletions = 0; 72 | int Num_scanned_unit = 0; 73 | 74 | i = max_i; 75 | j = max_j; 76 | if(j == 0){ j = unit_len; } // 1-origin index 77 | int answer; 78 | //answer = max_j % unit_len; // When max_j+1 be the begin of the repeat unit according to the 1-based indexing, max_j is the begin in the 0-based indexing. 
79 | 80 | while(i > 0 && WrapDP[next*i + j] > 0){ // global alignment 81 | val_match = WrapDP[next*(i-1) + j-1] + MATCH_GAIN; 82 | val_mismatch = WrapDP[next*(i-1) + j-1] - MISMATCH_PENALTY; 83 | val_insertion = WrapDP[next*(i-1) + j] - INDEL_PENALTY; 84 | val_deletion = WrapDP[next*i + j-1] - INDEL_PENALTY; 85 | 86 | if( max_wrd == val_match && rep[i] == rep_unit[j]){ 87 | max_wrd -= MATCH_GAIN; 88 | i--; j--; 89 | Num_matches++; 90 | Num_scanned_unit++; 91 | }else if( max_wrd == val_mismatch && rep[i] != rep_unit[j]){ // mismatch 92 | max_wrd += MISMATCH_PENALTY; 93 | i--; j--; 94 | Num_mismatches++; 95 | Num_scanned_unit++; 96 | }else if( max_wrd == val_deletion){ // deletion 97 | max_wrd += INDEL_PENALTY; 98 | j--; 99 | Num_deletions++; // Num_insertions++; 100 | Num_scanned_unit++; 101 | }else if( max_wrd == val_insertion){ // insertion 102 | max_wrd += INDEL_PENALTY; 103 | i--; 104 | Num_insertions++; 105 | //Num_scanned_unit++; // The base of the repeat unit is skipped. 106 | }else if( max_wrd == 0){ 107 | break; 108 | }else{ 109 | fprintf(stderr, "fatal error in wrap-around DP max_wrd = %i\n", max_wrd); 110 | exit(EXIT_FAILURE); 111 | } 112 | if(j == 0){ 113 | j = unit_len; 114 | } 115 | } 116 | *t_mat = Num_matches; 117 | *t_mis = Num_mismatches; 118 | *t_ins = Num_insertions; 119 | *t_del = Num_deletions; 120 | free(rep_unit); 121 | free(rep); 122 | answer = j % unit_len; // When j+1 be the begin of the repeat unit according to the 1-based indexing, j%unit_len is the begin in the 0-based indexing. 123 | return(answer); 124 | } 125 | -------------------------------------------------------------------------------- /nsop_test/uTR/uTR.h: -------------------------------------------------------------------------------- 1 | /* 2 | Copyright (c) 2019, Shinichi Morishita 3 | All rights reserved. 4 | 5 | Redistribution and use in source and binary forms, with or without 6 | modification, are permitted provided that the following conditions are met: 7 | 8 | 1. 
Redistributions of source code must retain the above copyright notice, this 9 | list of conditions and the following disclaimer. 10 | 2. Redistributions in binary form must reproduce the above copyright notice, 11 | this list of conditions and the following disclaimer in the documentation 12 | and/or other materials provided with the distribution. 13 | 14 | THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND 15 | ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED 16 | WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE 17 | DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR 18 | ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES 19 | (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; 20 | LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND 21 | ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 22 | (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS 23 | SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 24 | 25 | The views and conclusions contained in the software and documentation are those 26 | of the authors and should not be interpreted as representing official policies, 27 | either expressed or implied, of the FreeBSD Project. 
28 | */ 29 | 30 | #include 31 | 32 | // Key default parameters 33 | #define MIN_unit_occupancy_ratio 0.8 34 | #define MIN_unit_length 2 35 | #define TOP_k_units 10 // Print top k units for debugging 36 | 37 | // Internal variables and data structures 38 | #define MAX_NUMBER_READS 1000 39 | #define MAX_READ_LENGTH 20000 40 | #define MAX_ID_LENGTH 10000 41 | #define BLK 4096 42 | #define MAX_NUMBER_UNITS 3000000 43 | #define MAX_UNIT_LENGTH 20 // 100 44 | int repUnitLen; 45 | int read_cnt; 46 | char *nextReadID; 47 | 48 | // Parameters for processing long units 49 | #define LONG_UNIT_LEN_TH 20 //20 50 | #define WINDOW_LEN 5 //5 51 | #define MAX_DIS_RATIO 0.05 // The maximum discrepancy ratio 52 | 53 | // Parameters for controling bounded alignment among long units 54 | #define MATCH 1 55 | #define MISMATCH -1 56 | #define INDEL -1 57 | #define Mosaic_tandem_repeat 0 58 | #define Mosaic_repeat 1 59 | 60 | typedef struct{ 61 | int len; 62 | char string[MAX_READ_LENGTH]; 63 | char ID[MAX_ID_LENGTH]; 64 | int numKeyUnits; 65 | int mosaic_mode; //Mosaic_tandem_repeat or Mosaic_repeat 66 | } Read; 67 | 68 | typedef struct{ 69 | int len; 70 | int numReads; 71 | int numKeyUnits; 72 | int mosaic_mode; //Mosaic_tandem_repeat or Mosaic_repeat 73 | } QualifiedRead; 74 | QualifiedRead *Qreads; 75 | 76 | typedef struct{ 77 | char *string; // MAX_UNIT_LENGTH 78 | int ID; // quadratic number 79 | int len; 80 | int sumOccurrences; 81 | int prio; 82 | int *covered; // MAX_READ_LENGTH 83 | } Unit; // The priority 1,2,... in the set of mosaic repeat units. -1 if the unit is not in the set. 
84 | Unit *Units; 85 | int unit_cnt; 86 | 87 | Unit *GlobalUnits; 88 | int global_unit_cnt; 89 | 90 | FILE* init_handle_one_file(char *inputFile); 91 | void return_one_read(FILE *fp, Read *currentRead); 92 | 93 | void malloc_Units(); 94 | void free_Units(); 95 | void clear_Units_incrementally(); 96 | 97 | void malloc_GlobalUnits(); 98 | void free_GlobalUnits(); 99 | void put_into_GlobalUnits(char *tmpUnit); 100 | void print_GlobalUnits(); 101 | 102 | char int2char(int i); 103 | void put_repUnit(char *repUnit); 104 | 105 | void SA_IS(unsigned char *s, int *SA, int n, int K, int cs); 106 | 107 | void coverage_by_units(char *S, int MIN_number_repetitions); 108 | int set_cover_greedy(FILE *ofp, char *S, int MIN_number_repetitions); 109 | 110 | 111 | // Interface between C and C++ functions 112 | #ifndef __CSUB_H__ 113 | #define __CSUB_H__ 1 114 | #ifdef __cplusplus 115 | extern "C" { 116 | #endif /* __cplusplus */ 117 | // C function called from a C++ function 118 | extern void put_unit(char *unit); 119 | extern void retain_top_k_units(int topK); 120 | extern int char2int(char c); 121 | extern void match_bounded_DP_traceback(char *s0, int n0, char *s1, int n1, int *covered); 122 | // C++ function called from a C function 123 | extern void get_non_self_overlapping_prefixes(char *aString); 124 | extern void put_all_substrings(char *aS); 125 | extern int lzp(char *aString); 126 | extern void count_occurrences_long_unit(char *S, int n, int *SA, int *C, int **OCC, char *unit, int unitLen, int *tmpCovered, int MIN_number_repetitions ); 127 | #ifdef __cplusplus 128 | } 129 | #endif /* __cplusplus */ 130 | #endif /* __CSUB_H__ */ 131 | 132 | 133 | // External functions 134 | #define MIN(a, b) ((a) < (b) ? (a) : (b)) 135 | #define MAX(a, b) ((a) > (b) ? (a) : (b)) 136 | #define DIFF(x, y) ((x) > (y) ? ((x) - (y)) : ((y) - (x))) 137 | 138 | // Debug Mode 139 | //#define DEBUG_EDDC_mode // Show the frequency of each init. 
140 | 141 | 142 | float time_get_non_self_overlapping_prefixes, time_coverage_by_units, time_set_cover_greedy; 143 | 144 | 145 | -------------------------------------------------------------------------------- /test_public/check_RepeatMasker/main.c: -------------------------------------------------------------------------------- 1 | // rTR: Retrieve TRs from a RepeatMasker output file 2 | 3 | #include 4 | #include 5 | #include 6 | #include 7 | #include 8 | #include 9 | #include 10 | 11 | #define BLK 4096 12 | #define OCC_DIFF 1 13 | #define DEBUG 14 | 15 | void rotate(int start, char *s1, int n1, char *rotated_s1){ 16 | for(int i=0; i 4 | #include 5 | #include 6 | #include 7 | #include 8 | #include 9 | #include 10 | 11 | #define BLK 4096 12 | #define largeBLK 40000 13 | #define OCC_DIFF 1 14 | #define numUnits 500 15 | #define maxUnitLen 200 16 | 17 | void rotate(int start, char *s1, int n1, char *rotated_s1){ 18 | for(int i=0; i 2 | #include 3 | #include 4 | #include 5 | #include 6 | #include 7 | #include 8 | 9 | #define BLK 100000 10 | #define OCC_DIFF 1 11 | 12 | //#define DEBUG 13 | 14 | void rotate(int start, char *s1, int n1, char *rotated_s1){ 15 | for(int i=0; i with ( and ), respectively. 
111 | for(int j=0; j' || s1_pat[j]==')' ) s1_pat[j]=' '; 113 | for(int j=0; j' || s2_pat[j]==')' ) s2_pat[j]=' '; 115 | } 116 | 117 | //#define DEBUG_main 118 | int main(int argc, char *argv[]) 119 | { 120 | int opt; 121 | char inputFile[1000]; 122 | float allowance = 0.1; // default 123 | while ((opt = getopt(argc, argv, "i:a:")) != -1) { 124 | switch(opt){ 125 | case 'i': 126 | strcpy(inputFile, optarg); break; 127 | case 'a': 128 | sscanf(optarg, "%f", &allowance); break; 129 | default: 130 | fprintf(stderr, "Usage: -i (file name) -a allowance (e.g., 0.01 by default)\n"); 131 | exit(EXIT_FAILURE); 132 | } 133 | } 134 | FILE *fp = fopen(inputFile, "r"); 135 | 136 | // Input fasta file begins with the form: 137 | // > (60,1,0.00) 1212 (AAG)12(AG)12 138 | 139 | char *s = (char *) malloc( sizeof(char) * BLK ); 140 | char *s1_pat= (char *) malloc( sizeof(char) * BLK ); 141 | char *s2_pat= (char *) malloc( sizeof(char) * BLK ); 142 | 143 | int i=0, j=0, sum_len=0; 144 | for(; ;){ 145 | if( fgets(s, BLK, fp) != NULL){ 146 | if(s[0] == '>'){ 147 | sscanf(s, "> %[^\0]", s); 148 | strcpy(s1_pat, ""); strcpy(s2_pat, ""); 149 | ID2patterns(s, s1_pat, s2_pat); 150 | #ifdef DEBUG_main 151 | printf("%s\t%s\t%s\n", s, s1_pat, s2_pat); 152 | #endif 153 | if(strcmp(s1_pat,"") != 0 && strcmp(s2_pat,"") != 0 ){ 154 | i++; 155 | if(pattern_cmp(s1_pat,s2_pat,allowance) == 1) j++; 156 | } 157 | }else{ 158 | int k; 159 | for(k=0; s[k]!='\0'; k++); 160 | sum_len += k; 161 | } 162 | }else{ 163 | break; 164 | } 165 | } 166 | printf("%d %d %d\n", j, i, sum_len/i); 167 | 168 | free(s); free(s1_pat); free(s2_pat); 169 | return EXIT_SUCCESS; 170 | } 171 | -------------------------------------------------------------------------------- /MT.h: -------------------------------------------------------------------------------- 1 | /* 2 | A C-program for MT19937, with initialization improved 2002/1/26. 3 | Coded by Takuji Nishimura and Makoto Matsumoto. 
4 | 5 | Before using, initialize the state by using init_genrand(seed) 6 | or init_by_array(init_key, key_length). 7 | 8 | Copyright (C) 1997 - 2002, Makoto Matsumoto and Takuji Nishimura, 9 | All rights reserved. 10 | 11 | Redistribution and use in source and binary forms, with or without 12 | modification, are permitted provided that the following conditions 13 | are met: 14 | 15 | 1. Redistributions of source code must retain the above copyright 16 | notice, this list of conditions and the following disclaimer. 17 | 18 | 2. Redistributions in binary form must reproduce the above copyright 19 | notice, this list of conditions and the following disclaimer in the 20 | documentation and/or other materials provided with the distribution. 21 | 22 | 3. The names of its contributors may not be used to endorse or promote 23 | products derived from this software without specific prior written 24 | permission. 25 | 26 | THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS 27 | "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT 28 | LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR 29 | A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR 30 | CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, 31 | EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, 32 | PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR 33 | PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF 34 | LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING 35 | NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS 36 | SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 37 | 38 | 39 | Any feedback is very welcome. 
40 | http://www.math.sci.hiroshima-u.ac.jp/~m-mat/MT/emt.html 41 | email: m-mat @ math.sci.hiroshima-u.ac.jp (remove space) 42 | */ 43 | 44 | /* 45 | The original version of http://www.math.sci.hiroshima-u.ac.jp/~m-mat/MT/MT2002/CODES/mt19937ar.c was modified by Takahiro Omi as 46 | - delete line 47 "#include<stdio.h>" 47 | - delete line 174 int main(void){...} 48 | - change N -> MT_N 49 | - change N -> MT_N 50 | - change the file name "mt19937ar.c" -> "MT.h" 51 | */ 52 | 53 | 54 | /* Period parameters */ 55 | #define MT_N 624 56 | #define MT_M 397 57 | #define MATRIX_A 0x9908b0dfUL /* constant vector a */ 58 | #define UPPER_MASK 0x80000000UL /* most significant w-r bits */ 59 | #define LOWER_MASK 0x7fffffffUL /* least significant r bits */ 60 | 61 | static unsigned long mt[MT_N]; /* the array for the state vector */ 62 | static int mti=MT_N+1; /* mti==MT_N+1 means mt[MT_N] is not initialized */ 63 | 64 | /* initializes mt[MT_N] with a seed */ 65 | void init_genrand(unsigned long s) 66 | { 67 | mt[0]= s & 0xffffffffUL; 68 | for (mti=1; mti<MT_N; mti++) { 69 | mt[mti] = 70 | (1812433253UL * (mt[mti-1] ^ (mt[mti-1] >> 30)) + mti); 71 | /* See Knuth TAOCP Vol2. 3rd Ed. P.106 for multiplier. */ 72 | /* In the previous versions, MSBs of the seed affect */ 73 | /* only MSBs of the array mt[]. */ 74 | /* 2002/01/09 modified by Makoto Matsumoto */ 75 | mt[mti] &= 0xffffffffUL; 76 | /* for >32 bit machines */ 77 | } 78 | } 79 | 80 | /* initialize by an array with array-length */ 81 | /* init_key is the array for initializing keys */ 82 | /* key_length is its length */ 83 | /* slight change for C++, 2004/2/26 */ 84 | void init_by_array(unsigned long init_key[], int key_length) 85 | { 86 | int i, j, k; 87 | init_genrand(19650218UL); 88 | i=1; j=0; 89 | k = (MT_N>key_length ? 
MT_N : key_length); 90 | for (; k; k--) { 91 | mt[i] = (mt[i] ^ ((mt[i-1] ^ (mt[i-1] >> 30)) * 1664525UL)) 92 | + init_key[j] + j; /* non linear */ 93 | mt[i] &= 0xffffffffUL; /* for WORDSIZE > 32 machines */ 94 | i++; j++; 95 | if (i>=MT_N) { mt[0] = mt[MT_N-1]; i=1; } 96 | if (j>=key_length) j=0; 97 | } 98 | for (k=MT_N-1; k; k--) { 99 | mt[i] = (mt[i] ^ ((mt[i-1] ^ (mt[i-1] >> 30)) * 1566083941UL)) 100 | - i; /* non linear */ 101 | mt[i] &= 0xffffffffUL; /* for WORDSIZE > 32 machines */ 102 | i++; 103 | if (i>=MT_N) { mt[0] = mt[MT_N-1]; i=1; } 104 | } 105 | 106 | mt[0] = 0x80000000UL; /* MSB is 1; assuring non-zero initial array */ 107 | } 108 | 109 | /* generates a random number on [0,0xffffffff]-interval */ 110 | unsigned long genrand_int32(void) 111 | { 112 | unsigned long y; 113 | static unsigned long mag01[2]={0x0UL, MATRIX_A}; 114 | /* mag01[x] = x * MATRIX_A for x=0,1 */ 115 | 116 | if (mti >= MT_N) { /* generate N words at one time */ 117 | int kk; 118 | 119 | if (mti == MT_N+1) /* if init_genrand() has not been called, */ 120 | init_genrand(5489UL); /* a default initial seed is used */ 121 | 122 | for (kk=0;kk<MT_N-MT_M;kk++) { 123 | y = (mt[kk]&UPPER_MASK)|(mt[kk+1]&LOWER_MASK); 124 | mt[kk] = mt[kk+MT_M] ^ (y >> 1) ^ mag01[y & 0x1UL]; 125 | } 126 | for (;kk<MT_N-1;kk++) { 127 | y = (mt[kk]&UPPER_MASK)|(mt[kk+1]&LOWER_MASK); 128 | mt[kk] = mt[kk+(MT_M-MT_N)] ^ (y >> 1) ^ mag01[y & 0x1UL]; 129 | } 130 | y = (mt[MT_N-1]&UPPER_MASK)|(mt[0]&LOWER_MASK); 131 | mt[MT_N-1] = mt[MT_M-1] ^ (y >> 1) ^ mag01[y & 0x1UL]; 132 | 133 | mti = 0; 134 | } 135 | 136 | y = mt[mti++]; 137 | 138 | /* Tempering */ 139 | y ^= (y >> 11); 140 | y ^= (y << 7) & 0x9d2c5680UL; 141 | y ^= (y << 15) & 0xefc60000UL; 142 | y ^= (y >> 18); 143 | 144 | return y; 145 | } 146 | 147 | /* generates a random number on [0,0x7fffffff]-interval */ 148 | long genrand_int31(void) 149 | { 150 | return (long)(genrand_int32()>>1); 151 | } 152 | 153 | /* generates a random number on [0,1]-real-interval */ 154 | double genrand_real1(void) 155 | { 156 | return genrand_int32()*(1.0/4294967295.0); 157 | /* divided by 2^32-1 */ 158 | } 159 | 160 | /* generates a random number on [0,1)-real-interval */ 161 | double 
genrand_real2(void) 162 | { 163 | return genrand_int32()*(1.0/4294967296.0); 164 | /* divided by 2^32 */ 165 | } 166 | 167 | /* generates a random number on (0,1)-real-interval */ 168 | double genrand_real3(void) 169 | { 170 | return (((double)genrand_int32()) + 0.5)*(1.0/4294967296.0); 171 | /* divided by 2^32 */ 172 | } 173 | 174 | /* generates a random number on [0,1) with 53-bit resolution*/ 175 | double genrand_res53(void) 176 | { 177 | unsigned long a=genrand_int32()>>5, b=genrand_int32()>>6; 178 | return(a*67108864.0+b)*(1.0/9007199254740992.0); 179 | } 180 | /* These real versions are due to Isaku Wada, 2002/01/09 added */ 181 | -------------------------------------------------------------------------------- /nsop_test/gendata/MT.h: -------------------------------------------------------------------------------- 1 | /* 2 | A C-program for MT19937, with initialization improved 2002/1/26. 3 | Coded by Takuji Nishimura and Makoto Matsumoto. 4 | 5 | Before using, initialize the state by using init_genrand(seed) 6 | or init_by_array(init_key, key_length). 7 | 8 | Copyright (C) 1997 - 2002, Makoto Matsumoto and Takuji Nishimura, 9 | All rights reserved. 10 | 11 | Redistribution and use in source and binary forms, with or without 12 | modification, are permitted provided that the following conditions 13 | are met: 14 | 15 | 1. Redistributions of source code must retain the above copyright 16 | notice, this list of conditions and the following disclaimer. 17 | 18 | 2. Redistributions in binary form must reproduce the above copyright 19 | notice, this list of conditions and the following disclaimer in the 20 | documentation and/or other materials provided with the distribution. 21 | 22 | 3. The names of its contributors may not be used to endorse or promote 23 | products derived from this software without specific prior written 24 | permission. 
25 | 26 | THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS 27 | "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT 28 | LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR 29 | A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR 30 | CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, 31 | EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, 32 | PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR 33 | PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF 34 | LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING 35 | NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS 36 | SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 37 | 38 | 39 | Any feedback is very welcome. 40 | http://www.math.sci.hiroshima-u.ac.jp/~m-mat/MT/emt.html 41 | email: m-mat @ math.sci.hiroshima-u.ac.jp (remove space) 42 | */ 43 | 44 | /* 45 | The original version of http://www.math.sci.hiroshima-u.ac.jp/~m-mat/MT/MT2002/CODES/mt19937ar.c was modified by Takahiro Omi as 46 | - delete line 47 "#include<stdio.h>" 47 | - delete line 174 int main(void){...} 48 | - change N -> MT_N 49 | - change N -> MT_N 50 | - change the file name "mt19937ar.c" -> "MT.h" 51 | */ 52 | 53 | 54 | /* Period parameters */ 55 | #define MT_N 624 56 | #define MT_M 397 57 | #define MATRIX_A 0x9908b0dfUL /* constant vector a */ 58 | #define UPPER_MASK 0x80000000UL /* most significant w-r bits */ 59 | #define LOWER_MASK 0x7fffffffUL /* least significant r bits */ 60 | 61 | static unsigned long mt[MT_N]; /* the array for the state vector */ 62 | static int mti=MT_N+1; /* mti==MT_N+1 means mt[MT_N] is not initialized */ 63 | 64 | /* initializes mt[MT_N] with a seed */ 65 | void init_genrand(unsigned long s) 66 | { 67 | mt[0]= s & 0xffffffffUL; 68 | for (mti=1; mti<MT_N; mti++) { 69 | mt[mti] = 70 | (1812433253UL * (mt[mti-1] ^ (mt[mti-1] >> 30)) + mti); 71 | /* See Knuth TAOCP Vol2. 3rd Ed. P.106 for multiplier. 
*/ 72 | /* In the previous versions, MSBs of the seed affect */ 73 | /* only MSBs of the array mt[]. */ 74 | /* 2002/01/09 modified by Makoto Matsumoto */ 75 | mt[mti] &= 0xffffffffUL; 76 | /* for >32 bit machines */ 77 | } 78 | } 79 | 80 | /* initialize by an array with array-length */ 81 | /* init_key is the array for initializing keys */ 82 | /* key_length is its length */ 83 | /* slight change for C++, 2004/2/26 */ 84 | void init_by_array(unsigned long init_key[], int key_length) 85 | { 86 | int i, j, k; 87 | init_genrand(19650218UL); 88 | i=1; j=0; 89 | k = (MT_N>key_length ? MT_N : key_length); 90 | for (; k; k--) { 91 | mt[i] = (mt[i] ^ ((mt[i-1] ^ (mt[i-1] >> 30)) * 1664525UL)) 92 | + init_key[j] + j; /* non linear */ 93 | mt[i] &= 0xffffffffUL; /* for WORDSIZE > 32 machines */ 94 | i++; j++; 95 | if (i>=MT_N) { mt[0] = mt[MT_N-1]; i=1; } 96 | if (j>=key_length) j=0; 97 | } 98 | for (k=MT_N-1; k; k--) { 99 | mt[i] = (mt[i] ^ ((mt[i-1] ^ (mt[i-1] >> 30)) * 1566083941UL)) 100 | - i; /* non linear */ 101 | mt[i] &= 0xffffffffUL; /* for WORDSIZE > 32 machines */ 102 | i++; 103 | if (i>=MT_N) { mt[0] = mt[MT_N-1]; i=1; } 104 | } 105 | 106 | mt[0] = 0x80000000UL; /* MSB is 1; assuring non-zero initial array */ 107 | } 108 | 109 | /* generates a random number on [0,0xffffffff]-interval */ 110 | unsigned long genrand_int32(void) 111 | { 112 | unsigned long y; 113 | static unsigned long mag01[2]={0x0UL, MATRIX_A}; 114 | /* mag01[x] = x * MATRIX_A for x=0,1 */ 115 | 116 | if (mti >= MT_N) { /* generate N words at one time */ 117 | int kk; 118 | 119 | if (mti == MT_N+1) /* if init_genrand() has not been called, */ 120 | init_genrand(5489UL); /* a default initial seed is used */ 121 | 122 | for (kk=0;kk<MT_N-MT_M;kk++) { 123 | y = (mt[kk]&UPPER_MASK)|(mt[kk+1]&LOWER_MASK); 124 | mt[kk] = mt[kk+MT_M] ^ (y >> 1) ^ mag01[y & 0x1UL]; 125 | } 126 | for (;kk<MT_N-1;kk++) { 127 | y = (mt[kk]&UPPER_MASK)|(mt[kk+1]&LOWER_MASK); 128 | mt[kk] = mt[kk+(MT_M-MT_N)] ^ (y >> 1) ^ mag01[y & 0x1UL]; 129 | } 130 | y = (mt[MT_N-1]&UPPER_MASK)|(mt[0]&LOWER_MASK); 131 | mt[MT_N-1] = mt[MT_M-1] ^ (y >> 1) ^ mag01[y & 0x1UL]; 132 | 133 | mti = 0; 134 | } 135 | 136 | y = mt[mti++]; 137 | 138 | /* 
Tempering */ 139 | y ^= (y >> 11); 140 | y ^= (y << 7) & 0x9d2c5680UL; 141 | y ^= (y << 15) & 0xefc60000UL; 142 | y ^= (y >> 18); 143 | 144 | return y; 145 | } 146 | 147 | /* generates a random number on [0,0x7fffffff]-interval */ 148 | long genrand_int31(void) 149 | { 150 | return (long)(genrand_int32()>>1); 151 | } 152 | 153 | /* generates a random number on [0,1]-real-interval */ 154 | double genrand_real1(void) 155 | { 156 | return genrand_int32()*(1.0/4294967295.0); 157 | /* divided by 2^32-1 */ 158 | } 159 | 160 | /* generates a random number on [0,1)-real-interval */ 161 | double genrand_real2(void) 162 | { 163 | return genrand_int32()*(1.0/4294967296.0); 164 | /* divided by 2^32 */ 165 | } 166 | 167 | /* generates a random number on (0,1)-real-interval */ 168 | double genrand_real3(void) 169 | { 170 | return (((double)genrand_int32()) + 0.5)*(1.0/4294967296.0); 171 | /* divided by 2^32 */ 172 | } 173 | 174 | /* generates a random number on [0,1) with 53-bit resolution*/ 175 | double genrand_res53(void) 176 | { 177 | unsigned long a=genrand_int32()>>5, b=genrand_int32()>>6; 178 | return(a*67108864.0+b)*(1.0/9007199254740992.0); 179 | } 180 | /* These real versions are due to Isaku Wada, 2002/01/09 added */ 181 | -------------------------------------------------------------------------------- /nsop_test/uTR/MT.h: -------------------------------------------------------------------------------- 1 | /* 2 | A C-program for MT19937, with initialization improved 2002/1/26. 3 | Coded by Takuji Nishimura and Makoto Matsumoto. 4 | 5 | Before using, initialize the state by using init_genrand(seed) 6 | or init_by_array(init_key, key_length). 7 | 8 | Copyright (C) 1997 - 2002, Makoto Matsumoto and Takuji Nishimura, 9 | All rights reserved. 10 | 11 | Redistribution and use in source and binary forms, with or without 12 | modification, are permitted provided that the following conditions 13 | are met: 14 | 15 | 1. 
Redistributions of source code must retain the above copyright 16 | notice, this list of conditions and the following disclaimer. 17 | 18 | 2. Redistributions in binary form must reproduce the above copyright 19 | notice, this list of conditions and the following disclaimer in the 20 | documentation and/or other materials provided with the distribution. 21 | 22 | 3. The names of its contributors may not be used to endorse or promote 23 | products derived from this software without specific prior written 24 | permission. 25 | 26 | THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS 27 | "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT 28 | LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR 29 | A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR 30 | CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, 31 | EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, 32 | PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR 33 | PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF 34 | LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING 35 | NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS 36 | SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 37 | 38 | 39 | Any feedback is very welcome. 
40 | http://www.math.sci.hiroshima-u.ac.jp/~m-mat/MT/emt.html 41 | email: m-mat @ math.sci.hiroshima-u.ac.jp (remove space) 42 | */ 43 | 44 | /* 45 | The original version of http://www.math.sci.hiroshima-u.ac.jp/~m-mat/MT/MT2002/CODES/mt19937ar.c was modified by Takahiro Omi as 46 | - delete line 47 "#include<stdio.h>" 47 | - delete line 174 int main(void){...} 48 | - change N -> MT_N 49 | - change N -> MT_N 50 | - change the file name "mt19937ar.c" -> "MT.h" 51 | */ 52 | 53 | 54 | /* Period parameters */ 55 | #define MT_N 624 56 | #define MT_M 397 57 | #define MATRIX_A 0x9908b0dfUL /* constant vector a */ 58 | #define UPPER_MASK 0x80000000UL /* most significant w-r bits */ 59 | #define LOWER_MASK 0x7fffffffUL /* least significant r bits */ 60 | 61 | static unsigned long mt[MT_N]; /* the array for the state vector */ 62 | static int mti=MT_N+1; /* mti==MT_N+1 means mt[MT_N] is not initialized */ 63 | 64 | /* initializes mt[MT_N] with a seed */ 65 | void init_genrand(unsigned long s) 66 | { 67 | mt[0]= s & 0xffffffffUL; 68 | for (mti=1; mti<MT_N; mti++) { 69 | mt[mti] = 70 | (1812433253UL * (mt[mti-1] ^ (mt[mti-1] >> 30)) + mti); 71 | /* See Knuth TAOCP Vol2. 3rd Ed. P.106 for multiplier. */ 72 | /* In the previous versions, MSBs of the seed affect */ 73 | /* only MSBs of the array mt[]. */ 74 | /* 2002/01/09 modified by Makoto Matsumoto */ 75 | mt[mti] &= 0xffffffffUL; 76 | /* for >32 bit machines */ 77 | } 78 | } 79 | 80 | /* initialize by an array with array-length */ 81 | /* init_key is the array for initializing keys */ 82 | /* key_length is its length */ 83 | /* slight change for C++, 2004/2/26 */ 84 | void init_by_array(unsigned long init_key[], int key_length) 85 | { 86 | int i, j, k; 87 | init_genrand(19650218UL); 88 | i=1; j=0; 89 | k = (MT_N>key_length ? 
MT_N : key_length); 90 | for (; k; k--) { 91 | mt[i] = (mt[i] ^ ((mt[i-1] ^ (mt[i-1] >> 30)) * 1664525UL)) 92 | + init_key[j] + j; /* non linear */ 93 | mt[i] &= 0xffffffffUL; /* for WORDSIZE > 32 machines */ 94 | i++; j++; 95 | if (i>=MT_N) { mt[0] = mt[MT_N-1]; i=1; } 96 | if (j>=key_length) j=0; 97 | } 98 | for (k=MT_N-1; k; k--) { 99 | mt[i] = (mt[i] ^ ((mt[i-1] ^ (mt[i-1] >> 30)) * 1566083941UL)) 100 | - i; /* non linear */ 101 | mt[i] &= 0xffffffffUL; /* for WORDSIZE > 32 machines */ 102 | i++; 103 | if (i>=MT_N) { mt[0] = mt[MT_N-1]; i=1; } 104 | } 105 | 106 | mt[0] = 0x80000000UL; /* MSB is 1; assuring non-zero initial array */ 107 | } 108 | 109 | /* generates a random number on [0,0xffffffff]-interval */ 110 | unsigned long genrand_int32(void) 111 | { 112 | unsigned long y; 113 | static unsigned long mag01[2]={0x0UL, MATRIX_A}; 114 | /* mag01[x] = x * MATRIX_A for x=0,1 */ 115 | 116 | if (mti >= MT_N) { /* generate N words at one time */ 117 | int kk; 118 | 119 | if (mti == MT_N+1) /* if init_genrand() has not been called, */ 120 | init_genrand(5489UL); /* a default initial seed is used */ 121 | 122 | for (kk=0;kk<MT_N-MT_M;kk++) { 123 | y = (mt[kk]&UPPER_MASK)|(mt[kk+1]&LOWER_MASK); 124 | mt[kk] = mt[kk+MT_M] ^ (y >> 1) ^ mag01[y & 0x1UL]; 125 | } 126 | for (;kk<MT_N-1;kk++) { 127 | y = (mt[kk]&UPPER_MASK)|(mt[kk+1]&LOWER_MASK); 128 | mt[kk] = mt[kk+(MT_M-MT_N)] ^ (y >> 1) ^ mag01[y & 0x1UL]; 129 | } 130 | y = (mt[MT_N-1]&UPPER_MASK)|(mt[0]&LOWER_MASK); 131 | mt[MT_N-1] = mt[MT_M-1] ^ (y >> 1) ^ mag01[y & 0x1UL]; 132 | 133 | mti = 0; 134 | } 135 | 136 | y = mt[mti++]; 137 | 138 | /* Tempering */ 139 | y ^= (y >> 11); 140 | y ^= (y << 7) & 0x9d2c5680UL; 141 | y ^= (y << 15) & 0xefc60000UL; 142 | y ^= (y >> 18); 143 | 144 | return y; 145 | } 146 | 147 | /* generates a random number on [0,0x7fffffff]-interval */ 148 | long genrand_int31(void) 149 | { 150 | return (long)(genrand_int32()>>1); 151 | } 152 | 153 | /* generates a random number on [0,1]-real-interval */ 154 | double genrand_real1(void) 155 | { 156 | return genrand_int32()*(1.0/4294967295.0); 157 | /* divided by 2^32-1 */ 158 | } 159 | 160 | /* generates a random number on [0,1)-real-interval */ 161 | double 
genrand_real2(void) 162 | { 163 | return genrand_int32()*(1.0/4294967296.0); 164 | /* divided by 2^32 */ 165 | } 166 | 167 | /* generates a random number on (0,1)-real-interval */ 168 | double genrand_real3(void) 169 | { 170 | return (((double)genrand_int32()) + 0.5)*(1.0/4294967296.0); 171 | /* divided by 2^32 */ 172 | } 173 | 174 | /* generates a random number on [0,1) with 53-bit resolution*/ 175 | double genrand_res53(void) 176 | { 177 | unsigned long a=genrand_int32()>>5, b=genrand_int32()>>6; 178 | return(a*67108864.0+b)*(1.0/9007199254740992.0); 179 | } 180 | /* These real versions are due to Isaku Wada, 2002/01/09 added */ 181 | -------------------------------------------------------------------------------- /test_public/gendata/MT.h: -------------------------------------------------------------------------------- 1 | /* 2 | A C-program for MT19937, with initialization improved 2002/1/26. 3 | Coded by Takuji Nishimura and Makoto Matsumoto. 4 | 5 | Before using, initialize the state by using init_genrand(seed) 6 | or init_by_array(init_key, key_length). 7 | 8 | Copyright (C) 1997 - 2002, Makoto Matsumoto and Takuji Nishimura, 9 | All rights reserved. 10 | 11 | Redistribution and use in source and binary forms, with or without 12 | modification, are permitted provided that the following conditions 13 | are met: 14 | 15 | 1. Redistributions of source code must retain the above copyright 16 | notice, this list of conditions and the following disclaimer. 17 | 18 | 2. Redistributions in binary form must reproduce the above copyright 19 | notice, this list of conditions and the following disclaimer in the 20 | documentation and/or other materials provided with the distribution. 21 | 22 | 3. The names of its contributors may not be used to endorse or promote 23 | products derived from this software without specific prior written 24 | permission. 
25 | 26 | THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS 27 | "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT 28 | LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR 29 | A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR 30 | CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, 31 | EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, 32 | PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR 33 | PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF 34 | LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING 35 | NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS 36 | SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 37 | 38 | 39 | Any feedback is very welcome. 40 | http://www.math.sci.hiroshima-u.ac.jp/~m-mat/MT/emt.html 41 | email: m-mat @ math.sci.hiroshima-u.ac.jp (remove space) 42 | */ 43 | 44 | /* 45 | The original version of http://www.math.sci.hiroshima-u.ac.jp/~m-mat/MT/MT2002/CODES/mt19937ar.c was modified by Takahiro Omi as 46 | - delete line 47 "#include <stdio.h>" 47 | - delete line 174 int main(void){...} 48 | - change N -> MT_N 49 | - change M -> MT_M 50 | - change the file name "mt19937ar.c" -> "MT.h" 51 | */ 52 | 53 | 54 | /* Period parameters */ 55 | #define MT_N 624 56 | #define MT_M 397 57 | #define MATRIX_A 0x9908b0dfUL /* constant vector a */ 58 | #define UPPER_MASK 0x80000000UL /* most significant w-r bits */ 59 | #define LOWER_MASK 0x7fffffffUL /* least significant r bits */ 60 | 61 | static unsigned long mt[MT_N]; /* the array for the state vector */ 62 | static int mti=MT_N+1; /* mti==MT_N+1 means mt[MT_N] is not initialized */ 63 | 64 | /* initializes mt[MT_N] with a seed */ 65 | void init_genrand(unsigned long s) 66 | { 67 | mt[0]= s & 0xffffffffUL; 68 | for (mti=1; mti> 30)) + mti); 71 | /* See Knuth TAOCP Vol2. 3rd Ed. P.106 for multiplier.
*/ 72 | /* In the previous versions, MSBs of the seed affect */ 73 | /* only MSBs of the array mt[]. */ 74 | /* 2002/01/09 modified by Makoto Matsumoto */ 75 | mt[mti] &= 0xffffffffUL; 76 | /* for >32 bit machines */ 77 | } 78 | } 79 | 80 | /* initialize by an array with array-length */ 81 | /* init_key is the array for initializing keys */ 82 | /* key_length is its length */ 83 | /* slight change for C++, 2004/2/26 */ 84 | void init_by_array(unsigned long init_key[], int key_length) 85 | { 86 | int i, j, k; 87 | init_genrand(19650218UL); 88 | i=1; j=0; 89 | k = (MT_N>key_length ? MT_N : key_length); 90 | for (; k; k--) { 91 | mt[i] = (mt[i] ^ ((mt[i-1] ^ (mt[i-1] >> 30)) * 1664525UL)) 92 | + init_key[j] + j; /* non linear */ 93 | mt[i] &= 0xffffffffUL; /* for WORDSIZE > 32 machines */ 94 | i++; j++; 95 | if (i>=MT_N) { mt[0] = mt[MT_N-1]; i=1; } 96 | if (j>=key_length) j=0; 97 | } 98 | for (k=MT_N-1; k; k--) { 99 | mt[i] = (mt[i] ^ ((mt[i-1] ^ (mt[i-1] >> 30)) * 1566083941UL)) 100 | - i; /* non linear */ 101 | mt[i] &= 0xffffffffUL; /* for WORDSIZE > 32 machines */ 102 | i++; 103 | if (i>=MT_N) { mt[0] = mt[MT_N-1]; i=1; } 104 | } 105 | 106 | mt[0] = 0x80000000UL; /* MSB is 1; assuring non-zero initial array */ 107 | } 108 | 109 | /* generates a random number on [0,0xffffffff]-interval */ 110 | unsigned long genrand_int32(void) 111 | { 112 | unsigned long y; 113 | static unsigned long mag01[2]={0x0UL, MATRIX_A}; 114 | /* mag01[x] = x * MATRIX_A for x=0,1 */ 115 | 116 | if (mti >= MT_N) { /* generate N words at one time */ 117 | int kk; 118 | 119 | if (mti == MT_N+1) /* if init_genrand() has not been called, */ 120 | init_genrand(5489UL); /* a default initial seed is used */ 121 | 122 | for (kk=0;kk> 1) ^ mag01[y & 0x1UL]; 125 | } 126 | for (;kk> 1) ^ mag01[y & 0x1UL]; 129 | } 130 | y = (mt[MT_N-1]&UPPER_MASK)|(mt[0]&LOWER_MASK); 131 | mt[MT_N-1] = mt[MT_M-1] ^ (y >> 1) ^ mag01[y & 0x1UL]; 132 | 133 | mti = 0; 134 | } 135 | 136 | y = mt[mti++]; 137 | 138 | /* 
Tempering */ 139 | y ^= (y >> 11); 140 | y ^= (y << 7) & 0x9d2c5680UL; 141 | y ^= (y << 15) & 0xefc60000UL; 142 | y ^= (y >> 18); 143 | 144 | return y; 145 | } 146 | 147 | /* generates a random number on [0,0x7fffffff]-interval */ 148 | long genrand_int31(void) 149 | { 150 | return (long)(genrand_int32()>>1); 151 | } 152 | 153 | /* generates a random number on [0,1]-real-interval */ 154 | double genrand_real1(void) 155 | { 156 | return genrand_int32()*(1.0/4294967295.0); 157 | /* divided by 2^32-1 */ 158 | } 159 | 160 | /* generates a random number on [0,1)-real-interval */ 161 | double genrand_real2(void) 162 | { 163 | return genrand_int32()*(1.0/4294967296.0); 164 | /* divided by 2^32 */ 165 | } 166 | 167 | /* generates a random number on (0,1)-real-interval */ 168 | double genrand_real3(void) 169 | { 170 | return (((double)genrand_int32()) + 0.5)*(1.0/4294967296.0); 171 | /* divided by 2^32 */ 172 | } 173 | 174 | /* generates a random number on [0,1) with 53-bit resolution*/ 175 | double genrand_res53(void) 176 | { 177 | unsigned long a=genrand_int32()>>5, b=genrand_int32()>>6; 178 | return(a*67108864.0+b)*(1.0/9007199254740992.0); 179 | } 180 | /* These real versions are due to Isaku Wada, 2002/01/09 added */ 181 | -------------------------------------------------------------------------------- /SAIS.c: -------------------------------------------------------------------------------- 1 | //------------------------------------------------------------- 2 | // Induced Sorting 3 | // Reference: Nong, Ge, Sen Zhang, and Wai Hong Chan. "Two efficient algorithms for linear time suffix array construction.” IEEE Transactions on Computers, 60.10 (2011): 1471-1484. 4 | // To compile, perform: gcc -std=c99 SAIS.c -o SAIS 5 | // To run, SALS . 
e.g., SAIS 100 6 | //------------------------------------------------------------- 7 | 8 | #include 9 | #include 10 | #include 11 | #include 12 | #include "uTR.h" 13 | 14 | unsigned char mask[]={0x80, 0x40, 0x20, 0x10, 0x08, 0x04, 0x02, 0x01}; 15 | // S と L から構成される列を8文字毎に区切り、type S を 1, L を 0 でビット表現して 8ビット(1バイト)とし、配列 t に入れる. たとえば LSLSSLSS は 01011011 にコードする(主記憶を節約する本格的なプログラミング技術) 16 | // 配列 t から i 番目の文字の type b を取り出すマクロ 17 | // ビット演算 or演算は |, and 演算は &, ビットの反転は ~ である 18 | #define tget(i) ( (t[(i)/8] & mask[(i)%8]) ? 1 : 0 ) 19 | // i 番目の文字の type が b であることを配列 t に入れるマクロ 20 | #define tset(i, b) t[(i)/8] = (b) ? (mask[(i)%8] | t[(i)/8]) : ((~mask[(i)%8]) & t[(i)/8]) 21 | // 配列 s からのデータの取り出しを cs が sizeof(int) なら整数として s[i] を取り出し、そうでなければ unsigned char として取り出すマクロ 22 | #define chr(i) ( cs==sizeof(int) ? ((int*)s)[i] : ((unsigned char *)s)[i] ) 23 | // i 番目の文字がleft-most S-type (LMS) ならば true、そうでなければ false を返すマクロ 24 | #define isLMS(i) (i>0 && tget(i) && !tget(i-1)) 25 | 26 | // find the start or end of each bucket 27 | void getBuckets(unsigned char *s, int *bkt, int n, int K, int cs, bool end) { 28 | int i, sum=0; 29 | // clear all buckets.
K は $ を除いた文字の数 30 | for(i=0; i<=K; i++) bkt[i]=0; 31 | // compute the size of each bucket 32 | for(i=0; i=0 && !tget(j)) SA[bkt[chr(j)]++]=j; 44 | // j 番目の文字が L-type (0) ならば、 ^ が置かれた位置 bkt[chr(j)] に j を入れて ^ を右へ1つ移動 45 | } 46 | } 47 | // compute SAs S-type の文字をソート 48 | void induceSAs(unsigned char *t, int *SA, unsigned char *s, int *bkt, int n, int K, int cs, bool end) { 49 | int i, j; 50 | // find ends of buckets 配列 bkt に最後の位置を入れる 51 | getBuckets(s, bkt, n, K, cs, end); 52 | for(i=n-1; i>=0; i--) { // i は @ の置かれた位置を表現 53 | j=SA[i]-1; 54 | if(j>=0 && tget(j)) SA[--bkt[chr(j)]]=j; 55 | // j 番目の文字が S-type (1) ならば、 ^ が置かれた位置 bkt[chr(j)] を1つ左に移動して j を入れる 56 | } 57 | } 58 | // find the suffix array SA of s[0..n-1] in {1..K}n require s[n-1]=0 (the sentinel!), n>=2 59 | // use a working space (excluding s and SA) of at most 2.25n+O(1) for a constant alphabet 60 | void SA_IS(unsigned char *s, int *SA, int n, int K, int cs) { 61 | // LS-type array in bits タイプ S もしくは L をビット表現して記載する配列を t とする 62 | unsigned char *t=(unsigned char *)malloc(n/8+1); 63 | int i, j; 64 | // classify the type of each character the sentinel must be in s1, important!!! 65 | // 各文字が S-type か L-type のどちらであるかを計算して配列 t に入れる 66 | tset(n-2, 0); tset(n-1, 1); // 最後から2番めの文字は 0 (L-type) で、最後の文字 $ は 1 (S-type) 67 | for(i=n-3; i>=0; i--) // 後ろの文字から埋めてゆく 68 | tset(i, (chr(i)0 && (isLMS(pos+d) || isLMS(prev+d))) 101 | break; // 次の LMS prefix まで到達し、一致したので diff = falseのまま抜ける 102 | if(diff) { name++; prev=pos; } // LMS prefix が一致しなければ新しい順番を name に設定 103 | pos=(pos%2==0) ? 
pos/2 : (pos-1)/2; // LMS は少なくとも1つおきに出現するので場所をつめる 104 | SA[n1+pos]=name-1; // 更新した name の1つ前の順番を付与する 105 | } 106 | for(i=n-1, j=n-1; i>=n1; i--) // -1 を除きながら LMS prefix をコードした文字列を SA の後半に詰める 107 | if(SA[i]>=0) SA[j--]=SA[i]; 108 | 109 | // solve the reduced problem 110 | // recurse if names are not yet unique 111 | // s1 は LMS prefix をコードした文字列で SA の後半を使うのに対して、その suffix array SA1 は SA の前半に計算することで主記憶を節約する(主記憶を節約する本格的なプログラミング技術) 112 | int *SA1=SA, *s1=SA+n-n1; 113 | // 同一順序の LMS prefix が存在するならば、LMS prefix の総数 n1 より name が小さくなるので SA_IS を再帰呼び出しして s1 の suffix array SA1 を決定 114 | if(name=0; i--) { 133 | j=SA[i]; SA[i]=-1; 134 | SA[--bkt[chr(j)]]=j; // LMS suffix の順番を bucket に入れる 135 | } 136 | // LMS suffix の順番が bucket 内に入ったので L-type をソートして後に S-type をソートする 137 | induceSAl(t, SA, s, bkt, n, K, cs, false); 138 | induceSAs(t, SA, s, bkt, n, K, cs, true); 139 | free(bkt); free(t); 140 | } 141 | -------------------------------------------------------------------------------- /nsop_test/uTR/SAIS.c: -------------------------------------------------------------------------------- 1 | //------------------------------------------------------------- 2 | // Induced Sorting 3 | // Reference: Nong, Ge, Sen Zhang, and Wai Hong Chan. "Two efficient algorithms for linear time suffix array construction.” IEEE Transactions on Computers, 60.10 (2011): 1471-1484. 4 | // To compile, perform: gcc -std=c99 SAIS.c -o SAIS 5 | // To run, SALS . e.g., SAIS 100 6 | //------------------------------------------------------------- 7 | 8 | #include 9 | #include 10 | #include 11 | #include 12 | #include "uTR.h" 13 | 14 | unsigned char mask[]={0x80, 0x40, 0x20, 0x10, 0x08, 0x04, 0x02, 0x01}; 15 | // S と L から構成される列を8文字毎に区切り、tyope S を 1, L を 0 でビット表現して 8ビット(1バイト)とし、配列 t に入れる. たとえば LSLSSLSS は 01011011 にコードする(主記憶を節約する本格的なプログラミング技術) 16 | // 配列 t から i 番目の文字の type b を取り出すマクロ 17 | // ビット演算 or演算は |, and 演算は &, ビットの反転は ~ である 18 | #define tget(i) ( (t[(i)/8] & mask[(i)%8]) ? 
1 : 0 ) 19 | // i 番目の文字の type が b であることを配列 t に入れるマクロ 20 | #define tset(i, b) t[(i)/8] = (b) ? (mask[(i)%8] | t[(i)/8]) : ((~mask[(i)%8]) & t[(i)/8]) 21 | // 配列 s からのデータの取り出しを cs が sizeof(int) なら整数として s[i] を取り出し、そうでなければ unsigned char として取り出すマクロ 22 | #define chr(i) ( cs==sizeof(int) ? ((int*)s)[i] : ((unsigned char *)s)[i] ) 23 | // i 番目の文字がleft-most S-type (LMS) ならば true、そうでなければ false を返すマクロ 24 | #define isLMS(i) (i>0 && tget(i) && !tget(i-1)) 25 | 26 | // find the start or end of each bucket 27 | void getBuckets(unsigned char *s, int *bkt, int n, int K, int cs, bool end) { 28 | int i, sum=0; 29 | // clear all buckets. K は $ を除いた文字の数 30 | for(i=0; i<=K; i++) bkt[i]=0; 31 | // compute the size of each bucket 32 | for(i=0; i=0 && !tget(j)) SA[bkt[chr(j)]++]=j; 44 | // j 番目の文字が L-type (0) ならば、 ^ が置かれた位置 bkt[chr(j)] に j を入れて ^ を右へ1つ移動 45 | } 46 | } 47 | // compute SAs S-type の文字をソート 48 | void induceSAs(unsigned char *t, int *SA, unsigned char *s, int *bkt, int n, int K, int cs, bool end) { 49 | int i, j; 50 | // find ends of buckets 配列 bkt に最後の位置を入れる 51 | getBuckets(s, bkt, n, K, cs, end); 52 | for(i=n-1; i>=0; i--) { // i は @ の置かれた位置を表現 53 | j=SA[i]-1; 54 | if(j>=0 && tget(j)) SA[--bkt[chr(j)]]=j; 55 | // j 番目の文字が S-type (1) ならば、 ^ が置かれた位置 bkt[chr(j)] を1つ左に移動して j を入れる 56 | } 57 | } 58 | // find the suffix array SA of s[0..n-1] in {1..K}n require s[n-1]=0 (the sentinel!), n>=2 59 | // use a working space (excluding s and SA) of at most 2.25n+O(1) for a constant alphabet 60 | void SA_IS(unsigned char *s, int *SA, int n, int K, int cs) { 61 | // LS-type array in bits タイプ S もしくは L をビット表現して記載する配列を t とする 62 | unsigned char *t=(unsigned char *)malloc(n/8+1); 63 | int i, j; 64 | // classify the type of each character the sentinel must be in s1, important!!! 
65 | // 各文字が S-type か L-type のどちらであるかを計算して配列 t に入れる 66 | tset(n-2, 0); tset(n-1, 1); // 最後から2番めの文字は 0 (L-type) で、最後の文字 $ は 1 (S-type) 67 | for(i=n-3; i>=0; i--) // 後ろの文字から埋めてゆく 68 | tset(i, (chr(i)0 && (isLMS(pos+d) || isLMS(prev+d))) 101 | break; // 次の LMS prefix まで到達し、一致したので diff = falseのまま抜ける 102 | if(diff) { name++; prev=pos; } // LMS prefix が一致しなければ新しい順番を name に設定 103 | pos=(pos%2==0) ? pos/2 : (pos-1)/2; // LMS は少なくとも1つおきに出現するので場所をつめる 104 | SA[n1+pos]=name-1; // 更新した name の1つ前の順番を付与する 105 | } 106 | for(i=n-1, j=n-1; i>=n1; i--) // -1 を除きながら LMS prefix をコードした文字列を SA の後半に詰める 107 | if(SA[i]>=0) SA[j--]=SA[i]; 108 | 109 | // solve the reduced problem 110 | // recurse if names are not yet unique 111 | // s1 は LMS prefix をコードした文字列で SA の後半を使うのに対して、その suffix array SA1 は SA の前半に計算することで主記憶を節約する(主記憶を節約する本格的なプログラミング技術) 112 | int *SA1=SA, *s1=SA+n-n1; 113 | // 同一順序の LMS prefix が存在するならば、LMS prefix の総数 n1 より name が小さくなるので SA_IS を再帰呼び出しして s1 の suffix array SA1 を決定 114 | if(name=0; i--) { 133 | j=SA[i]; SA[i]=-1; 134 | SA[--bkt[chr(j)]]=j; // LMS suffix の順番を bucket に入れる 135 | } 136 | // LMS suffix の順番が bucket 内に入ったので L-type をソートして後に S-type をソートする 137 | induceSAl(t, SA, s, bkt, n, K, cs, false); 138 | induceSAs(t, SA, s, bkt, n, K, cs, true); 139 | free(bkt); free(t); 140 | } 141 | -------------------------------------------------------------------------------- /nsop_test/uTR/coverage_by_units.c: -------------------------------------------------------------------------------- 1 | //------------------------------------------------------------- 2 | // Induced Sorting 3 | // Reference: Nong, Ge, Sen Zhang, and Wai Hong Chan. "Two efficient algorithms for linear time suffix array construction.” IEEE Transactions on Computers, 60.10 (2011): 1471-1484. 4 | // To compile, perform: gcc -std=c99 SAIS.c -o SAIS 5 | // To run, SALS . 
e.g., SAIS 100 6 | //------------------------------------------------------------- 7 | 8 | #include 9 | #include 10 | #include 11 | #include 12 | #include "uTR.h" 13 | 14 | char int2char(int i){ 15 | char ans; 16 | switch(i){ 17 | case 1: ans = 'A'; break; 18 | case 2: ans = 'C'; break; 19 | case 3: ans = 'G'; break; 20 | case 4: ans = 'T'; break; 21 | default: fprintf(stderr, "Invalid integer: %d in int2char\n", i); exit(EXIT_FAILURE); 22 | } 23 | return(ans); 24 | } 25 | 26 | int char2int(char c){ 27 | int ans; 28 | switch(c){ 29 | case 'A': ans = 1; break; 30 | case 'C': ans = 2; break; 31 | case 'G': ans = 3; break; 32 | case 'T': ans = 4; break; 33 | default: fprintf(stderr, "Invalid char: %c in char2int\n", c); exit(EXIT_FAILURE); 34 | } 35 | return(ans); 36 | } 37 | 38 | int count_occurrences(char *S, int n, int *SA, int *C, int **OCC, char *unit, int unitLen, int *covered, int MIN_number_repetitions ) 39 | { 40 | for(int i=0; i ub || lb < 0 || n-1 < ub) 55 | return(0); 56 | } 57 | for(int k=lb; k<=ub; k++){ 58 | // Mark all letters in the unit occurrences. 
59 | for(int j=0; j < (MIN_number_repetitions * unitLen) && SA[k]+j < n; j++) 60 | covered[ SA[k]+j ] = 1; 61 | } 62 | }else{ 63 | // Divide a unit into shorter windows to escape from mutations and errors 64 | count_occurrences_long_unit(S, n, SA, C, OCC, unit, unitLen, covered, MIN_number_repetitions ); 65 | } 66 | int total_cnt = 0; 67 | for(int i=0; i 31 | #include 32 | #include 33 | #include 34 | #include 35 | #include "uTR.h" 36 | 37 | //#define DEBUG_feed 38 | 39 | void clear_Units_incrementally(){ 40 | for(int i=0; i') cnt++; 131 | fclose(fp); 132 | return(cnt); 133 | } 134 | 135 | FILE* init_handle_one_file(char *inputFile){ 136 | FILE *fp = fopen(inputFile, "r"); 137 | if(fp == NULL){ 138 | fprintf(stderr, "fatal error: cannot open %s\n", inputFile); 139 | fflush(stderr); 140 | exit(EXIT_FAILURE); 141 | } 142 | read_cnt = -1; 143 | return(fp); 144 | } 145 | 146 | void return_one_read(FILE *fp, Read *currentRead){ 147 | char s[BLK+1]; 148 | int i; 149 | char charCode; 150 | int cnt=0; 151 | int no_read = 1; 152 | 153 | while (fgets(s, BLK, fp) != NULL) { // Feed a string of size BLK from fp into string s 154 | no_read = 0; 155 | fflush(fp); 156 | 157 | if(s[0] == '>'){ 158 | if(read_cnt != -1){ // This is NOT the first read 159 | // Set the ID of currentRead to the ID of nextRead 160 | int j; 161 | for(j=0; nextReadID[j] != '\0'; j++) 162 | currentRead->ID[j] = nextReadID[j]; 163 | currentRead->ID[j] = '\0'; 164 | } 165 | // Feed the ID of the current read into the ID of nextRead 166 | int shift; 167 | if(s[1]==' ') shift=2; else shift=1; // Skip the space at the head 168 | for(i=0; s[shift+i]!='\0' && s[shift+i]!='\n' && s[shift+i]!='\r' && istring[cnt] = '\0'; 178 | currentRead->len = cnt; 179 | return; 180 | } 181 | }else{ 182 | // Feed the string 183 | for(i=0; s[i]!='\0' && s[i]!='\n' && s[i]!='\r' && istring[cnt] = capitalize(s[i]); 185 | currentRead->intString[cnt] = char2int(capitalize(s[i])); 186 | cnt++; 187 | if( MAX_READ_LENGTH <= cnt){ 188 | 
fprintf(stderr, "fatal error: The length %d is tentatively at most %i.\nread ID = %s\nSet MAX_READ_LENGTH to a larger value", cnt, MAX_READ_LENGTH, currentRead->ID); 189 | free_Units(); 190 | exit(EXIT_FAILURE); 191 | } 192 | } 193 | } 194 | } 195 | if(no_read == 1){ // No reads 196 | currentRead->len = 0; 197 | }else{ 198 | // Process the last read. 199 | // Set the ID of currentRead to the ID of nextRead 200 | int j; 201 | for(j=0; nextReadID[j] != '\0'; j++) 202 | currentRead->ID[j] = nextReadID[j]; 203 | currentRead->ID[j] = '\0'; 204 | // Finalize the currentRead string by appending '\0' 205 | currentRead->string[cnt] = '\0'; 206 | currentRead->len = cnt; 207 | read_cnt++; 208 | } 209 | } 210 | -------------------------------------------------------------------------------- /nsop_test/uTR/handle_one_file.c: -------------------------------------------------------------------------------- 1 | /* 2 | Copyright (c) 2021, Shinichi Morishita 3 | All rights reserved. 4 | 5 | Redistribution and use in source and binary forms, with or without 6 | modification, are permitted provided that the following conditions are met: 7 | 8 | 1. Redistributions of source code must retain the above copyright notice, this 9 | list of conditions and the following disclaimer. 10 | 2. Redistributions in binary form must reproduce the above copyright notice, 11 | this list of conditions and the following disclaimer in the documentation 12 | and/or other materials provided with the distribution. 13 | 14 | THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND 15 | ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED 16 | WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE 17 | DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR 18 | ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES 19 | (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; 20 | LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND 21 | ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 22 | (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS 23 | SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 24 | 25 | The views and conclusions contained in the software and documentation are those 26 | of the authors and should not be interpreted as representing official policies, 27 | either expressed or implied, of the FreeBSD Project. 28 | */ 29 | 30 | #include 31 | #include 32 | #include 33 | #include 34 | #include 35 | #include "uTR.h" 36 | 37 | //#define DEBUG_feed 38 | 39 | 40 | 41 | 42 | void clear_Units_incrementally(){ 43 | for(int i=0; i'){ 152 | if(read_cnt == -1){ // This is the first read 153 | read_cnt = 0; 154 | // Feed the ID of the current read into the ID of nextRead 155 | for(i=1; s[i]!='\0' && s[i]!='\n' && iID[j] = nextReadID[j]; 164 | currentRead->ID[j] = '\0'; 165 | // Feed the ID of the current read into the ID of nextRead 166 | for(i=1; s[i]!='\0' && s[i]!='\n' && istring[cnt] = '\0'; 171 | currentRead->len = cnt; 172 | read_cnt++; 173 | free(s); 174 | return; 175 | } 176 | }else{ 177 | // Feed the string 178 | for(i=0; s[i]!='\0' && s[i]!='\n' && s[i]!='\r'; i++){ 179 | currentRead->string[cnt++] = capitalize(s[i]); 180 | if( MAX_READ_LENGTH <= cnt){ 181 | fprintf(stderr, "fatal error: The length %d is tentatively at most %i.\nread ID = %s\nSet MAX_READ_LENGTH to a larger value", cnt, MAX_READ_LENGTH, currentRead->ID); 182 | free_Units(); 183 | exit(EXIT_FAILURE); 184 | } 185 | } 186 | } 187 | } 188 | free(s); 189 | if(no_read == 1){ // No reads 190 | currentRead->len = 0; 191 | }else{ 192 | // Process the 
last read. 193 | // Set the ID of currentRead to the ID of nextRead 194 | int j; 195 | for(j=0; nextReadID[j] != '\0'; j++) 196 | currentRead->ID[j] = nextReadID[j]; 197 | currentRead->ID[j] = '\0'; 198 | // Finalize the currentRead string by appending '\0' 199 | currentRead->string[cnt] = '\0'; 200 | currentRead->len = cnt; 201 | read_cnt++; 202 | } 203 | } 204 | -------------------------------------------------------------------------------- /uTR.h: -------------------------------------------------------------------------------- 1 | /* 2 | Copyright (c) 2019, Shinichi Morishita 3 | All rights reserved. 4 | 5 | Redistribution and use in source and binary forms, with or without 6 | modification, are permitted provided that the following conditions are met: 7 | 8 | 1. Redistributions of source code must retain the above copyright notice, this 9 | list of conditions and the following disclaimer. 10 | 2. Redistributions in binary form must reproduce the above copyright notice, 11 | this list of conditions and the following disclaimer in the documentation 12 | and/or other materials provided with the distribution. 13 | 14 | THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND 15 | ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED 16 | WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE 17 | DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR 18 | ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES 19 | (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; 20 | LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND 21 | ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 22 | (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS 23 | SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
24 | 25 | The views and conclusions contained in the software and documentation are those 26 | of the authors and should not be interpreted as representing official policies, 27 | either expressed or implied, of the FreeBSD Project. 28 | */ 29 | 30 | #include 31 | 32 | // Key default parameters 33 | 34 | // The maximum discrepancy ratio 35 | #define MAX_DIS_RATIO_DEFAULT 0.3 // Setting this to 0.05 or 0.1 is too strict and causes meaningful decompositions to be overlooked 36 | float MAX_DIS_RATIO; 37 | 38 | //#define MIN_unit_occupancy_ratio 0.8 // 0.6 39 | // We discarded this parameter. Instead, we search for a set of units that minimizes the penalty. 40 | #define MIN_unit_length 2 41 | #define TOP_k_units 5 // Print top k units for debugging 42 | #define MIN_COVERAGE 0.95 // Minimum coverage of a TR by key units 43 | 44 | // Internal variables and data structures 45 | #define MAX_NUMBER_READS 10000 // Not used as of 5/7/2023 as the number of reads is calculated. 46 | #define MAX_READ_LENGTH 20000 //100000 47 | #define MAX_ID_LENGTH 1000 48 | #define BLK 4096 49 | #define MAX_NUMBER_UNITS 100000 50 | #define MAX_NUMBER_GLOBAL_UNITS 10000 //10000 51 | #define MAX_UNIT_LENGTH 20 // Units of length MAX_UNIT_LENGTH or less are generated inside this program for efficiency 52 | #define INTERNAL_UNIT_LENGTH 1000 // Users are allowed to input long representative units.
53 | int repUnitLen; 54 | int read_cnt; 55 | char *nextReadID; 56 | 57 | // Parameters for processing long units 58 | #define LONG_UNIT_LEN_TH 20 //10 59 | #define WINDOW_LEN 3 //5 60 | 61 | // Parameters for controlling bounded alignment among long units 62 | #define MATCH 1 63 | #define MISMATCH -1 64 | #define INDEL -1 65 | #define Mosaic_tandem_repeat 0 66 | #define Mosaic_repeat 1 67 | #define UNDEFINED -1 68 | 69 | // Parameters for wrap_around_DP and string_decomposer 70 | #define MATCH_GAIN 1 71 | #define MISMATCH_PENALTY 1 72 | #define INDEL_PENALTY 1 73 | #define unit_PENALTY 1 74 | 75 | typedef struct{ 76 | int len; 77 | char string[MAX_READ_LENGTH]; 78 | int intString[MAX_READ_LENGTH]; 79 | char ID[MAX_ID_LENGTH]; 80 | int numKeyUnits; 81 | int mosaic_mode; //Mosaic_tandem_repeat or Mosaic_repeat 82 | int sumTandem; // The sum of bases in mosaic TANDEM repeats 83 | float discrepancy_ratio; 84 | float mismatch_ratio; 85 | float deletion_ratio; 86 | float insertion_ratio; 87 | char RegExpression[MAX_READ_LENGTH]; // Decomposition (e.g., D=(AAAG)6,(AG)27,(AAAG)24) or a list of prios (e.g., 0000000011111000111) 88 | int RegExpressionDecomp; // 1 means a decomposition and 0 a list of prios 89 | char decomposition[MAX_READ_LENGTH]; 90 | //char pattern_string[MAX_READ_LENGTH]; // String of RegExpression 91 | //char preciseRegExp[MAX_READ_LENGTH]; // Regular expression identical to the input string 92 | } Read; 93 | 94 | typedef struct{ 95 | int len; 96 | int numReads; 97 | int numKeyUnits; 98 | int mosaic_mode; //Mosaic_tandem_repeat or Mosaic_repeat 99 | int sumTandem; // The sum of bases in mosaic TANDEM repeats 100 | float discrepancy_ratio; 101 | char individualID[MAX_ID_LENGTH]; 102 | char readID[MAX_ID_LENGTH]; 103 | char RegExpression[MAX_READ_LENGTH]; 104 | char decomposition[MAX_READ_LENGTH]; 105 | } QualifiedRead; 106 | QualifiedRead *Qreads; 107 | 108 | typedef struct{ 109 | char string[INTERNAL_UNIT_LENGTH]; // MAX_UNIT_LENGTH 110 | int
intString[INTERNAL_UNIT_LENGTH]; // string encoded by integers A=0,C=1,G=2,T=3 111 | int covered[MAX_READ_LENGTH]; // MAX_READ_LENGTH 112 | int ID; // quadratic number 113 | int len; 114 | int sumOccurrences; // Total number of bases that match the unit 115 | int sumTandem; // Total number of bps that match tandem repeat of the unit 116 | int prio; 117 | int penalty; 118 | } Unit; // The priority 1,2,... in the set of mosaic repeat units. -1 if the unit is not in the set. 119 | Unit *Units; 120 | Unit *keyUnits; 121 | int unit_cnt; 122 | 123 | typedef struct{ 124 | char individualID[MAX_ID_LENGTH]; 125 | int readID; 126 | char pairHaps[100]; 127 | } Hap; 128 | Hap *Haps; 129 | int hap_cnt; 130 | 131 | 132 | Unit *GlobalUnits; 133 | int global_unit_cnt; 134 | 135 | #define WrapDPsize 200000000 // 2*10^8 (200M) = repeat_unit_size (200) x 100 units x length_of_repeats (10,000) 136 | int *WrapDP; // 2D space for Wrap-around global alignment DP for handling tandem repeats 137 | 138 | int count_reads(char *inputFile); 139 | FILE* init_handle_one_file(char *inputFile); 140 | void return_one_read(FILE *fp, Read *currentRead); 141 | 142 | void malloc_Units(int num_reads); 143 | //void malloc_Units(); 144 | void free_Units(); 145 | void clear_Units_incrementally(); 146 | 147 | void malloc_GlobalVars(); 148 | void free_GlobalVars(); 149 | void put_into_GlobalUnits(char *tmpUnit); 150 | void print_GlobalUnits(); 151 | 152 | char int2char(int i); 153 | void put_repUnit(char *repUnit); 154 | 155 | void SA_IS(unsigned char *s, int *SA, int n, int K, int cs); 156 | 157 | void coverage_by_units(char *S, int MIN_number_repetitions); 158 | void set_cover_greedy(Read *currentRead, int MIN_number_repetitions, int mode_longer_TRs, int mode_smooth); 159 | 160 | int wrap_around_DP(char *rep_unit, char *rep, int *t_mat, int *t_mis, int *t_ins, int *t_del); 161 | void string_decomposer(Read *currentRead, int numKeyUnits, int *prio2unit, int MIN_number_repetitions, int smooth_mode, int 
longer_unit_mode); 162 | 163 | void randomQuickSort3(int* target, int* Pos, int aLeft, int aRight); 164 | 165 | void smooth(int *input_blocks, int len, int numKeyUnits); 166 | //void comp_preciseRegExp(Read *currentRead); 167 | 168 | // Interface between C and C++ functions 169 | #ifndef __CSUB_H__ 170 | #define __CSUB_H__ 1 171 | #ifdef __cplusplus 172 | extern "C" { 173 | #endif /* __cplusplus */ 174 | // C function called from a C++ function 175 | extern void put_unit(char *unit); 176 | extern void retain_top_k_units(int topK); 177 | extern int char2int(char c); 178 | extern void match_bounded_DP_traceback(char *s0, int n0, char *s1, int n1, int *covered); 179 | // C++ function called from a C function 180 | extern void get_non_self_overlapping_prefixes(char *aString); 181 | extern int lzp(char *aString); 182 | extern void count_occurrences_long_unit(char *S, int n, int *SA, int *C, int **OCC, char *unit, int unitLen, int *tmpCovered, int MIN_number_repetitions ); 183 | #ifdef __cplusplus 184 | } 185 | #endif /* __cplusplus */ 186 | #endif /* __CSUB_H__ */ 187 | 188 | 189 | // External functions 190 | #define MIN(a, b) ((a) < (b) ? (a) : (b)) 191 | #define MAX(a, b) ((a) > (b) ? (a) : (b)) 192 | #define DIFF(x, y) ((x) > (y) ? ((x) - (y)) : ((y) - (x))) 193 | 194 | // Debug Mode 195 | //#define DEBUG_longer_units 196 | //#define DEBUG_EDDC_mode // Show the frequency of each init. 
197 | 198 | 199 | float time_get_non_self_overlapping_prefixes, time_coverage_by_units, time_set_cover_greedy; 200 | 201 | 202 | -------------------------------------------------------------------------------- /realdata/realdata.fasta: -------------------------------------------------------------------------------- 1 | > SAND12(control,BAFME) hg38_dna range=chr8:118366813-118366928 in the 4th intron of SAMD12, pattern=23 2 | AAATAAAATAAAATAAAATAAAATAAAATAAAATAAAATAAAATAAAATAAAATAAAATAAAATAAAATAATAAAATAAAATAAAATAAAATAAAATAAAATAAAAATGAACAAAA 3 | > SAND12(case,BAFME) Tandem repeat in the 4th intron of SAND12 found in Patient II-1 in family F6115 (Supplementary Figure 6, Ishiura, H. et al. Expansions of intronic TTTCA and TTTTA repeats in benign adult familial myoclonic epilepsy. Nat Genet 50, 581–590 (2018)) pattern=22122182 4 | TTTTTATTTTATTTTATTTTTATTTTATTTTATTTTATTTTATTTTATTTTATTTTATTTTATTTTATTTTATTTATTTTATTTTATTTTATTTTATTTATTTTATTTTATTTTATTTTATTTTATTTTATTTTATTTTATTTTATTTTATTTTATTTTATTTTATTTTATTTTATTTTATTTTATTTTATTTTATTTTATTTTATTTTATTTTATTTATTTTATTTTATTTGTTTTATTTTGTTTTGATTTTATTTTATTTTATTTTATTTTATTTTATTTGTTTTGTTTTATTTATTTTATTTTATTTTGTTTTATTTTATTTATTTTGTTTATTTTATTTTATTTTATTTTATTTTATTTATTTTATTTTATTTTATTTTTATTTTATTTTTATTTTATTTTGTTTTATTTTATTTTTATTTTGTTTATTTTATTTTTATTTTATTTTGTTTATTTTATTTTATTTTATTTTATTTTTATTTTATTTTATTTTATTTTATTTTATTTTATTTTATTTTATTTTATTTTATTTTATTTTATTTTATTTTATTTTCAATTTTATTTTTATTTTATTTTGATTTTATTTTTATTTTATTTTATTTTATTTTATTTGTTTTTATTTTATTTTATTTTGTTTATTTTATTTTATTTATTTATTTTGTTTTCATTTCTTGTTTTATTTTATTTTATTTTTATTTATTTTATTCATTTTATTTTATTTTATTTTGTTTTATTTTATTTTATTTTGTTTATTTTATTTTGTTTGATTTTATTTTTATTTTATTTTATTTTATTTTATTTTATTTTGTTTCATTTTATTTTATTTTATTTTATTTTATTTTAATTTTATTTTATTTTATTTTATTTTATTTTATTTTATTTTATTTTTATTTTATTTTGTTTTATTTTATTTTGTTTTATTTTATTTATTTATTTTATTTTATTTTATTTTATTTTATTTTTATTTTTATTTGTTTTATTTATTTTATTTTATTTTATTTTATTTTATTTTTATTTTATTTTGTTTTATTTTATTTTATTTTATTTATTTTATTTTATTTTATTTTATTTTATTTTATTTTATTTATTTATTTATTTATTTATTTCATTTCATTTATCATTCATTTCATTTTGTTTCATTTCATTTCATTTCATTTCA
TTTCATTTCATTCATTTCATTTCATTTCATTTCATTTCATTTATTTCATTTCATTTTCATTTATTTCGTTTCATTTCATTTCATTTCATTTCATTTTATTTCATTTCATTTCATTTCATTTCATTTCATTTCATTTCATTTCATTTCATTTCATTTCATTTTCATTTCATTTCATTTCATTTCATTTCATTTCATTTCATTCATTTCATTTATTTCATTTCATTTCATTTATTTCATTTCATTTTCATTCATTTTCATTTCATTCATTTCAGCTGTTTCATTTCATTTCATTTCTATTTCATTTCATTTCATTTCATTTCATTTCATTTCATTTCATTTCATTTATTCATTTCATTTCATTTCATTTCATTTCATTTCATTTATTTCATTTCATTTTATTTCATTTCATTTTATTTCATTTCATTTCTATTTCATTTCATTTCATTTTATTTCATTTTCATTTATTTCATTCGTTTCATTTCATTTCATTTCATTTCATTTCATTTCATTTTCATTTCATTTCATTTCATTTCATTTCATTTCATTTCATTTCATTTCATTTCATTTCATTTCATTTCATTTCATTTCATTTCATTTCATTCATTTCATTTCATTTTCATTTCAAGTTTCATTTCATTTCATTTCATTTCATTTCATTTCATTTATTTCATTTCATTTCATTTCATTTCATTTCATTTCATTTCATTTCATTTCATTTCATTTCGTTTCATTTCATTTTATTTCATTTCATTTCATTTCATTTCATTTCATTTCATTTATTTCGTTTCATTTCATTTATTTCATTCATTTCATTTCATTTCATTTCATTTCATTTCATTTCATTTCATTTCATTTCATTTTGTTTCATTTCATTTATTTCATTCATTTCTCTATTTTCATTTCATTTCATTCATTTCATTTATTCATTTCATTTTATTTCATTTATTTCATTTCATTTCTTTTCATTTCATTTCATTTCATTTCATTTCATTTCATTTCATTTCATTTCATTTCATTTCATTTCATTTCATTTCATTTCATTTCTCATTTCATTTTCATTTCGACTTTCATTTTATTTTATTTATTTTATTTTATTTTATTTGTTTATTTTATTTTATTTTATTTTATTTTATTTTATTTTATTTTATTTTATTTGTTTATTTTATTTATTTTATTTTATTTTATTTATTTTATTTTATTTTATTTTATTTTATTTTATTTTATTTTATTTTATTTTTATTTTATTTTATTTTATTTTATTTTATTTTATTTTATTTATTTTATTTTATTTTATTTATTTTATTTTATTTTATTTTATTTTATTTTATTTATTTTATTTTATTTTATTTTATTTTTGATGATTTTATTTTATTTTTATTTTATTTTATTTTATTTTATTTTATTTTATTTTATTTTATTTTATTTTATTTTATTTTATTTTATTTTGTTTTATTTTTATTTTGTTTTA 5 | > SAND12(case,BAFME) Tandem repeat in the 4th intron of SAND12 found in (Supplementary Figure 6, Ishiura, H. et al. Expansions of intronic TTTCA and TTTTA repeats in benign adult familial myoclonic epilepsy. 
Nat Genet 50, 581–590 (2018)) pattern=6133205130 6 | TTTTATTTTATTTTTATTTTATTTTATTTTATTTTTATTTTATTTTATTTTATTTTATTTTATTTTATTTTATTTTATTTTATTTTATTTTATTTTATTTTATTTTATTTTTATTTTATTTATTTTTATTTTATTTTATTTTATTTTATTTTTATTTTATTTTATTTTATTTTATTTTATTTTATTTTATTTTATTTTATTTTGTTTTATTTGTTTTATTTTATTTTATTTTTATTTATTTTATTTTAATTTTATTTTATTTGTATTTTTCATTTTTATTTTATTTTATTTTATTTTATTTATTTTATTTATTTTATTTTATTTTATTTTGTTTTTATTTTATTTTATTTTATTTTATTTTATTTTGTTTTATTTTATTTTATTTTATTTTATTTTATTTCCATTTTATTTTATTTTATTTTATTTTGTTTTATTTTGTTTGTTTTATTTATTTATTTTAATTTTATTTTATTTGTTTATTTTGTTTTATTTTATTTTGTTTTATTTTTATTTTATTTTATTTTATTTTATTTTATTTTTATTTTAATTTTATTTTATTTTTAATTTTATTTTGTTTTATTTTATTTGTTTGTTTTATTTTGTTTTATTTTATTTTATTTTATTTTTATTTTATTTTATTTTATTTTATTTTATTTTATTTTTATTTTATTTTATTTTTATTTTTATTTTATTTTATTTTATTTTATTTTATTTTTATTTTATTTTGTTTTGTTTTTATTTTTATTTTATTTTATTTTTATTTTATTTTATTTTATTTTATTTTATTTTATTTTATTTGTTTTATTTTATTTTTATTTTTATTTTATTTTATTTTATTTTATTTTTGTTTTATTTTATTTTATTTTGTTTTTATTTTATTTTATTTTGATTTTATTTCCATTTTATTTTATTTTATTTTATTTTGTTTTATTTTATTTTTATTTTATTTTATTTTGTTTTATTTTATTTTATTTTAATTTTATTTTATTTTATTTTATTTTATTTTATTTTTATTTATTTTATTTTATTTCATTTTATTTTATTTTATTTATTTTATTTTGTTTTATTTTATTTTATTTTTATTTTATTTTTATTTTATTTTATTTATTTTATTTTATTTTATTTTATTTTATTTTTATTTTATTTTTATTTTATTTTTATTTATTTTATTTTATTTTATTTTATTTTATTTTATTTTATTTTGTTTACTGATTTTATTTTATTTTGTTTCCATTTTATTTTATTTGGTGATCATTTTATTTTGTTTTATTTTTATTTTATTTTATTTTATTTGTTTTATTTTGTTTTTATTTTATTTTATTTTATTTTATTTTATTTTATTTTATTTTATTTTATTTTATTTTATTTTTATTTTATTTTATTTATTTTTATTTTATTTTGTTTTTATTTTATTTTATTTTATTTTATTTTTATTTTATTTTTATTTTAATTTTTATTTTATTTTATTTTATTTTATTTTTGTTTTATTTTATTTTATTTTATTTATTTTATTTTATTTTATTTTGTTTTTATTTTTATTTTATTTATTTTATTTTTATTTTATTTTATTTTATTTTTATTTTGTTTTGTTTTATTTTATTTTGTTTTATTTTAATTTTATTTTATTTTATTTATTTTATTTTATTTTATTTTATTTTTATTTTGTTTTATTTTATTTTATTTTATTTTATTTTATTTTATTTTATTTTATTTTATTTTATTTTTATTTTATTTTATTTTATTTTATTTTATTTTATTTTATTTTATTTTATTTTATTTTATTTTGTTTTAATTTTATTTTATTTTATTTTATTTTATTTTTATTTTATTTTATTTTATTTTATTTTATTTTATTTTATTTTGTTTATTTAATTTTATTTTTATTTTATTTTATTTTATTTTATTTTATTTTATTTTATTTTATTTTATTTTATTTTATTTTATTTT
ATTTTATTTTATTTTTATTTTAATTTTATTTTATTTTATTTGTATTTTATTTATTTTATTTTATTTTGTTTTTATTTTATTTTATTTTATTTTATTTTATTTTATTTATTTTATTTTATTTTATTTTTATTTTTATTTTATTTTATTTTGATTTTATTTTATTTTATTTTATTTTATTTTATTTTTATTTTATTTTATTTTTATTTATTTTATTTTATTTTATTTCATTTTATTTTATTTTATTTTGTTTGTTTTATTTTTATTTTGATTTTATTTTATTTTATTTTATTCATATTTTATTTTATTTTATTTTACATTTTTATTTTAATTTTATTTTATTTTAATTTCATTTTATTTTTATTTATTTTATTTTTATTTTATTTTATTTTATTTATTTTATTTTGTTTTAATTTTATTTTATTTTATTTTGTTTTATTTTATTTTATTTTATTTTATTTTATTTTATTTTATTTTATTTTATTTTATTTTTATTTTGTTTTATTTTATTTTGTTTTATTTTATTTTATTTTGTTTATTTTATTTTGTTTATTTGACATTTTATTTTAATTTTATTTTATTTTATTTTTGTTTTATTTTAATTTTATTTTATTTTATTTTTATTTTAATTTTATTTTATTTTATTTTATTTTTAATTTTTATTTTATTTTATTTTGTATTTTATTTTATTTTTATTTTATTTTATTTTATTTTGTTTTAATTTTTATTTTGTTTTGTTTTATTTTATTTTATTTTATCATTATTTTATTTTATTTTACATTTTATTTTGTATTTTATTTTATTTTATTTTATTTTATTTTTATTTTATTTCATTTTATTTTATTTTGTTTTATTTTATTTTATTTTATTTTATTTTGTTTTATTTTGATTTTATTTTATTTTATTTTATTTTATTTTTATTTTATTTTATTTTATTTGTTTTATTTTATTTTATTTTATTTTATTTTATTTTATTTTATTTTATTTTGTTTTATTTTATTTATTTTATTTTATTTTATTTGTTTTATTTTATTTTATTTTATTTTATTTTATTTAGATGATTTTATTTTATTTTAATTTTATTTTATTTTATTTTATTTTATTTTATTTTATTTTATTTTATTTCATTTCATTTCATTTCATTTCATTTCGTTTCATTTCGTTTCATTTCATTTCATTTCATTTATTTCATTTCATCATTTCATTTCATTCATTTCATTTCATTTCATTTCATTTCATTTCATTTCATTTCATTTCATTTCATTTCATTTCATTTCATTTCATTTCATTTCATTTCATTTCATTTCATTTCATTTCATTTCATTTCATTTCATTTCATTTCATTTCATTTATTTCATTTCATTTCATTTCATTTCATTTCATTTCATTTCTTTCATTTCATTTCATTTCATTTCATTTCATTTCATTTCATTCGTTTCATTTCATTTCATTTCATTTCATTTCATTTCATTTCATTTCATTTTCATTTCATTTATTTCATTTCATTTCATTTCATTTCATTTCATTTTATTTCGTTTCATTTCATTTCATTTCATTTCATTTCATTCATTTCATTTCATTTTTCATTTCATTTCATTTATATTTTCATTTCATTTCATTTCATTTATTTCATTTCATTTCATTTCATTTCATTTCATTTATTTCATTTCATTCATTTATTTCATTTCATTTCATTTCATTTATTTCATTTCATTTCATTTCTGTTTCATTTCATTTCATTTCATTTCATTTCATTTCATTTTCATTTCATTTATTCATTTCATTTCATTTCATTCATTTCATTTCATTTCATTTCATTTCATTTCATTTCATTTCATTTCATTTCATTTTCATTTCATTTATTTCATTTCATTTCATTTCATTTCATTTCATTTCATTTCATTTCATTTCATTTCATTTCATTCATTTATTTCATTTCATTTCATTTCATTTCATTTCATTTCATTTCATTCATTTCATTTTCATTTATTTCATTTCATTTC
ATTTTCATTTCATTTCATTTCATTTCATTTCATTTCATTTCATTTCATTTCATTTCATTTCATTTCATTTCATTTCATTTCATTTCATTTATTTCATTTCATTTCATTTCATTCTTTCATTTCATTTCATTTCATTTCATTTCATTTCATTTCATTTTCATTTCATTTCATTTCATTTGTTTCATTTCATTTCATTTCATTTATTTCATTTTCATTATTTTCATTTCATTTCATTTCATTTCATTTCATTTCATTTCATTTCATTTCATTTCATTTCTATTTTAGTTTCGTTCATTTCATTTCGTTTCATTTTCATTTCATTTCATTTCATTTCATTTCATTTCATTTCATTTCGTTTCATTTCATTTCATTTCATATTTCATTTCATTTCATTTCATTTCGTTTCATTTCATTTCATTTCATTTCATTTCATTTCATTTCATTCGTTTCATTTTCATTTCATTTCATTTCATTTCATTTCATTTCATTTCATTTCATTTCATTTCATTTCATTTCATTTCATTTTATTTCATTTCATTTTCATTTCATTTCATTTCATTTAATTTCATTTCATTTCATTTTATTTCATTTCATTTCATTTCATTTCATTTCATTTCATTTCATTTCATTTCATTTCATTTCATTTCATTTCATTTCATTTCATTTCATTTCATTTCATTTCATTTCATTCATTTCATTTCATTTCATTTCGATATTTCATTTCATTTTATTTTTCATTTTGTTTTCATTTCATTTCATTTCATTTCATTTCATTTTCATTTCATTTCATTTCATTTCATTTCATTTCATTTCATTTTCATTTCATTTCATTTCATTTCATTTCATTTCATTTCATTTTATTTTCATTTCATTTCATTTCATTTCATTTCATTTCATTCATTTCATTTCATTTCATTTCATTTCATTTCATTTCATTTCATTTCATTTCATTTCATTTCATTCATTCATTTCATTTATTTCATTTCATTTCATTTCATTTCATTTCATTTTATTTCATTTCATTTCATTTCATTTCATTTCATTCGTTCATTTCATTTCATTTCATTTCATTTCATTTCATTTCATTTCATTTCATTTCATTTCATTTCATTTCATTTCATTTCATTCATTTCATTTCATTTCATTTCATTTCATTTCATTTCATTTCATTTCATTTCATTTCATTTCATTCATTTCATTTCATTCATTTCATTTGGTCGATTTCATTTCATTTCATTCTACTATTTCATTTCATTTGTTTCATTTCATTTCATTTCATTTCATTTCATTTCATTTCATTTCATTTATTCATTTCATTTCATTTCATTTCACTATTTTCATTTCATTTCATTCATTTCATTTCATTTCATTTCATTTCATTTCATTTCATTTCATTTCA 7 | > RFC1(control,CANVAS) hg38_dna range=chr4:39348425-39348483 pattern=11 8 | AAAAGAAAAGAAAAGAAAAGAAAAGAAAAGAAAAGAAAAGAAAAGAAAAGAAAAGAAAA 9 | > KAZN(control) hg38_dna range=chr1:14883297-14883426 pattern=61120 10 | AAAGAAAGAAAGAAAGAAAGAAAGAGAGAGAGAGAGAGAGAGAGAGAAAGAAAGAAAGAAAAGAAAGAAAGAAAGAAAGAAAGAAAGAAAGAAAGAAAGAAAGAAAGAAAGAAAGAAAGAAAGAAAGAAA 11 | > ZNF37A(control) hg38_dna range=chr10:38112731-38112826 pattern=1232 12 | TTTTCTTTTCTTTTCTTTTCTTTTCTTTTCTTTTCTTTTCTTTTCTTTTCTTTTCTTTTCTTTTCTTGTCTTGTCTTGTCTTCTTTTCTTTTCTTT 13 | 
-------------------------------------------------------------------------------- /realdata/realdata_result.fasta: -------------------------------------------------------------------------------- 1 | > #Info (116,1,0.06) #Pat 23 #Annotation SAND12(control,BAFME) hg38_dna range=chr8:118366813-118366928 in the 4th intron of SAMD12, pattern=23 2 | AAATAAAATAAAATAAAATAAAATAAAATAAAATAAAATAAAATAAAATAAAATAAAATAAAATAAAATAATAAAATAAAATAAAATAAAATAAAATAAAATAAAAATGAACAAAA 3 | > #Info (2630,1,0.06) #Pat 22122182 #Annotation SAND12(case,BAFME) Tandem repeat in the 4th intron of SAND12 found in Patient II-1 in family F6115 (Supplementary Figure 6, Ishiura, H. et al. Expansions of intronic TTTCA and TTTTA repeats in benign adult familial myoclonic epilepsy. Nat Genet 50, 581–590 (2018)) pattern=22122182 4 | TTTTTATTTTATTTTATTTTTATTTTATTTTATTTTATTTTATTTTATTTTATTTTATTTTATTTTATTTTATTTATTTTATTTTATTTTATTTTATTTATTTTATTTTATTTTATTTTATTTTATTTTATTTTATTTTATTTTATTTTATTTTATTTTATTTTATTTTATTTTATTTTATTTTATTTTATTTTATTTTATTTTATTTTATTTTATTTATTTTATTTTATTTGTTTTATTTTGTTTTGATTTTATTTTATTTTATTTTATTTTATTTTATTTGTTTTGTTTTATTTATTTTATTTTATTTTGTTTTATTTTATTTATTTTGTTTATTTTATTTTATTTTATTTTATTTTATTTATTTTATTTTATTTTATTTTTATTTTATTTTTATTTTATTTTGTTTTATTTTATTTTTATTTTGTTTATTTTATTTTTATTTTATTTTGTTTATTTTATTTTATTTTATTTTATTTTTATTTTATTTTATTTTATTTTATTTTATTTTATTTTATTTTATTTTATTTTATTTTATTTTATTTTATTTTATTTTCAATTTTATTTTTATTTTATTTTGATTTTATTTTTATTTTATTTTATTTTATTTTATTTGTTTTTATTTTATTTTATTTTGTTTATTTTATTTTATTTATTTATTTTGTTTTCATTTCTTGTTTTATTTTATTTTATTTTTATTTATTTTATTCATTTTATTTTATTTTATTTTGTTTTATTTTATTTTATTTTGTTTATTTTATTTTGTTTGATTTTATTTTTATTTTATTTTATTTTATTTTATTTTATTTTGTTTCATTTTATTTTATTTTATTTTATTTTATTTTAATTTTATTTTATTTTATTTTATTTTATTTTATTTTATTTTATTTTTATTTTATTTTGTTTTATTTTATTTTGTTTTATTTTATTTATTTATTTTATTTTATTTTATTTTATTTTATTTTTATTTTTATTTGTTTTATTTATTTTATTTTATTTTATTTTATTTTATTTTTATTTTATTTTGTTTTATTTTATTTTATTTTATTTATTTTATTTTATTTTATTTTATTTTATTTTATTTTATTTATTTATTTATTTATTTATTTCATTTCATTTATCATTCATTTCATTTTGTTTCATTTCATTTCATTTCATTTCATTTCATTTCATTCATTTCATTTCATTTCATTTCATTT
CATTTATTTCATTTCATTTTCATTTATTTCGTTTCATTTCATTTCATTTCATTTCATTTTATTTCATTTCATTTCATTTCATTTCATTTCATTTCATTTCATTTCATTTCATTTCATTTCATTTTCATTTCATTTCATTTCATTTCATTTCATTTCATTTCATTCATTTCATTTATTTCATTTCATTTCATTTATTTCATTTCATTTTCATTCATTTTCATTTCATTCATTTCAGCTGTTTCATTTCATTTCATTTCTATTTCATTTCATTTCATTTCATTTCATTTCATTTCATTTCATTTCATTTATTCATTTCATTTCATTTCATTTCATTTCATTTCATTTATTTCATTTCATTTTATTTCATTTCATTTTATTTCATTTCATTTCTATTTCATTTCATTTCATTTTATTTCATTTTCATTTATTTCATTCGTTTCATTTCATTTCATTTCATTTCATTTCATTTCATTTTCATTTCATTTCATTTCATTTCATTTCATTTCATTTCATTTCATTTCATTTCATTTCATTTCATTTCATTTCATTTCATTTCATTTCATTCATTTCATTTCATTTTCATTTCAAGTTTCATTTCATTTCATTTCATTTCATTTCATTTCATTTATTTCATTTCATTTCATTTCATTTCATTTCATTTCATTTCATTTCATTTCATTTCATTTCGTTTCATTTCATTTTATTTCATTTCATTTCATTTCATTTCATTTCATTTCATTTATTTCGTTTCATTTCATTTATTTCATTCATTTCATTTCATTTCATTTCATTTCATTTCATTTCATTTCATTTCATTTCATTTTGTTTCATTTCATTTATTTCATTCATTTCTCTATTTTCATTTCATTTCATTCATTTCATTTATTCATTTCATTTTATTTCATTTATTTCATTTCATTTCTTTTCATTTCATTTCATTTCATTTCATTTCATTTCATTTCATTTCATTTCATTTCATTTCATTTCATTTCATTTCATTTCATTTCTCATTTCATTTTCATTTCGACTTTCATTTTATTTTATTTATTTTATTTTATTTTATTTGTTTATTTTATTTTATTTTATTTTATTTTATTTTATTTTATTTTATTTTATTTGTTTATTTTATTTATTTTATTTTATTTTATTTATTTTATTTTATTTTATTTTATTTTATTTTATTTTATTTTATTTTATTTTTATTTTATTTTATTTTATTTTATTTTATTTTATTTTATTTATTTTATTTTATTTTATTTATTTTATTTTATTTTATTTTATTTTATTTTATTTATTTTATTTTATTTTATTTTATTTTTGATGATTTTATTTTATTTTTATTTTATTTTATTTTATTTTATTTTATTTTATTTTATTTTATTTTATTTTATTTTATTTTATTTTATTTTGTTTTATTTTTATTTTGTTTTA 5 | > #Info (5346,1,0.06) #Pat 6133205130 #Annotation SAND12(case,BAFME) Tandem repeat in the 4th intron of SAND12 found in (Supplementary Figure 6, Ishiura, H. et al. Expansions of intronic TTTCA and TTTTA repeats in benign adult familial myoclonic epilepsy. 
Nat Genet 50, 581–590 (2018)) pattern=6133205130 6 | TTTTATTTTATTTTTATTTTATTTTATTTTATTTTTATTTTATTTTATTTTATTTTATTTTATTTTATTTTATTTTATTTTATTTTATTTTATTTTATTTTATTTTATTTTTATTTTATTTATTTTTATTTTATTTTATTTTATTTTATTTTTATTTTATTTTATTTTATTTTATTTTATTTTATTTTATTTTATTTTATTTTGTTTTATTTGTTTTATTTTATTTTATTTTTATTTATTTTATTTTAATTTTATTTTATTTGTATTTTTCATTTTTATTTTATTTTATTTTATTTTATTTATTTTATTTATTTTATTTTATTTTATTTTGTTTTTATTTTATTTTATTTTATTTTATTTTATTTTGTTTTATTTTATTTTATTTTATTTTATTTTATTTCCATTTTATTTTATTTTATTTTATTTTGTTTTATTTTGTTTGTTTTATTTATTTATTTTAATTTTATTTTATTTGTTTATTTTGTTTTATTTTATTTTGTTTTATTTTTATTTTATTTTATTTTATTTTATTTTATTTTTATTTTAATTTTATTTTATTTTTAATTTTATTTTGTTTTATTTTATTTGTTTGTTTTATTTTGTTTTATTTTATTTTATTTTATTTTTATTTTATTTTATTTTATTTTATTTTATTTTATTTTTATTTTATTTTATTTTTATTTTTATTTTATTTTATTTTATTTTATTTTATTTTTATTTTATTTTGTTTTGTTTTTATTTTTATTTTATTTTATTTTTATTTTATTTTATTTTATTTTATTTTATTTTATTTTATTTGTTTTATTTTATTTTTATTTTTATTTTATTTTATTTTATTTTATTTTTGTTTTATTTTATTTTATTTTGTTTTTATTTTATTTTATTTTGATTTTATTTCCATTTTATTTTATTTTATTTTATTTTGTTTTATTTTATTTTTATTTTATTTTATTTTGTTTTATTTTATTTTATTTTAATTTTATTTTATTTTATTTTATTTTATTTTATTTTTATTTATTTTATTTTATTTCATTTTATTTTATTTTATTTATTTTATTTTGTTTTATTTTATTTTATTTTTATTTTATTTTTATTTTATTTTATTTATTTTATTTTATTTTATTTTATTTTATTTTTATTTTATTTTTATTTTATTTTTATTTATTTTATTTTATTTTATTTTATTTTATTTTATTTTATTTTGTTTACTGATTTTATTTTATTTTGTTTCCATTTTATTTTATTTGGTGATCATTTTATTTTGTTTTATTTTTATTTTATTTTATTTTATTTGTTTTATTTTGTTTTTATTTTATTTTATTTTATTTTATTTTATTTTATTTTATTTTATTTTATTTTATTTTATTTTTATTTTATTTTATTTATTTTTATTTTATTTTGTTTTTATTTTATTTTATTTTATTTTATTTTTATTTTATTTTTATTTTAATTTTTATTTTATTTTATTTTATTTTATTTTTGTTTTATTTTATTTTATTTTATTTATTTTATTTTATTTTATTTTGTTTTTATTTTTATTTTATTTATTTTATTTTTATTTTATTTTATTTTATTTTTATTTTGTTTTGTTTTATTTTATTTTGTTTTATTTTAATTTTATTTTATTTTATTTATTTTATTTTATTTTATTTTATTTTTATTTTGTTTTATTTTATTTTATTTTATTTTATTTTATTTTATTTTATTTTATTTTATTTTATTTTTATTTTATTTTATTTTATTTTATTTTATTTTATTTTATTTTATTTTATTTTATTTTATTTTGTTTTAATTTTATTTTATTTTATTTTATTTTATTTTTATTTTATTTTATTTTATTTTATTTTATTTTATTTTATTTTGTTTATTTAATTTTATTTTTATTTTATTTTATTTTATTTTATTTTATTTTATTTTATTTTATTTTATTTTATTTTATTTTATTTT
ATTTTATTTTATTTTTATTTTAATTTTATTTTATTTTATTTGTATTTTATTTATTTTATTTTATTTTGTTTTTATTTTATTTTATTTTATTTTATTTTATTTTATTTATTTTATTTTATTTTATTTTTATTTTTATTTTATTTTATTTTGATTTTATTTTATTTTATTTTATTTTATTTTATTTTTATTTTATTTTATTTTTATTTATTTTATTTTATTTTATTTCATTTTATTTTATTTTATTTTGTTTGTTTTATTTTTATTTTGATTTTATTTTATTTTATTTTATTCATATTTTATTTTATTTTATTTTACATTTTTATTTTAATTTTATTTTATTTTAATTTCATTTTATTTTTATTTATTTTATTTTTATTTTATTTTATTTTATTTATTTTATTTTGTTTTAATTTTATTTTATTTTATTTTGTTTTATTTTATTTTATTTTATTTTATTTTATTTTATTTTATTTTATTTTATTTTATTTTTATTTTGTTTTATTTTATTTTGTTTTATTTTATTTTATTTTGTTTATTTTATTTTGTTTATTTGACATTTTATTTTAATTTTATTTTATTTTATTTTTGTTTTATTTTAATTTTATTTTATTTTATTTTTATTTTAATTTTATTTTATTTTATTTTATTTTTAATTTTTATTTTATTTTATTTTGTATTTTATTTTATTTTTATTTTATTTTATTTTATTTTGTTTTAATTTTTATTTTGTTTTGTTTTATTTTATTTTATTTTATCATTATTTTATTTTATTTTACATTTTATTTTGTATTTTATTTTATTTTATTTTATTTTATTTTTATTTTATTTCATTTTATTTTATTTTGTTTTATTTTATTTTATTTTATTTTATTTTGTTTTATTTTGATTTTATTTTATTTTATTTTATTTTATTTTTATTTTATTTTATTTTATTTGTTTTATTTTATTTTATTTTATTTTATTTTATTTTATTTTATTTTATTTTGTTTTATTTTATTTATTTTATTTTATTTTATTTGTTTTATTTTATTTTATTTTATTTTATTTTATTTAGATGATTTTATTTTATTTTAATTTTATTTTATTTTATTTTATTTTATTTTATTTTATTTTATTTTATTTCATTTCATTTCATTTCATTTCATTTCGTTTCATTTCGTTTCATTTCATTTCATTTCATTTATTTCATTTCATCATTTCATTTCATTCATTTCATTTCATTTCATTTCATTTCATTTCATTTCATTTCATTTCATTTCATTTCATTTCATTTCATTTCATTTCATTTCATTTCATTTCATTTCATTTCATTTCATTTCATTTCATTTCATTTCATTTCATTTCATTTATTTCATTTCATTTCATTTCATTTCATTTCATTTCATTTCTTTCATTTCATTTCATTTCATTTCATTTCATTTCATTTCATTCGTTTCATTTCATTTCATTTCATTTCATTTCATTTCATTTCATTTCATTTTCATTTCATTTATTTCATTTCATTTCATTTCATTTCATTTCATTTTATTTCGTTTCATTTCATTTCATTTCATTTCATTTCATTCATTTCATTTCATTTTTCATTTCATTTCATTTATATTTTCATTTCATTTCATTTCATTTATTTCATTTCATTTCATTTCATTTCATTTCATTTATTTCATTTCATTCATTTATTTCATTTCATTTCATTTCATTTATTTCATTTCATTTCATTTCTGTTTCATTTCATTTCATTTCATTTCATTTCATTTCATTTTCATTTCATTTATTCATTTCATTTCATTTCATTCATTTCATTTCATTTCATTTCATTTCATTTCATTTCATTTCATTTCATTTCATTTTCATTTCATTTATTTCATTTCATTTCATTTCATTTCATTTCATTTCATTTCATTTCATTTCATTTCATTTCATTCATTTATTTCATTTCATTTCATTTCATTTCATTTCATTTCATTTCATTCATTTCATTTTCATTTATTTCATTTCATTTC
ATTTTCATTTCATTTCATTTCATTTCATTTCATTTCATTTCATTTCATTTCATTTCATTTCATTTCATTTCATTTCATTTCATTTCATTTATTTCATTTCATTTCATTTCATTCTTTCATTTCATTTCATTTCATTTCATTTCATTTCATTTCATTTTCATTTCATTTCATTTCATTTGTTTCATTTCATTTCATTTCATTTATTTCATTTTCATTATTTTCATTTCATTTCATTTCATTTCATTTCATTTCATTTCATTTCATTTCATTTCATTTCTATTTTAGTTTCGTTCATTTCATTTCGTTTCATTTTCATTTCATTTCATTTCATTTCATTTCATTTCATTTCATTTCGTTTCATTTCATTTCATTTCATATTTCATTTCATTTCATTTCATTTCGTTTCATTTCATTTCATTTCATTTCATTTCATTTCATTTCATTCGTTTCATTTTCATTTCATTTCATTTCATTTCATTTCATTTCATTTCATTTCATTTCATTTCATTTCATTTCATTTCATTTTATTTCATTTCATTTTCATTTCATTTCATTTCATTTAATTTCATTTCATTTCATTTTATTTCATTTCATTTCATTTCATTTCATTTCATTTCATTTCATTTCATTTCATTTCATTTCATTTCATTTCATTTCATTTCATTTCATTTCATTTCATTTCATTTCATTCATTTCATTTCATTTCATTTCGATATTTCATTTCATTTTATTTTTCATTTTGTTTTCATTTCATTTCATTTCATTTCATTTCATTTTCATTTCATTTCATTTCATTTCATTTCATTTCATTTCATTTTCATTTCATTTCATTTCATTTCATTTCATTTCATTTCATTTTATTTTCATTTCATTTCATTTCATTTCATTTCATTTCATTCATTTCATTTCATTTCATTTCATTTCATTTCATTTCATTTCATTTCATTTCATTTCATTTCATTCATTCATTTCATTTATTTCATTTCATTTCATTTCATTTCATTTCATTTTATTTCATTTCATTTCATTTCATTTCATTTCATTCGTTCATTTCATTTCATTTCATTTCATTTCATTTCATTTCATTTCATTTCATTTCATTTCATTTCATTTCATTTCATTTCATTCATTTCATTTCATTTCATTTCATTTCATTTCATTTCATTTCATTTCATTTCATTTCATTTCATTCATTTCATTTCATTCATTTCATTTGGTCGATTTCATTTCATTTCATTCTACTATTTCATTTCATTTGTTTCATTTCATTTCATTTCATTTCATTTCATTTCATTTCATTTCATTTATTCATTTCATTTCATTTCATTTCACTATTTTCATTTCATTTCATTCATTTCATTTCATTTCATTTCATTTCATTTCATTTCATTTCATTTCA 7 | > #Info (59,1,0.00) #Pat 11 #Annotation RFC1(control,CANVAS) hg38_dna range=chr4:39348425-39348483 pattern=11 8 | AAAAGAAAAGAAAAGAAAAGAAAAGAAAAGAAAAGAAAAGAAAAGAAAAGAAAAGAAAA 9 | > #Info (130,1,0.01) #Pat 61120 #Annotation KAZN(control) hg38_dna range=chr1:14883297-14883426 pattern=61120 10 | AAAGAAAGAAAGAAAGAAAGAAAGAGAGAGAGAGAGAGAGAGAGAGAAAGAAAGAAAGAAAAGAAAGAAAGAAAGAAAGAAAGAAAGAAAGAAAGAAAGAAAGAAAGAAAGAAAGAAAGAAAGAAAGAAA 11 | > #Info (96,1,0.03) #Pat 1223 #Annotation ZNF37A(control) hg38_dna range=chr10:38112731-38112826 pattern=1232 12 | 
TTTTCTTTTCTTTTCTTTTCTTTTCTTTTCTTTTCTTTTCTTTTCTTTTCTTTTCTTTTCTTTTCTTGTCTTGTCTTGTCTTCTTTTCTTTTCTTT 13 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | ## Introduction 2 | The program uTR decomposes a DNA string into mosaic tandem repeats with different repeat units. 3 | 4 | ## Usage 5 | uTR [-f input fasta file] [-o output fasta file with annotation] [-l locus information] [-u input representative unit string] [-r maximum dissimilarity ratio] [-t] [-s] [-y] [-z] 6 | 7 | -f : Feed a fasta file. For example: 8 | 9 | > sample sequence 10 | CAGCAGCAGCAGCAGCAGCAGCAGCAGCAGCAGCAGCAGCAGCAGCAGCAGCAACAGCCGCCACCGCCGCCGCCGCCGCCGCCGCCTCCTCAGCTTCCTCAGCCGCCGCCGCAGGCACAGCCGCTGCTGCCTCAGCCGCAGCCGCCCCCGCCGCCGCCCCCCGCCGCCACC 11 | 12 | -o : Output the input fasta file annotated with the tandem repeat pattern identified. In the running example, the annotation contains "#Pat \<CAG\>19\<CCG\>38", which shows a tandem repeat pattern (19 copies of CAG followed by 38 copies of CCG), and "#Info (171,125,0.129)", where 171 is the length of the DNA string, 125 is the number of haplotypes detected, and 0.129 is the dissimilarity ratio between the tandem repeat pattern and the string: 13 | 14 | > #Info (171,125,0.129) #Pat <CAG>19<CCG>38 #Decomp [2 (0,CCG,3,114,114) (1,AGC,3,57,57)] 15 | CAGCAGCAGCAGCAGCAGCAGCAGCAGCAGCAGCAGCAGCAGCAGCAGCAGCAACAGCCGCCACCGCCGCCGCCGCCGCCGCCGCCTCCTCAGCTTCCTCAGCCGCCGCCGCAGGCACAGCCGCTGCTGCCTCAGCCGCAGCCGCCCCCGCCGCCGCCCCCCGCCGCCACC 16 | 17 | Unit strings can be rotated to represent the same tandem repeat pattern. For example, in the example above, CAG can be replaced with AGC. Our program tries to output a tandem repeat pattern whose first character matches the first character of the input string. In the above case, the string starts with the letter C, so the first tandem repeat unit also starts with the letter C. However, if the input string starts with A and is AGCAGCAGC…, our program outputs \<AGC\>19.
18 | 19 | 20 | -l : Feed the locus information (e.g., chr8:118366813-118366928), which is added to the annotation of each string in the output fasta file with tag #Locus. For example: 21 | 22 | #Locus chr8:118366813-118366928 23 | 24 | -u : Feed a single representative unit of a putative tandem repeat in the input read. If it is not specified, uTR automatically estimates tandem repeat units of length 20 nt or less. Long units (of length > 20nt, for example) can be computed by using mTR: 25 | 26 | https://github.com/morisUtokyo/mTR 27 | 28 | See the details of mTR: 29 | 30 | https://academic.oup.com/bioinformatics/article/37/5/612/5919583 31 | 32 | -t : Output the wall clock time to process each read in the input fasta file. 33 | 34 | -a : Print the input annotation as it is. 35 | 36 | > #Info (171,125,0.129) #Pat <CAG>19<CCG>38 #Hap <(null)> #Decomp [2 (0,CCG,3,114,114) (1,AGC,3,57,57)] #Annotation 37 | 38 | -d : Do not print the decomposition. 39 | 40 | > #Info (171,125,0.129) #Pat <CAG>19<CCG>38 41 | 42 | -r : Give a maximum threshold on the mismatch ratio between the representative unit and a tandem repeat of the unit. No tandem repeat is output if the mismatch ratio exceeds this threshold. The default parameter is 0.3, which is set in uTR.h by: 43 | 44 | >#define MAX_DIS_RATIO_DEFAULT 0.3 45 | 46 | -y : Produce a more complex pattern with fewer mismatches, while the default mode outputs a simpler pattern with more mismatches. The background to providing this mode is Occam's Razor on how to generate better decompositions; that is, a trade-off between simpler patterns with more mismatches and more complex patterns with fewer mismatches. For example, given a string with complex tandem repeats: 47 | 48 | CAGCAGCAGCAGCAGCAGCAGCAGCAGCAGCAGCAGCAGCAGCAGCAGCAGCAACAGCCGCCACCGCCGCCGCCGCCGCCGCCGCCTCCTCAGCTTCCTCAGCCGCCGCCGCAGGCACAGCCGCTGCTGCCTCAGCCGCAGCCGCCCCCGCCGCCGCCCCCCGCCGCCACC 49 | 50 | The default mode outputs a simple decomposition with a high dissimilarity rate of 12.9%.
51 | 52 | #Info (171,125,0.129) #Pat 1938 53 | 54 | Meanwhile, this mode “-y” produces a more complex decomposition with a smaller dissimilarity of 4.1%: 55 | 56 | #Info (171,125,0.041) #Pat 199214212128 57 | 58 | In the above decomposition, \1, \1, and \1, whose subscripts are 1, are not repeats of units but are substrings in the given underlying string and fully match the string. These substrings are surrounded by repeat units; for example, \1 by \2 and \4. 59 | 60 | -z : Output tandem repeat patterns with longer units. Two decompositions with shorter and longer units can match a given string with the same number of mismatches. For example, decompositions 61 | 62 | 323232 and 3 63 | 64 | fully match string: 65 | 66 | AGAGAGCTCTAGAGAGCTCTAGAGAGCTCT 67 | 68 | “-z” prioritizes a decomposition with longer units, say \3 in the above example. To implement this prioritization, our program optimizes the alignment score minus the number of units in the decomposition. From the string example below, which is used in the mode “-y”: 69 | 70 | CAGCAGCAGCAGCAGCAGCAGCAGCAGCAGCAGCAGCAGCAGCAGCAGCAGCAACAGCCGCCACCGCCGCCGCCGCCGCCGCCGCCTCCTCAGCTTCCTCAGCCGCCGCCGCAGGCACAGCCGCTGCTGCCTCAGCCGCAGCCGCCCCCGCCGCCGCCCCCCGCCGCCACC 71 | 72 | "-z" mode outputs 73 | 74 | #Info (171,125,0.099) #Pat 201088 75 | 76 | while the default mode outputs: 77 | 78 | #Info (171,125,0.129) #Pat 1938 79 | 80 | Note that the former pattern with long unit CCGCAG matches the given string with a smaller dissimilarity rate than the latter pattern with short units. 81 | 82 | 83 | ## Simulated datasets to show the accuracy of the code 84 | 85 | In the directory named "test_public", codes for generating simulated datasets and checking the accuracy of uTR and RepeatMasker are available in the following sub-directories: 86 | 87 | - gendata: Codes for generating simulated datasets with correct answer mosaic repeat patterns. 88 | - check_uTR: Codes for checking the accuracy of uTR according to the correct answer patterns. 
89 | - parse_RepeatMasker: Codes for parsing the output of RepeatMasker after entering the simulated datasets (named $TR_file) into RepeatMasker by issuing the command: "repeatmasker -e hmmer -noint -pa 4 -div 0 -xsmall $TR_file" 90 | - check_RepeatMasker: Codes for checking the accuracy of RepeatMasker. 91 | 92 | Use "make.sh" to compile all codes in the above subdirectories. 93 | 94 | Use "test.sh" to perform all the steps in a batch manner. The script reports the accuracy of uTR and RepeatMasker in "accuracy_uTR.txt" and "accuracy_RepeatMasker.txt". For example, the top six rows of accuracy_uTR.txt are: 95 | 96 | AC AG 97 | 2 20 1000 1000 42 98 | 2 50 1000 1000 104 99 | 2 100 1000 1000 209 100 | 2 200 1000 1000 403 101 | 2 500 998 1000 1011 102 | 103 | The first row shows that 1000 instances of mosaic tandem repeats of the form (AC)m (AG)n are generated at random. In the second row, "2 20" implies that the values of variables m and n ranged from 2 to 20, "1000 1000" means that 1000 of 1000 mosaic tandem repeat patterns were predicted correctly, and the last "42" shows the average length of all patterns. Similarly, in the sixth row, "2 500" shows that m and n ranged from 2 to 500, "998 1000" means 998 of 1000 patterns were correctly estimated, and the last "1011" shows the average length. 104 | 105 | ## Real data 106 | 107 | The following read data are found in the directory: 108 | 109 | realdata/realdata.fasta 110 | 111 | The fasta file includes the following DNA sequences: 112 | 113 | - SAND12(control,BAFME) hg38_dna range=chr8:118366813-118366928 in the 4th intron of SAMD12, pattern=(AAAAT)23 114 | 115 | - SAND12(case,BAFME) Tandem repeat in the 4th intron of SAND12 found in Patient II-1 in family F6115 (Supplementary Figure 6, Ishiura, H. et al. Expansions of intronic TTTCA and TTTTA repeats in benign adult familial myoclonic epilepsy. 
Nat Genet 50, 581–590 (2018)) pattern=(ATTTT)221(ATTTC)221(ATTTT)82 116 | 117 | - SAND12(case,BAFME) Tandem repeat in the 4th intron of SAND12 found in (Supplementary Figure 6, Ishiura, H. et al. Expansions of intronic TTTCA and TTTTA repeats in benign adult familial myoclonic epilepsy. Nat Genet 50, 581–590 (2018)) pattern=(ATTTT)613(ATTTC)320(ATTTT)5(ATTTC)130 118 | 119 | - RFC1(control,CANVAS) hg38_dna range=chr4:39348425-39348483 pattern=(AAAAG)11 120 | 121 | - KAZN(control) hg38_dna range=chr1:14883297-14883426 pattern=(AAAG)6(AG)11(AAAG)20 122 | 123 | - ZNF37A(control) hg38_dna range=chr10:38112731-38112826 pattern=(CTTTT)12(CTTGT)3(CTTTT)2 124 | 125 | 126 | ## Handling a fasta file output by cTR 127 | 128 | To reduce computation time, it is reasonable to generate smaller input Fasta files. For this purpose, cTR is useful because it clusters reads into groups of similar reads and is available at: 129 | 130 | https://github.com/morisUtokyo/cTR 131 | 132 | cTR writes the following information into the annotation of the representative read (with sampleID,readID) of each group, and uTR can read this annotation as input. 133 | 134 | > GroupSize = N, Diameter = D, RadiusFromCentroid = R, CentroidReadName = sampleID,readID, CentroidReadLength = L 135 | 136 | For example: 137 | 138 | > GroupSize = 10, Diameter = 4, RadiusFromCentroid = 4, CentroidReadName = sampleID,readID, CentroidReadLength = 166 139 | 140 | uTR parses the above information and outputs the annotation: 141 | 142 | > #Info (166,10,2,0.01) #Pat <AAAG>6<AG>25<AAAG>23 #Decomp [2 (0,AAAG,4,116,116) (1,AG,2,50,50)] 143 | 144 | - #Info (166,10,2,0.01): 166 shows the length of the centroid, 10 is the number of elements in the group, 2 is the number of key repeat units in the centroid read, and 0.01 is the mismatch ratio between the read and the decomposition \<AAAG\>6\<AG\>25\<AAAG\>23 of the above input string, which concatenates 6 copies of AAAG, 25 copies of AG, and 23 copies of AAAG. 
145 | 146 | - #Decomp [2 (0,AAAG,4,116,116) (1,AG,2,50,50)]: The first 2 shows the number of repeat units. In tuple (0,AAAG,4,116,116), 0 is the identifier of the unit, AAAG is the string of the unit, 4 is the unit length, 116 is the total number of bases in the unit occurrences, and the last 116 is the total number of bases in the tandem repeats of the unit. Similarly, in tuple (1,AG,2,50,50), 1 is the identifier, AG is the unit string, 2 is the length of AG, 50 is the total number of bases in the unit occurrences, and the last 50 is the total number of bases in the tandem repeats of the unit. 147 | 148 | - At the end of the fasta file, all repeat units used in the decompositions of reads are appended. Each repeat unit is annotated with its occurrence frequency in reads. 149 | 150 | To process the above information associated with sampleID and readID, several input parameters can be used. 151 | 152 | uTR [-sx] [-i output table file] [-p output summary statistics with TR patterns] [-h input haplotype file] 153 | 154 | -s : Print the pair of the sample identifier ID and the name of the read (say, read1) collected from the sample, where the pair of ID and read1 identifies the centroid of a group of strings. If ID and read name are unavailable, NAs are output. 155 | 156 | > #Info (ID,read1,166,10,0.01) #Pat <AAAG>6<AG>25<AAAG>23 #Decomp [2 (0,AAAG,4,116,116) (1,AG,2,50,50)] 157 | 158 | -i : Output a table file in which each line shows, for example: 159 | 160 | nID,read1 (166,10,2,0.01) [2 (0,AAAG,4,116,116) (1,AG,2,50,50)] 161 | 162 | -x : Output the following summary statistics to the standard output. 163 | 164 | #haplotypes=100 (ID1,read1,245,80,2,0.18) [2 (0,CCG,3,221,221) (1,GT,2,24,24)] (ID2,read3,248,43,2,0.18) [2 (0,CCG,3,224,224) (1,GT,2,24,24)] ... 165 | 166 | -p : Output summary statistics with tandem repeat patterns to the file whose name follows "-p". The output begins with the number of haplotypes with different tandem repeats and then shows a list of tandem repeat patterns. For example, 167 | 168 | #haplotypes=1000 (166,10,0.01) <AAAG>6<AG>25<AAAG>23 ... 
169 | 170 | -h : Feed SNV information surrounding a input TR in a read; namely, a list of tuples of the form sampleID, readID, and a pair of SNV positions closest to the focal TR (e.g., 14882386|14883645, where two positions are separated by the bar "|"). For each read, the pair of nearest SNVs is put into the annotation of the read with tag #Hap. 171 | 172 | 173 | ## Citation 174 | 175 | When using this software program, please cite: 176 | 177 | Masutani, Bansho, Kawahara, Riki & Morishita, Shinichi. Decomposing mosaic tandem repeats accurately from long reads. Bioinformatics 39, 1–6 (2023). 178 | 179 | https://academic.oup.com/bioinformatics/article/39/4/btad185/7114028 180 | -------------------------------------------------------------------------------- /coverage_by_long_units_nsop_Z.cpp: -------------------------------------------------------------------------------- 1 | #include 2 | //#include 3 | #include 4 | #include 5 | #include 6 | #include 7 | #include 8 | #include 9 | #include 10 | #include 11 | #include 12 | #include 13 | #include 14 | #include "uTR.h" 15 | using namespace std; 16 | 17 | class Alignment{ 18 | public: 19 | int start_x, start_y, end_x, end_y; 20 | float initial_score, score; 21 | string name; 22 | Alignment* predecessor; 23 | 24 | Alignment(int x0, int y0, int x1, int y1, float s, string n1){ 25 | start_x = x0; 26 | start_y = y0; 27 | end_x = x1; 28 | end_y = y1; 29 | initial_score = s; 30 | score = s; 31 | predecessor = 0; 32 | } 33 | void print()const{ 34 | //if(name.length() > 0) cout << name << "\t"; 35 | cout << "(" << start_x << "," << start_y << ")\t-> (" << 36 | end_x << "," << end_y << ")\t score = " << score << endl; 37 | } 38 | bool extend(char *S, int n, char *unit, int unitLen, int *covered, int MIN_number_repetitions ){ 39 | int x, y; 40 | int numMismatches = 0; 41 | int maxMismatches = ceil(unitLen * MAX_DIS_RATIO); 42 | int numCopies = 0; 43 | 44 | for(x=end_x, y=end_y; x maxMismatches) // If the number of mismatches 
exceed the threshold, stop extension. 50 | break; 51 | else{x++, y++;} 52 | } 53 | // Update the ends if the length is unitLen or more. 54 | if(unitLen <= x - end_x){ 55 | end_x = x; end_y = y; 56 | numMismatches = 0; 57 | numCopies++; 58 | } 59 | } 60 | // The unit must be duplicated. 61 | if(MIN_number_repetitions <= numCopies){ 62 | // Use the last portion if the number of mismatches <= threshold 63 | if(numMismatches <= maxMismatches){ 64 | end_x = x; end_y = y; 65 | } 66 | for(x=start_x, y=start_y; x set_of_alignments; 78 | for(int l=0; l ub || lb < 0 || n-1 < ub) break; 88 | } 89 | for(int k=lb; k<=ub; k++) 90 | // Insert an alignment staring from (SA[k],l) of length WINDOW_LEN. 91 | set_of_alignments.insert( new Alignment( SA[k], l, SA[k] + WINDOW_LEN, l + WINDOW_LEN, WINDOW_LEN, "") ); // Half-open intervals are used 92 | } 93 | // Seed extension in a greedy manner 94 | multimap sorted_by_X; 95 | for(set::iterator A = set_of_alignments.begin(); 96 | A != set_of_alignments.end();A++){ 97 | sorted_by_X.insert(make_pair((*A)->start_x,*A)); 98 | } 99 | int aln_start_x; // The start position of a new alignment 100 | int aln_next_y = 0; // The y coord of the new alignment must be closest to this. 
101 | // Scan sorted_by_X in the ascending order 102 | for(multimap::iterator P = sorted_by_X.begin(); 103 | P != sorted_by_X.end(); ) 104 | { 105 | aln_start_x = P->second->start_x; // The start position of a new alignment 106 | // Find the seed alignment closest to aln_next_y 107 | auto seedP = P; // Candidate of the seed 108 | int min_distance_P = (seedP->second->start_y - aln_next_y) % unitLen; 109 | auto nextP = P; nextP++; // Iterator 110 | while(nextP != sorted_by_X.end() && nextP->second->start_x == aln_start_x){ 111 | int distance_nextP = (nextP->second->start_y - aln_next_y) % unitLen; 112 | if(distance_nextP < min_distance_P){ 113 | min_distance_P = distance_nextP; 114 | seedP = nextP; 115 | } 116 | nextP++; 117 | } 118 | // Extend the seed alignment 119 | if( seedP->second->extend(S, n, unit, unitLen, covered, MIN_number_repetitions ) ){ 120 | aln_next_y = seedP->second->end_y; 121 | while( P != sorted_by_X.end() ){ 122 | if( P->second->start_x <= seedP->second->end_x ) 123 | P++; 124 | else 125 | break; 126 | } 127 | }else 128 | P++; // Move to the next 129 | } 130 | // Delete all alignments from set_of_alignments 131 | for(set::iterator iter = set_of_alignments.begin(); iter != set_of_alignments.end(); iter++){ 132 | delete *iter; 133 | } 134 | } 135 | 136 | 137 | //------------------------------------------------------------------- 138 | // 139 | // Riki Kawahara has the copyright of the following program. 140 | // Shinichi Morishita added an API on June 28, 2021. 
//
//-------------------------------------------------------------------

// One induced-sorting pass of suffix-array construction.
//
//   vec       : input sequence; every value must lie in [0, val_range).
//   val_range : size of the alphabet (number of buckets).
//   sa        : output array, same length as vec; it is overwritten.
//   sl        : sl[i] is true when suffix i is larger than suffix i+1
//               (computed by sa_is below).
//   lms_idx   : seed positions, listed in the order in which they should
//               appear inside their buckets.
//
// The seeds are written to the back of their buckets, then one
// left-to-right scan fills in the positions whose sl flag is true and one
// right-to-left scan fills in the positions whose sl flag is false.
void induced_sort(std::vector<int> &vec, int val_range, std::vector<int> &sa,
                  std::vector<bool> &sl, std::vector<int> &lms_idx) {
  // Nothing to sort; also avoids reading sa[-1] in the backward scan below.
  if (sa.empty()) return;

  // After the partial sums, l[c] is the first slot of bucket c and r[c] is
  // one past its last slot.
  std::vector<int> l(val_range, 0), r(val_range, 0);
  for (int c : vec) {
    if (c + 1 < val_range) ++l[c + 1];
    ++r[c];
  }
  std::partial_sum(l.begin(), l.end(), l.begin());
  std::partial_sum(r.begin(), r.end(), r.begin());

  std::fill(sa.begin(), sa.end(), -1);

  // Seed: walk the seeds backwards so that they end up in the given order
  // at the back of each bucket.
  for (int i = static_cast<int>(lms_idx.size()) - 1; i >= 0; --i) {
    sa[--r[vec[lms_idx[i]]]] = lms_idx[i];
  }

  // Forward scan. The loop reads each slot by value at the moment it is
  // visited; slots written here always lie further to the right.
  for (int i : sa) {
    if (i >= 1 && sl[i - 1]) {
      sa[l[vec[i - 1]]++] = i - 1;
    }
  }

  // Backward scan: bucket ends are recomputed because seeding consumed them.
  std::fill(r.begin(), r.end(), 0);
  for (int c : vec) ++r[c];
  std::partial_sum(r.begin(), r.end(), r.begin());
  for (int k = static_cast<int>(sa.size()) - 1, i = sa[k]; k >= 1;
       --k, i = sa[k]) {
    if (i >= 1 && !sl[i - 1]) {
      sa[--r[vec[i - 1]]] = i - 1;
    }
  }
}

// Suffix array of an integer sequence by induced sorting with recursion on
// the LMS substrings (SA-IS).
//
//   vec       : input sequence with values in [0, val_range).
//   val_range : alphabet size.
//   returns   : the start positions of all suffixes of vec in sorted order.
//
// Precondition for sequences of length >= 2: the last element must be a
// sentinel that is strictly smaller than every other element (the code
// gives position n-1 the smallest name unconditionally). suffix_array()
// below establishes this. Sequences of length 0 or 1 are answered directly.
std::vector<int> sa_is(std::vector<int> &vec, int val_range) {
  const int n = static_cast<int>(vec.size());
  // Trivial inputs: the general path below would index sl[-1].
  if (n == 0) return {};
  if (n == 1) return {0};

  std::vector<int> sa(n), lms_idx;
  std::vector<bool> sl(n);

  // Classify suffixes from right to left and collect the LMS positions,
  // i.e. positions i+1 where sl[i] is true and sl[i+1] is false.
  sl[n - 1] = false;
  for (int i = n - 2; i >= 0; --i) {
    sl[i] = (vec[i] > vec[i + 1] || (vec[i] == vec[i + 1] && sl[i + 1]));
    if (sl[i] && !sl[i + 1]) lms_idx.push_back(i + 1);
  }
  std::reverse(lms_idx.begin(), lms_idx.end());

  // First pass: sorts the LMS substrings.
  induced_sort(vec, val_range, sa, sl, lms_idx);

  // LMS positions in the order produced by the first pass.
  std::vector<int> new_lms_idx(lms_idx.size()), lms_vec(lms_idx.size());
  for (int i = 0, k = 0; i < n; ++i) {
    if (!sl[sa[i]] && sa[i] >= 1 && sl[sa[i] - 1]) {
      new_lms_idx[k++] = sa[i];
    }
  }

  // Name the LMS substrings; sa is reused as scratch space holding the name
  // of the LMS substring that starts at each position.
  int cur = 0;
  sa[n - 1] = cur;
  for (std::size_t k = 1; k < new_lms_idx.size(); ++k) {
    const int i = new_lms_idx[k - 1];
    const int j = new_lms_idx[k];
    if (vec[i] != vec[j]) {
      sa[j] = ++cur;
      continue;
    }
    // Compare the two LMS substrings symbol by symbol up to the next LMS
    // position of either one.
    bool differs = false;
    for (int a = i + 1, b = j + 1;; ++a, ++b) {
      if (vec[a] != vec[b]) {
        differs = true;
        break;
      }
      const bool a_is_lms = !sl[a] && sl[a - 1];
      const bool b_is_lms = !sl[b] && sl[b - 1];
      if (a_is_lms || b_is_lms) {
        differs = !(a_is_lms && b_is_lms);
        break;
      }
    }
    sa[j] = (differs ? ++cur : cur);
  }
  for (std::size_t i = 0; i < lms_idx.size(); ++i) {
    lms_vec[i] = sa[lms_idx[i]];
  }

  // Names are not unique yet: sort the reduced sequence recursively.
  if (cur + 1 < static_cast<int>(lms_idx.size())) {
    auto lms_sa = sa_is(lms_vec, cur + 1);
    for (std::size_t i = 0; i < lms_idx.size(); ++i) {
      new_lms_idx[i] = lms_idx[lms_sa[i]];
    }
  }

  // Second pass: induce the full suffix array from the sorted LMS suffixes.
  induced_sort(vec, val_range, sa, sl, new_lms_idx);

  return sa;
}

// Suffix array of s: the start positions 0..s.size()-1 ordered by the
// suffixes they begin, compared byte-wise as unsigned values.
//
// Each byte is mapped to (unsigned char value + 1) and a sentinel 0 is
// appended, so the sentinel is smaller than every input byte and no byte can
// index outside the 257 buckets. The sentinel's own entry (always first) is
// removed before returning. An empty string yields an empty vector.
std::vector<int> suffix_array(std::string s) {
  const int n = static_cast<int>(s.size());
  std::vector<int> vec(n + 1);
  for (int i = 0; i < n; ++i) {
    vec[i] = static_cast<int>(static_cast<unsigned char>(s[i])) + 1;
  }
  vec[n] = 0;  // sentinel
  auto sa = sa_is(vec, 257);
  sa.erase(sa.begin());
  return sa;
}

// LCP array for s and its suffix array sa (sa must hold s.size() entries).
//
// On return lcp[k], for 0 <= k < n-1, is the length of the longest common
// prefix of the suffixes starting at sa[k] and sa[k+1]; the last entry
// lcp[n-1] is left at 0. The running length h drops by at most one between
// consecutive text positions, so it is carried over instead of restarted.
std::vector<int> lcp_array(std::string &s, std::vector<int> &sa) {
  const int n = static_cast<int>(s.size());
  std::vector<int> rnk(n);
  for (int i = 0; i < n; ++i) rnk[sa[i]] = i;
  std::vector<int> lcp(n);  // lcp(n - 1);
  int h = 0;
  for (int i = 0; i < n; ++i) {
    if (h > 0) --h;
    if (rnk[i] == 0) continue;
    for (int j = sa[rnk[i] - 1]; j + h < n && i + h < n; ++h) {
      if (s[i + h] != s[j + h]) break;
    }
    lcp[rnk[i] - 1] = h;
  }
  return lcp;
}

/* NOTE(review): the remainder of this file (dump_int_array and the scan over
   the LCP array that calls nsop at local maxima) is damaged in this copy:
   the text between a '<' and a later '>' is missing, including original
   lines 260-313. It is kept verbatim below rather than reconstructed by
   guesswork; restore it from the upstream source before building.

258 | void dump_int_array(int *a, int len, string name){ 259 | cout << name << "\t"; 260 | for(int i=0; i lcp[i]){ 314 | if(up_state == 1){ 315 | // Position max_i is locally maximum. 316 | // Search non-self-overlapping substrings. 317 | nsop( s.substr(sa[max_i], lcp[max_i]), sa[max_i] ); 318 | up_state = 0; 319 | } 320 | max_lcp = lcp[i]; 321 | max_i = i; 322 | }else{ 323 | // max_lcp == lcp[i] 324 | // Untouch up_state, max_lcp, and max_i 325 | } 326 | } 327 | } 328 | 329 | --------------------------------------------------------------------------------
*/