├── tests ├── input │ ├── empty.txt │ ├── duplicate_rsids.csv │ ├── ancestry_mt.txt │ ├── chromosomes.csv │ ├── generic_no_header.tsv │ ├── GRCh37.csv │ ├── GRCh38.csv │ ├── NCBI36.csv │ ├── GRCh37_PAR.csv │ ├── generic.fa │ ├── generic.csv │ ├── generic.tsv │ ├── tellmeGen.txt │ ├── generic_header_comment.tsv │ ├── generic_multi_rsid.tsv │ ├── generic_non_standard_columns.tsv │ ├── generic_extra_column.tsv │ ├── ftdna.csv │ ├── codigo46.txt │ ├── ftdna_famfinder.csv │ ├── circledna.txt │ ├── DNALand.txt │ ├── genesforgood.txt │ ├── myheritage.csv │ ├── livingdna.csv │ ├── sano.txt │ ├── 23andme_allele.txt │ ├── 23andme.txt │ ├── ancestry.txt │ ├── ancestry_multi_sep.txt │ ├── 23andme_win.txt │ ├── myheritage_extra_quotes.csv │ ├── unannotated_testvcf.vcf │ ├── testvcf.vcf │ ├── testvcf_phased.vcf │ ├── testvcf_chr_prefix.vcf │ ├── testvcf_multi_sample.vcf │ ├── mapmygenome_alt_header.txt │ ├── mapmygenome.txt │ └── discrepant_snps.csv ├── resources │ ├── dbsnp_151_37_reverse.txt │ ├── gsa_rsid_map.txt │ └── gsa_chrpos_map.txt ├── IndividualTest.php ├── SNPsTest.php └── Snps │ ├── IO │ ├── ReaderTest.php │ └── WriterTes.php │ ├── SnpsMergeTest.php │ └── SnpsTest.php ├── .github ├── FUNDING.yml └── ISSUE_TEMPLATE │ └── sweep-template.yml ├── .gitignore ├── .vscode └── settings.json ├── index.php ├── resources └── fasta │ └── GRCh37 │ └── Homo_sapiens.GRCh37.dna.chromosome.MT.fa.gz ├── rector.php ├── src ├── Snps │ ├── IO │ │ ├── IO.php │ │ ├── ExtraTabsFilter.php │ │ ├── AdditionalFile.php │ │ ├── SnpFileReader.php │ │ ├── PythonDependency.php │ │ ├── DataParser.php │ │ ├── CsvReader.php │ │ ├── PhpDataFrame.php │ │ └── Writer.php │ ├── Singleton.php │ ├── Analysis │ │ ├── BuildDetector.php │ │ └── ClusterOverlapCalculator.php │ ├── SNPData.php │ ├── Resources.php │ ├── SortTest.php │ ├── ReferenceSequence.php │ ├── SNPAnalyzer.php │ ├── Ensembl.php │ ├── ReferenceSequenceManager.php │ ├── DatasetDownloader.php │ ├── DocBlockChecker.php │ ├── 
AssemblyMappingManager.php │ ├── Utils.php │ ├── EnsemblRestClient.php │ ├── PythonDependency.php │ └── VariedicInherit.php ├── Utils │ └── ColorSchemeGenerator.php ├── KitLoader.php ├── Helpers │ └── CSVGenerator.php ├── Dna.php ├── Individual.php ├── Triangulation.php ├── Visualization.php ├── MatchKits.php └── Resources.php ├── phpunit.xml ├── .travis.yml ├── CONTRIBUTE.md ├── composer.json ├── LICENSE ├── sweep.yaml ├── README.md └── phpconvcount.py /tests/input/empty.txt: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /.github/FUNDING.yml: -------------------------------------------------------------------------------- 1 | github: liberu-genealogy 2 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | vendor/* 2 | .idea/* 3 | tmp/* 4 | .phpunit.result.cache -------------------------------------------------------------------------------- /.vscode/settings.json: -------------------------------------------------------------------------------- 1 | { 2 | "cmake.configureOnOpen": false 3 | } -------------------------------------------------------------------------------- /index.php: -------------------------------------------------------------------------------- 1 | generic test sequence:1:1:117 2 | NNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNN 3 | NNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNAGGCCGGACNNNNNNNN 4 | -------------------------------------------------------------------------------- /resources/fasta/GRCh37/Homo_sapiens.GRCh37.dna.chromosome.MT.fa.gz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/liberu-genealogy/php-dna/HEAD/resources/fasta/GRCh37/Homo_sapiens.GRCh37.dna.chromosome.MT.fa.gz 
-------------------------------------------------------------------------------- /tests/input/generic.csv: -------------------------------------------------------------------------------- 1 | rsid,chromosome,position,genotype 2 | rs1,1,101,AA 3 | rs2,1,102,CC 4 | rs3,1,103,GG 5 | rs4,1,104,TT 6 | rs5,1,105,-- 7 | rs6,1,106,GC 8 | rs7,1,107,TC 9 | rs8,1,108,AT 10 | -------------------------------------------------------------------------------- /tests/input/generic.tsv: -------------------------------------------------------------------------------- 1 | rsid chromosome position genotype 2 | rs1 1 101 AA 3 | rs2 1 102 CC 4 | rs3 1 103 GG 5 | rs4 1 104 TT 6 | rs5 1 105 -- 7 | rs6 1 106 GC 8 | rs7 1 107 TC 9 | rs8 1 108 AT 10 | -------------------------------------------------------------------------------- /tests/input/tellmeGen.txt: -------------------------------------------------------------------------------- 1 | rsid Chromosome position genotype 2 | 1:101 1 101 AA 3 | 1:102 1 102 CC 4 | 1:103 1 103 GG 5 | 1:104 1 104 TT 6 | 1:105 1 105 -- 7 | 1:106 1 106 GC 8 | 1:107 1 107 TC 9 | 1:108 1 108 AT -------------------------------------------------------------------------------- /tests/input/generic_header_comment.tsv: -------------------------------------------------------------------------------- 1 | # rsid chromosome position genotype 2 | rs1 1 101 AA 3 | rs2 1 102 CC 4 | rs3 1 103 GG 5 | rs4 1 104 TT 6 | rs5 1 105 -- 7 | rs6 1 106 GC 8 | rs7 1 107 TC 9 | rs8 1 108 AT 10 | -------------------------------------------------------------------------------- /tests/input/generic_multi_rsid.tsv: -------------------------------------------------------------------------------- 1 | rsid chromosome position genotype 2 | rs1 1 101 AA 3 | rs2 1 102 CC 4 | rs3 1 103 GG 5 | rs4 1 104 TT 6 | rs5 1 105 -- 7 | rs6,rs9 1 106 GC 8 | rs7 1 107 TC 9 | rs8 1 108 AT 10 | -------------------------------------------------------------------------------- 
/tests/input/generic_non_standard_columns.tsv: -------------------------------------------------------------------------------- 1 | rsid chromosome position genotype_other 2 | rs1 1 101 AA 3 | rs2 1 102 CC 4 | rs3 1 103 GG 5 | rs4 1 104 TT 6 | rs5 1 105 -- 7 | rs6 1 106 GC 8 | rs7 1 107 TC 9 | rs8 1 108 AT 10 | -------------------------------------------------------------------------------- /tests/input/generic_extra_column.tsv: -------------------------------------------------------------------------------- 1 | rsid chromosome position genotype extra 2 | rs1 1 101 AA 1 3 | rs2 1 102 CC 1 4 | rs3 1 103 GG 2 5 | rs4 1 104 TT 2 6 | rs5 1 105 -- 2 7 | rs6 1 106 GC 2 8 | rs7 1 107 TC 2 9 | rs8 1 108 AT 2 10 | -------------------------------------------------------------------------------- /tests/resources/gsa_chrpos_map.txt: -------------------------------------------------------------------------------- 1 | Name Chr MapInfo deCODE(cM) 2 | 1:101 1 101 0.0000 3 | 1:102 1 102 0.0000 4 | 1:103 1 103 0.0000 5 | 1:104 1 104 0.0000 6 | 1:105 1 105 0.0000 7 | rs6 1 106 0.0000 8 | rs7 1 107 0.0000 9 | rs8 1 108 0.0000 10 | -------------------------------------------------------------------------------- /tests/input/ftdna.csv: -------------------------------------------------------------------------------- 1 | RSID,CHROMOSOME,POSITION,RESULT 2 | "rs1","1","101","AA" 3 | "rs2","1","102","CC" 4 | "rs3","1","103","GG" 5 | "rs4","1","104","TT" 6 | "rs5","1","105","--" 7 | "rs6","1","106","GC" 8 | "rs7","1","107","TC" 9 | "rs8","1","108","AT" 10 | -------------------------------------------------------------------------------- /tests/input/codigo46.txt: -------------------------------------------------------------------------------- 1 | [Header] 2 | Content CODIGO46.bpm 3 | [Data] 4 | Sample Name SNP Name Allele1 - Plus Allele2 - Plus 5 | 123 1:101 A A 6 | 123 1:102 C C 7 | 123 1:103 G G 8 | 123 1:104 T T 9 | 123 1:105 - - 10 | 123 rs6 G C 11 | 123 rs7 T C 12 | 123 rs8 A T 
-------------------------------------------------------------------------------- /rector.php: -------------------------------------------------------------------------------- 1 | sets([LevelSetList::UP_TO_PHP_82]); 10 | }; 11 | -------------------------------------------------------------------------------- /tests/input/ftdna_famfinder.csv: -------------------------------------------------------------------------------- 1 | # famfinder, https://www.familytreedna.com 2 | # 3 | # name,chromosome,position,allele1,allele2 4 | rs1,1,101,A,A 5 | rs2,1,102,C,C 6 | rs3,1,103,G,G 7 | rs4,1,104,T,T 8 | rs5,1,105,-,- 9 | rs6,1,106,G,C 10 | rs7,1,107,T,C 11 | rs8,1,108,A,T 12 | -------------------------------------------------------------------------------- /tests/input/circledna.txt: -------------------------------------------------------------------------------- 1 | # Circle 2 | # 3 | # 4 | # MARKERNAME CHROM POS GT 5 | chr1:1:A chr1 1 A/A 6 | rs1 chr1 101 A/A 7 | rs2 chr1 102 C/C 8 | rs3 chr1 103 G/G 9 | rs4 chr1 104 T/T 10 | rs6 chr1 106 G/C 11 | rs7 chr1 107 T/C 12 | rs8 chr1 108 A/T 13 | chr1:1001:A chr1 1001 A/A 14 | -------------------------------------------------------------------------------- /tests/input/DNALand.txt: -------------------------------------------------------------------------------- 1 | # DNA.Land 2 | # 3 | # 4 | # 5 | # 6 | # 7 | # 8 | # 9 | # 10 | # 11 | # 12 | # 13 | # 14 | # 15 | # rsid chromosome position genotype 16 | rs1 1 101 AA 17 | rs2 1 102 CC 18 | rs3 1 103 GG 19 | rs4 1 104 TT 20 | rs5 1 105 -- 21 | rs6 1 106 GC 22 | rs7 1 107 TC 23 | rs8 1 108 AT -------------------------------------------------------------------------------- /tests/input/genesforgood.txt: -------------------------------------------------------------------------------- 1 | # Genes for Good 2 | # 3 | # 4 | # 5 | # 6 | # 7 | # 8 | # 9 | # 10 | # rsid chromosome position genotype 11 | rs1 1 101 AA 12 | rs2 1 102 CC 13 | rs3 1 103 GG 14 | rs4 1 104 TT 15 | rs5 1 105 -- 16 | 
rs6 1 106 GC 17 | rs7 1 107 TC 18 | rs8 1 108 AT 19 | -------------------------------------------------------------------------------- /tests/input/myheritage.csv: -------------------------------------------------------------------------------- 1 | # MyHeritage, https://www.myheritage.com 2 | RSID,CHROMOSOME,POSITION,RESULT 3 | "rs1","1","101","AA" 4 | "rs2","1","102","CC" 5 | "rs3","1","103","GG" 6 | "rs4","1","104","TT" 7 | "rs5","1","105","--" 8 | "rs6","1","106","GC" 9 | "rs7","1","107","TC" 10 | "rs8","1","108","AT" 11 | -------------------------------------------------------------------------------- /tests/input/livingdna.csv: -------------------------------------------------------------------------------- 1 | # Living DNA 2 | # 3 | # 4 | # 5 | # 6 | # 7 | # 8 | # 9 | # 10 | # 11 | # 12 | # 13 | # 14 | # 15 | # rsid chromosome position genotype 16 | rs1 1 101 AA 17 | rs2 1 102 CC 18 | rs3 1 103 GG 19 | rs4 1 104 TT 20 | rs5 1 105 -- 21 | rs6 1 106 GC 22 | rs7 1 107 TC 23 | rs8 1 108 AT 24 | -------------------------------------------------------------------------------- /tests/input/sano.txt: -------------------------------------------------------------------------------- 1 | [Header] 2 | Content SANO 3 | [Data] 4 | Sample Name SNP Name Chr Position Allele1 - Forward Allele2 - Forward 5 | 123 1:101 1 101 A A 6 | 123 1:102 1 102 C C 7 | 123 1:103 1 103 G G 8 | 123 1:104 1 104 T T 9 | 123 1:105 1 105 - - 10 | 123 rs6 1 106 G C 11 | 123 rs7 1 107 A G 12 | 123 rs8 1 108 T A -------------------------------------------------------------------------------- /tests/input/23andme_allele.txt: -------------------------------------------------------------------------------- 1 | # 23andMe 2 | # 3 | # 4 | # 5 | # 6 | # 7 | # 8 | # 9 | # 10 | # 11 | # 12 | # 13 | # 14 | # 15 | # rsid chromosome position allele1 allele2 16 | rs1 1 101 A A 17 | rs2 1 102 C C 18 | rs3 1 103 G G 19 | rs4 1 104 T T 20 | rs5 1 105 - - 21 | rs6 1 106 G C 22 | rs7 1 107 T C 23 | rs8 1 108 A T 24 | 
-------------------------------------------------------------------------------- /tests/input/23andme.txt: -------------------------------------------------------------------------------- 1 | # 23andMe 2 | # 3 | # 4 | # 5 | # 6 | # 7 | # 8 | # 9 | # 10 | # 11 | # 12 | # 13 | # 14 | # 15 | # rsid chromosome position genotype 16 | rs1 1 101 AA 17 | rs2 1 102 CC 18 | rs3 1 103 GG 19 | rs4 1 104 TT 20 | rs5 1 105 -- 21 | rs6 1 106 GC 22 | rs7 1 107 TC 23 | rs8 1 108 AT 24 | rs9 -- 109 AT 25 | rs10 -- -- AT 26 | -------------------------------------------------------------------------------- /tests/input/ancestry.txt: -------------------------------------------------------------------------------- 1 | #AncestryDNA 2 | # 3 | # 4 | # 5 | # 6 | # 7 | # 8 | # 9 | # 10 | # 11 | # 12 | # 13 | # 14 | # 15 | # 16 | # 17 | # 18 | # 19 | rsid chromosome position allele1 allele2 20 | rs1 1 101 A A 21 | rs2 1 102 C C 22 | rs3 1 103 G G 23 | rs4 1 104 T T 24 | rs5 1 105 0 0 25 | rs6 1 106 G C 26 | rs7 1 107 T C 27 | rs8 1 108 A T 28 | -------------------------------------------------------------------------------- /src/Snps/IO/IO.php: -------------------------------------------------------------------------------- 1 | null, "chrom" => null, "pos" => null, "genotype" => null); 10 | // $df = array(); 11 | // $df[] = $columns; 12 | return []; 13 | } 14 | } 15 | -------------------------------------------------------------------------------- /tests/input/ancestry_multi_sep.txt: -------------------------------------------------------------------------------- 1 | #AncestryDNA 2 | # 3 | # 4 | # 5 | # 6 | # 7 | # 8 | # 9 | # 10 | # 11 | # 12 | # 13 | # 14 | # 15 | # 16 | # 17 | # 18 | # 19 | rsid chromosome position allele1 allele2 20 | rs1 1 101 A A 21 | rs2 1 102 C C 22 | rs3 1 103 G G 23 | rs4 1 104 T T 24 | rs5 1 105 0 0 25 | rs6 1 106 G C 26 | rs7 1 107 T C 27 | rs8 1 108 A T 28 | -------------------------------------------------------------------------------- 
/tests/input/23andme_win.txt: -------------------------------------------------------------------------------- 1 | # 23andMe 2 | # 3 | # 4 | # 5 | # 6 | # 7 | # 8 | # 9 | # 10 | # 11 | # 12 | # 13 | # 14 | # 15 | # rsid chromosome position genotype 16 | rs1 1 101 AA 17 | rs2 1 102 CC 18 | rs3 1 103 GG 19 | rs4 1 104 TT 20 | rs5 1 105 -- 21 | rs6 1 106 GC 22 | rs7 1 107 TC 23 | rs8 1 108 AT 24 | rs9 -- 109 AT 25 | rs10 -- -- AT 26 | -------------------------------------------------------------------------------- /tests/input/myheritage_extra_quotes.csv: -------------------------------------------------------------------------------- 1 | # MyHeritage, https://www.myheritage.com 2 | RSID,CHROMOSOME,POSITION,RESULT 3 | "rs1"",""1"",""101"",""AA" 4 | "rs2"",""1"",""102"",""CC" 5 | "rs3"",""1"",""103"",""GG" 6 | "rs4"",""1"",""104"",""TT" 7 | "rs5"",""1"",""105"",""--" 8 | "rs6"",""1"",""106"",""GC" 9 | "rs7"",""1"",""107"",""TC" 10 | "rs8"",""1"",""108"",""AT" 11 | -------------------------------------------------------------------------------- /phpunit.xml: -------------------------------------------------------------------------------- 1 | 4 | 5 | 6 | 7 | ./tests 8 | 9 | 10 | 11 | 12 | -------------------------------------------------------------------------------- /.travis.yml: -------------------------------------------------------------------------------- 1 | language: php 2 | 3 | php: 4 | - 5.6 5 | - 5.5 6 | - 5.4 7 | - 7 8 | - hhvm 9 | - hhvm-nightly 10 | 11 | matrix: 12 | fast_finish: true 13 | allow_failures: 14 | - php: 7 15 | - php: hhvm 16 | - php: hhvm-nightly 17 | 18 | before_script: 19 | - composer install --prefer-dist --dev 20 | 21 | script: 22 | - vendor/bin/phpunit -c tests/phpunit.xml tests/ 23 | - vendor/bin/phpcs --standard=PSR2 -n library/ tests/ 24 | -------------------------------------------------------------------------------- /src/Utils/ColorSchemeGenerator.php: -------------------------------------------------------------------------------- 
/**
 * Produces evenly spaced, fully saturated colours around the HSL colour wheel.
 */
class ColorSchemeGenerator
{
    /**
     * Build a list of CSS `hsl()` colour strings whose hues are spread evenly over 0..359.
     *
     * @param int $numColors Number of colours to generate; values below 1 yield an empty list.
     *
     * @return array List of strings of the form "hsl(<hue>, 100%, 50%)", one per colour.
     */
    public static function generate($numColors): array
    {
        if ($numColors < 1) {
            return [];
        }

        $colors = [];
        for ($i = 0; $i < $numColors; $i++) {
            // Truncate explicitly: applying `%` to a fractional float triggers the
            // "implicit conversion from float to int" deprecation on PHP 8.1+.
            // Because $i < $numColors the hue is always below 360, so no modulo is needed.
            $hue = (int) ($i * 360 / $numColors);
            $colors[] = "hsl(" . $hue . ", 100%, 50%)";
        }

        return $colors;
    }
}
GT ./0 24 | -------------------------------------------------------------------------------- /src/Helpers/CSVGenerator.php: -------------------------------------------------------------------------------- 1 | 6 | #CHROM POS ID REF ALT QUAL FILTER INFO FORMAT SAMPLEID 7 | 1 101 rs1 A . . . . GT 0|0 8 | 1 102 rs2 . C . . . GT 1|1 9 | 1 103 rs3 G T . . . GT 0|0 10 | 1 104 rs4 C T . . . GT 1|1 11 | 1 105 rs5 C . . . . GT ./. 12 | 1 106 rs6 G C . . . GT 0|1 13 | 1 107 rs7 G T,C . . . GT 1|2 14 | 1 108 rs8 A T . . . GT 0|1 15 | 1 109 . C T . . . GT 0|1 16 | 1 110 rs10 A AGC . . . GT 0|1 17 | 1 111 rs11 AGC A . . . GT 0|1 18 | 1 112 rs12 . A . . . GT 0|1 19 | 1 113 rs13 . A . . . GT 1|0 20 | 1 114 rs14 A . . . . GT 0|1 21 | 1 115 rs15 A . . . . GT 1|0 22 | 1 116 rs16 A A . . . GT 0|. 23 | 1 117 rs17 A A . . . GT .|0 24 | -------------------------------------------------------------------------------- /tests/input/testvcf_chr_prefix.vcf: -------------------------------------------------------------------------------- 1 | ##fileformat=VCFv4.1 2 | ##fileDate=20190527 3 | ## 4 | ## 5 | ##FORMAT= 6 | #CHROM POS ID REF ALT QUAL FILTER INFO FORMAT SAMPLEID 7 | chr1 101 rs1 A . . . . GT 0/0 8 | chr1 102 rs2 . C . . . GT 1/1 9 | chr1 103 rs3 G T . . . GT 0|0 10 | chr1 104 rs4 C T . . . GT 1/1 11 | chr1 105 rs5 C . . . . GT ./. 12 | chr1 106 rs6 G C . . . GT 0/1 13 | chr1 107 rs7 G T,C . . . GT 1/2 14 | chr1 108 rs8 A T . . . GT 0/1 15 | chr1 109 . C T . . . GT 0/1 16 | chr1 110 rs10 A AGC . . . GT 0/1 17 | chr1 111 rs11 AGC A . . . GT 0/1 18 | chr1 112 rs12 . A . . . GT 0/1 19 | chr1 113 rs13 . A . . . GT 1/0 20 | chr1 114 rs14 A . . . . GT 0/1 21 | chr1 115 rs15 A . . . . GT 1/0 22 | chr1 116 rs16 A A . . . GT 0/. 23 | chr1 117 rs17 A A . . . 
GT ./0 24 | -------------------------------------------------------------------------------- /.github/ISSUE_TEMPLATE/sweep-template.yml: -------------------------------------------------------------------------------- 1 | name: Sweep Issue 2 | title: 'Sweep: ' 3 | description: For small bugs, features, refactors, and tests to be handled by Sweep, an AI-powered junior developer. 4 | labels: sweep 5 | body: 6 | - type: textarea 7 | id: description 8 | attributes: 9 | label: Details 10 | description: Tell Sweep where and what to edit and provide enough context for a new developer to the codebase 11 | placeholder: | 12 | Unit Tests: Write unit tests for . Test each function in the file. Make sure to test edge cases. 13 | Bugs: The bug might be in . Here are the logs: ... 14 | Features: the new endpoint should use the ... class from because it contains ... logic. 15 | Refactors: We are migrating this function to ... version because ... -------------------------------------------------------------------------------- /tests/input/testvcf_multi_sample.vcf: -------------------------------------------------------------------------------- 1 | ##fileformat=VCFv4.1 2 | ##fileDate=20190527 3 | ## 4 | ## 5 | ##FORMAT= 6 | #CHROM POS ID REF ALT QUAL FILTER INFO FORMAT SAMPLEID MULTISAMPLE 7 | 1 101 rs1 A . . . . GT 0/0 ./. 8 | 1 102 rs2 . C . . . GT 1/1 ./. 9 | 1 103 rs3 G T . . . GT 0|0 ./. 10 | 1 104 rs4 C T . . . GT 1/1 ./. 11 | 1 105 rs5 C . . . . GT ./. ./. 12 | 1 106 rs6 G C . . . GT 0/1 ./. 13 | 1 107 rs7 G T,C . . . GT 1/2 ./. 14 | 1 108 rs8 A T . . . GT 0/1 ./. 15 | 1 109 . C T . . . GT 0/1 ./. 16 | 1 110 rs10 A AGC . . . GT 0/1 ./. 17 | 1 111 rs11 AGC A . . . GT 0/1 ./. 18 | 1 112 rs12 . A . . . GT 0/1 ./. 19 | 1 113 rs13 . A . . . GT 1/0 ./. 20 | 1 114 rs14 A . . . . GT 0/1 ./. 21 | 1 115 rs15 A . . . . GT 1/0 ./. 22 | 1 116 rs16 A A . . . GT 0/. ./. 23 | 1 117 rs17 A A . . . GT ./0 ./. 
24 | -------------------------------------------------------------------------------- /src/Snps/IO/ExtraTabsFilter.php: -------------------------------------------------------------------------------- 1 | data = preg_replace('/\t+/', "\t", $bucket->data); 19 | 20 | // Remove trailing tabs at end of lines 21 | $bucket->data = preg_replace('/\t+\n/', "\n", $bucket->data); 22 | $bucket->data = preg_replace('/\t+\r\n/', "\r\n", $bucket->data); 23 | 24 | $consumed += $bucket->datalen; 25 | stream_bucket_append($out, $bucket); 26 | } 27 | 28 | return PSFS_PASS_ON; 29 | } 30 | } -------------------------------------------------------------------------------- /CONTRIBUTE.md: -------------------------------------------------------------------------------- 1 | # Contributing 2 | 3 | Contributions are **welcome** and will be fully **credited**. We accept contributions via Pull Requests on [Github](https://github.com/familytree365/php-dna). 4 | 5 | ## Pull Requests 6 | 7 | - **[PSR-4 Coding Standard.]** The easiest way to apply the conventions is to install [PHP CS Fixer](https://github.com/FriendsOfPHP/PHP-CS-Fixer). 8 | - **Document any change in behaviour.** Make sure the `README.md` and any other relevant documentation are kept up-to-date. 9 | - **Create feature branches.** Don't ask us to pull from your master branch. 10 | - **One pull request per feature.** If you want to do more than one thing, send multiple pull requests. 11 | - **Send coherent history.** Make sure each individual commit in your pull request is meaningful. If you had to make multiple intermediate commits while developing, please [squash them](http://www.git-scm.com/book/en/v2/Git-Tools-Rewriting-History#Changing-Multiple-Commit-Messages) before submitting. 
/**
 * Parse tab-separated SNP text into an SNPs collection.
 *
 * Blank lines, lines starting with '#', and lines with fewer than three
 * tab-separated fields are skipped. The first three fields of every other
 * line are passed to SNPs::addSNP() as chromosome, position and genotype.
 *
 * @param string $rawData Raw file contents; both LF and CRLF line endings are accepted.
 *
 * @return SNPs Collection populated with one entry per accepted line.
 */
public function parseFromText(string $rawData): SNPs
{
    $snps = new SNPs();

    foreach (explode("\n", $rawData) as $line) {
        // Files exported on Windows end lines with "\r\n"; without this the
        // stray "\r" would stay attached to the last field of the line.
        $line = rtrim($line, "\r");

        if ($line === '' || $line[0] === '#') {
            continue;
        }

        $parts = explode("\t", $line);
        if (count($parts) < 3) {
            continue;
        }

        [$chromosome, $position, $genotype] = $parts;
        $snps->addSNP($chromosome, $position, $genotype);
    }

    return $snps;
}
charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 
22 | -------------------------------------------------------------------------------- /tests/input/mapmygenome_alt_header.txt: -------------------------------------------------------------------------------- 1 | SNP.Name Sample.ID Allele1...Top Allele2...Top GC.Score Sample.Name Sample.Group Sample.Index SNP.Index SNP.Aux Allele1...Forward Allele2...Forward Allele1...Design Allele2...Design Allele1...AB Allele2...AB Allele1...Plus Allele2...Plus Chr Position GT.Score Cluster.Sep SNP ILMN.Strand Customer.Strand Top.Genomic.Sequence Plus.Minus.Strand Theta R X Y X.Raw Y.Raw B.Allele.Freq Log.R.Ratio CNV.Value CNV.Confidence 2 | rs1 0 A A 0 NA NA 0 0 0 A A A A A A A A 1 101 0 1 [A/A] TOP TOP NA + 0 0 0 0 0 0 0 0 NA NA 3 | rs2 0 C C 0 NA NA 0 0 0 A A A A A A C C 1 102 0 1 [A/A] TOP TOP NA + 0 0 0 0 0 0 0 0 NA NA 4 | rs3 0 G G 0 NA NA 0 0 0 A A A A A A G G 1 103 0 1 [A/A] TOP TOP NA + 0 0 0 0 0 0 0 0 NA NA 5 | rs4 0 T T 0 NA NA 0 0 0 A A A A A A T T 1 104 0 1 [A/A] TOP TOP NA + 0 0 0 0 0 0 0 0 NA NA 6 | rs5 0 -- -- 0 NA NA 0 0 0 A A A A A A -- -- 1 105 0 1 [A/A] TOP TOP NA + 0 0 0 0 0 0 0 0 NA NA 7 | rs6 0 G C 0 NA NA 0 0 0 A A A A A A G C 1 106 0 1 [A/A] TOP TOP NA + 0 0 0 0 0 0 0 0 NA NA 8 | rs7 0 T C 0 NA NA 0 0 0 A A A A A A T C 1 107 0 1 [A/A] TOP TOP NA + 0 0 0 0 0 0 0 0 NA NA 9 | rs8 0 A T 0 NA NA 0 0 0 A A A A A A A T 1 108 0 1 [A/A] TOP TOP NA + 0 0 0 0 0 0 0 0 NA NA 10 | -------------------------------------------------------------------------------- /tests/input/mapmygenome.txt: -------------------------------------------------------------------------------- 1 | SNP Name rsID Sample.ID Allele1...Top Allele2...Top GC.Score Sample.Name Sample.Group Sample.Index SNP.Index SNP.Aux Allele1...Forward Allele2...Forward Allele1...Design Allele2...Design Allele1...AB Allele2...AB Allele1...Plus Allele2...Plus Chr Position GT.Score Cluster.Sep SNP ILMN.Strand Customer.Strand Top.Genomic.Sequence Plus.Minus.Strand Theta R X Y X.Raw Y.Raw B.Allele.Freq Log.R.Ratio 
/**
 * Holds the output directory, the resources directory and the resource
 * accessor used by the DNA toolkit.
 */
class Dna
{
    /**
     * Directory that receives generated output files.
     */
    protected string $_outputDir;

    /**
     * Directory holding the resource files used for DNA analysis.
     */
    protected string $_resourcesDir;

    /**
     * Resource accessor obtained from Resources::getInstance().
     */
    protected Resources $_resources;

    /**
     * @param string $outputDirectory    Where output files are written (default "output").
     * @param string $resourcesDirectory Where resource files are read from (default "resources").
     */
    public function __construct(
        string $outputDirectory = 'output',
        string $resourcesDirectory = 'resources'
    ) {
        [$this->_outputDir, $this->_resourcesDir] = [$outputDirectory, $resourcesDirectory];
        $this->_resources = Resources::getInstance();
    }
}
/**
 * Issue a GET request and return the JSON response body decoded as an array.
 *
 * @param string $url Address to request.
 *
 * @return array|null Decoded payload, or null when the status code is not 200,
 *                    the body does not decode to an array, or the request
 *                    throws an Exception (which is logged via error_log()).
 */
public function fetchData(string $url): ?array
{
    try {
        $response = $this->client->request('GET', $url);
        if ($response->getStatusCode() !== 200) {
            return null;
        }

        $decoded = json_decode($response->getBody()->getContents(), true);

        // json_decode() returns a scalar for scalar JSON ("42", "\"text\"") and
        // null for malformed input; returning a scalar would violate the ?array
        // return type and raise a TypeError that the catch below does not handle.
        return is_array($decoded) ? $decoded : null;
    } catch (Exception $e) {
        // NOTE(review): `Exception` resolves relative to this file's namespace/imports,
        // which are not visible here — confirm it refers to \Exception.
        error_log("Failed to fetch data from {$url}: " . $e->getMessage());
        return null;
    }
}
/**
 * Select the SNP records whose 'chrom' entry is identical (===) to the given chromosome.
 *
 * @param string $chromosome Chromosome label to match.
 * @return array Matching records, still indexed by their original keys.
 */
public function getSnpsByChromosome(string $chromosome): array
{
    $matches = [];

    foreach ($this->snps as $key => $record) {
        if ($record['chrom'] === $chromosome) {
            $matches[$key] = $record;
        }
    }

    return $matches;
}
-------------------------------------------------------------------------------- 1 | # Sweep AI turns bugs & feature requests into code changes (https://sweep.dev) 2 | # For details on our config file, check out our docs at https://docs.sweep.dev/usage/config 3 | 4 | # This setting contains a list of rules that Sweep will check for. If any of these rules are broken in a new commit, Sweep will create a pull request to fix the broken rule. 5 | rules: 6 | - "All new business logic should have corresponding unit tests." 7 | - "Refactor large functions to be more modular." 8 | - "Add docstrings to all functions and file headers." 9 | 10 | # This is the branch that Sweep will develop from and make pull requests to. Most people use 'main' or 'master' but some users also use 'dev' or 'staging'. 11 | branch: 'main' 12 | 13 | # By default Sweep will read the logs and outputs from your existing Github Actions. To disable this, set this to false. 14 | gha_enabled: True 15 | 16 | # This is the description of your project. It will be used by sweep when creating PRs. You can tell Sweep what's unique about your project, what frameworks you use, or anything else you want. 17 | # 18 | # Example: 19 | # 20 | # description: sweepai/sweep is a python project. The main api endpoints are in sweepai/api.py. Write code that adheres to PEP8. 21 | description: '' 22 | 23 | # This sets whether to create pull requests as drafts. If this is set to True, then all pull requests will be created as drafts and GitHub Actions will not be triggered. 24 | draft: False 25 | 26 | # This is a list of directories that Sweep will not be able to edit.
/**
 * Build a ReferenceSequence from the locally stored resource with the given id.
 *
 * The id is resolved to a path under the local resource directory, the file
 * contents are read, and both are passed to the ReferenceSequence constructor.
 *
 * @param string $id Resource identifier, also used as the file name.
 * @return ReferenceSequence
 */
public function getReferenceSequence(string $id): ReferenceSequence
{
    return new ReferenceSequence(
        $id,
        $this->loadDataFromFile($this->getLocalPathForResource($id))
    );
}
"\n"; 37 | } 38 | /** 39 | * Builds consistent array of Test instances 40 | * 41 | * @return array $arr 42 | */ 43 | public static function build() 44 | { 45 | $arr = []; 46 | $maxNames = count(self::$names); 47 | $pos = 0; 48 | for ($x = 0; $x < self::$max; $x++) { 49 | // note that the ID value == the order assigned 50 | $key = strtoupper(dechex($x)); 51 | $id = sprintf('%04d', $x + 1000); 52 | if ($pos >= $maxNames) $pos = 0; 53 | $name = self::$names[$pos++]; 54 | $test = new self(); 55 | $test->id = $id; 56 | $test->name = $name; 57 | $arr[$key] = $test; 58 | } 59 | return $arr; 60 | } 61 | } -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # php-dna 2 | 3 | ## Running MatchKits from the Command Line 4 | 5 | To run the MatchKits script from the command line, navigate to the root directory of the php-dna project. 6 | 7 | Ensure you have PHP installed on your system. You can check this by running `php -v` in your command line. If PHP is not installed, please install it from the official PHP website. 8 | 9 | Execute the script by running the following command: `php src/MatchKits.php`. 10 | 11 | The script will prompt you to enter the file paths for Kit 1 and Kit 2. Please enter the full path for each file when prompted. 12 | 13 | After entering the file paths, the script will process the data and generate a matched data visualization. The output file named 'matched_data.png' will be saved in the root directory. 14 | 15 | ## Requirements 16 | 17 | * php-dna 1.0+ requires PHP 8.3 (or later). 18 | * php-dna 2.0+ requires PHP 8.4 (or later). 19 | 20 | ## Installation 21 | 22 | There are two ways of installing php-dna. 
spl_autoload_register(function ($class) {
    $pathToDna = __DIR__ . '/library/'; // TODO FIXME

    // Only handle classes in the Dna\ namespace; leave everything else to
    // other autoloaders. (The previous check negated substr() before
    // comparing and used the wrong prefix length, so it never filtered.)
    $class = ltrim($class, '\\');
    if (!str_starts_with($class, 'Dna\\')) {
        return;
    }

    // Map namespace separators to directory separators: Dna\Snps\X -> Dna/Snps/X.php
    $file = $pathToDna . str_replace('\\', DIRECTORY_SEPARATOR, $class) . '.php';
    if (file_exists($file)) {
        require_once $file;
    }
});
/**
 * Infer sex from X-chromosome heterozygosity and Y-chromosome call rate.
 *
 * Returns 'Female' when more than 3% of X SNPs are heterozygous, otherwise
 * 'Male' when more than 30% of Y SNPs have a non-null genotype, otherwise ''.
 * A chromosome with no SNPs is skipped instead of being used as a divisor,
 * so data containing only X or only Y SNPs no longer raises
 * DivisionByZeroError.
 *
 * @param SNPData $snpData SNP collection to inspect.
 * @return string 'Female', 'Male', or '' when undetermined.
 */
public function determineSex(SNPData $snpData): string
{
    $xSnps = $snpData->getSnpsByChromosome('X');
    $ySnps = $snpData->getSnpsByChromosome('Y');

    $xTotal = count($xSnps);
    if ($xTotal > 0 && $this->countHeterozygous($xSnps) / $xTotal > 0.03) {
        return 'Female';
    }

    $yTotal = count($ySnps);
    if ($yTotal > 0 && $this->countNonNull($ySnps) / $yTotal > 0.3) {
        return 'Male';
    }

    return '';
}
/**
 * Field enclosure character handed to fgetcsv().
 * Defaults to a double quote (fgetcsv's own default) so read() works
 * without a prior setEnclosure() call.
 */
private $enclosure = '"';

/**
 * Set the per-column type casts applied by read().
 *
 * @param array $columnTypes Map of column key => type name accepted by settype().
 */
public function setColumnTypes($columnTypes)
{
    $this->columnTypes = $columnTypes;
}

/**
 * Set the field enclosure character used when parsing.
 *
 * @param string $enclosure Single character passed to fgetcsv().
 */
public function setEnclosure($enclosure)
{
    $this->enclosure = $enclosure;
}

/**
 * Read the configured file and return its rows.
 *
 * The first row is skipped when the header flag is set. When column names
 * are configured each row is re-keyed with them; when column types are
 * configured the listed columns are cast with settype().
 *
 * @return array List of rows; empty when the file cannot be opened.
 */
public function read()
{
    $data = [];

    $handle = fopen($this->filePath, "r");
    if ($handle === false) {
        return $data;
    }

    try {
        if ($this->header) {
            // Skip the header row
            fgetcsv($handle, 0, $this->separator, $this->enclosure, '\\');
        }

        // The escape character is passed explicitly; '\\' is the historical default.
        while (($row = fgetcsv($handle, 0, $this->separator, $this->enclosure, '\\')) !== false) {
            if (!empty($this->columnNames)) {
                $row = array_combine($this->columnNames, $row);
            }

            if (!empty($this->columnTypes)) {
                foreach ($this->columnTypes as $column => $type) {
                    settype($row[$column], $type);
                }
            }

            $data[] = $row;
        }
    } finally {
        // Release the handle even if array_combine() throws on a malformed row.
        fclose($handle);
    }

    return $data;
}
/**
 * Throttle outgoing requests.
 *
 * While the counter is below reqs_per_sec it is simply incremented. Once the
 * limit is reached, the method sleeps for whatever remains of one second
 * since last_req, then resets the timestamp and the counter.
 */
private function rateLimit(): void
{
    if ($this->req_count < $this->reqs_per_sec) {
        $this->req_count++;
        return;
    }

    $elapsed = microtime(true) - $this->last_req;
    if ($elapsed < 1) {
        // usleep() takes an int; cast explicitly instead of passing a float.
        usleep((int) ((1 - $elapsed) * 1000000));
    }

    $this->last_req = microtime(true);
    $this->req_count = 0;
}
/**
 * Create ReferenceSequence objects for every chromosome that has a path.
 *
 * $chroms, $urls and $paths are read at the same index; entries whose path
 * is falsy are skipped.
 *
 * @param string $assembly Assembly name passed to each ReferenceSequence.
 * @param array  $chroms   Chromosome identifiers.
 * @param array  $urls     Source URLs.
 * @param array  $paths    Local file paths.
 * @return array ReferenceSequence objects keyed by chromosome identifier.
 */
protected function createReferenceSequences(string $assembly, array $chroms, array $urls, array $paths): array
{
    $sequences = [];

    foreach ($paths as $index => $location) {
        if ($location) {
            $chromosome = $chroms[$index];
            $sequences[$chromosome] = new ReferenceSequence(
                $chromosome,
                $urls[$index],
                realpath($location),
                $assembly,
                "Homo sapiens",
                "x"
            );
        }
    }

    return $sequences;
}
rs25,MT,25,25,B,A,False,True,25,-- 27 | rs26,MT,26,26,B,B,False,False,26,B 28 | rs27,1,27,1,--,--,True,False,27,-- 29 | rs28,1,28,2,--,AA,True,False,28,AA 30 | rs29,1,29,3,AA,--,True,False,29,AA 31 | rs30,1,30,4,AA,AA,True,False,30,AA 32 | rs31,1,31,5,--,--,True,False,31,-- 33 | rs32,1,32,6,--,AB,True,False,32,AB 34 | rs33,1,33,7,AB,--,True,False,33,AB 35 | rs34,1,34,8,AB,AB,True,False,34,AB 36 | rs35,1,35,9,AB,BA,True,False,35,AB 37 | rs36,1,36,10,BA,AB,True,False,36,BA 38 | rs37,1,37,11,BA,BA,True,False,37,BA 39 | rs38,1,38,12,AB,AC,True,True,38,-- 40 | rs39,1,39,13,AB,CA,True,True,39,-- 41 | rs40,1,40,14,BA,AC,True,True,40,-- 42 | rs41,1,41,15,BA,CA,True,True,41,-- 43 | rs42,1,42,16,AB,CD,True,True,42,-- 44 | rs43,1,43,17,AB,DC,True,True,43,-- 45 | rs44,1,44,18,BA,CD,True,True,44,-- 46 | rs45,1,45,19,BA,DC,True,True,45,-- 47 | rs46,MT,46,20,--,--,True,False,46,-- 48 | rs47,MT,47,21,--,A,True,False,47,A 49 | rs48,MT,48,22,A,--,True,False,48,A 50 | rs49,MT,49,23,A,A,True,False,49,A 51 | rs50,MT,50,24,A,B,True,True,50,-- 52 | rs51,MT,51,25,B,A,True,True,51,-- 53 | rs52,MT,52,26,B,B,True,False,52,B 54 | -------------------------------------------------------------------------------- /src/Snps/DatasetDownloader.php: -------------------------------------------------------------------------------- 1 | declare(strict_types=1); 2 | 3 | namespace Dna\Snps; 4 | 5 | use GuzzleHttp\Client; 6 | use GuzzleHttp\Exception\GuzzleException; 7 | use League\Csv\Reader; 8 | 9 | final class DatasetDownloader 10 | { 11 | public function __construct( 12 | private readonly Client $httpClient = new Client(), 13 | private readonly string $cacheDir = __DIR__ . 
/**
 * Download the two OpenSNP example datasets (23andMe and FTDNA Illumina).
 *
 * Uses this class's download_file() helper; the previous camelCase call
 * targeted a method that the class does not define.
 *
 * @return array The two download_file() results, 23andMe first.
 * @throws GuzzleException
 */
public function downloadExampleDatasets(): array
{
    return [
        $this->download_file("https://opensnp.org/data/662.23andme.340", "662.23andme.340.txt.gz"),
        $this->download_file("https://opensnp.org/data/662.ftdna-illumina.341", "662.ftdna-illumina.341.csv.gz"),
    ];
}
exists but no @param, adds them 20 | * If no docBlock, creates one 21 | * 22 | * @return array $methods : method name => docBlock 23 | */ 24 | public function check() 25 | { 26 | // get methods 27 | $methods = []; 28 | $list = $this->reflect->getMethods(); 29 | foreach ($list as $refMeth) { 30 | // get docbock 31 | $docBlock = $refMeth->getDocComment(); 32 | if (!$docBlock) { 33 | $docBlock = "/**\n * " . $refMeth->getName() . "\n"; 34 | // get params 35 | $params = $refMeth->getParameters(); 36 | if ($params) { 37 | foreach ($params as $refParm) { 38 | $type = $refParm->getType() ?? 'mixed'; 39 | $type = (string) $type; 40 | $name = $refParm->getName(); 41 | $default = ''; 42 | if (!$refParm->isVariadic() && $refParm->isOptional()) 43 | $default = $refParm->getDefaultValue(); 44 | if ($default === '') $default = "(empty string)"; 45 | $docBlock .= " * @param $type \${$name} : $default\n"; 46 | } 47 | } 48 | // get return type 49 | if ($refMeth->isConstructor()) 50 | $return = 'void'; 51 | else 52 | $return = $refMeth->getReturnType() ?? 
/**
 * Build a data frame from a CSV file whose first line holds the column names.
 *
 * Every line is parsed with str_getcsv(); each line after the first is
 * combined with the header to form an associative row.
 *
 * @param string $filePath Path of the CSV file.
 * @return self
 */
public static function fromFile(string $filePath): self
{
    $parsedLines = array_map('str_getcsv', file($filePath));
    $header = array_shift($parsedLines);

    $records = [];
    foreach ($parsedLines as $fields) {
        $records[] = array_combine($header, $fields);
    }

    return new self($records);
}
/**
 * Arithmetic mean of the values found under a column.
 *
 * Rows lacking the column are ignored by array_column(). When no value is
 * found at all (empty frame or unknown column) 0.0 is returned instead of
 * dividing by zero.
 *
 * @param string $columnName Column to average.
 * @return float Mean of the column values, or 0.0 when there are none.
 */
public function average(string $columnName): float
{
    $values = array_column($this->data, $columnName);

    if ($values === []) {
        return 0.0;
    }

    return array_sum($values) / count($values);
}
/**
 * Merge the raw data supplied at construction into this object.
 *
 * A non-array value is wrapped in a single-element array; each element is
 * then turned into an SNPs object via createSnpsObject() and merged in.
 */
private function processRawData(): void
{
    $sources = $this->rawData;
    if (!is_array($sources)) {
        $sources = [$sources];
    }

    foreach ($sources as $source) {
        $this->merge([$this->createSnpsObject($source)]);
    }
}
/**
 * A name starting with a digit must come back prefixed with 'var_'.
 */
public function testVarNameWithNumbers(): void
{
    $subject = new Individual('123Test');

    $this->assertEquals('var_123Test', $subject->getVarName());
}
/**
 * An Individual built from two raw SNP records keeps its name, reports
 * itself valid, and counts two SNPs.
 */
public function testIndividualWithRawData(): void
{
    $subject = new Individual('Test Individual', [
        'rs123' => ['rsid' => 'rs123', 'chrom' => '1', 'pos' => 1000, 'genotype' => 'AA'],
        'rs456' => ['rsid' => 'rs456', 'chrom' => '2', 'pos' => 2000, 'genotype' => 'AT'],
    ]);

    $this->assertEquals('Test Individual', $subject->getName());
    $this->assertTrue($subject->isValid());
    $this->assertEquals(2, $subject->count());
}
"/resources" 14 | ) { 15 | if (!is_dir($this->resourcePath)) { 16 | mkdir($this->resourcePath, 0755, true); 17 | } 18 | } 19 | 20 | /** 21 | * @throws Exception 22 | */ 23 | public function getAssemblyMappingData(string $sourceAssembly, string $targetAssembly): string 24 | { 25 | $filename = "assembly_mapping_{$sourceAssembly}_to_{$targetAssembly}.tar.gz"; 26 | $filepath = "{$this->resourcePath}/{$filename}"; 27 | 28 | if (!file_exists($filepath)) { 29 | return $this->downloadMappingData($sourceAssembly, $targetAssembly, $filepath); 30 | } 31 | 32 | return $filepath; 33 | } 34 | 35 | public static function loadAssemblyMappingData(string $filename): array 36 | { 37 | $assemblyMappingData = []; 38 | try { 39 | $tar = new PharData($filename); 40 | foreach ($tar as $file) { 41 | if (strpos($file->getFilename(), '.json') !== false) { 42 | $content = file_get_contents($file->getPathname()); 43 | $data = json_decode($content, true); 44 | if (json_last_error() === JSON_ERROR_NONE) { 45 | $assemblyMappingData[] = $data; 46 | } else { 47 | throw new Exception("Error parsing JSON data."); 48 | } 49 | } 50 | } 51 | } catch (Exception $e) { 52 | throw new Exception("Error loading assembly mapping data: " . 
$e->getMessage()); 53 | } 54 | 55 | return $assemblyMappingData; 56 | } 57 | 58 | /** 59 | * @throws GuzzleException 60 | */ 61 | private function downloadMappingData(string $sourceAssembly, string $targetAssembly, string $filepath): void 62 | { 63 | $url = "http://example.com/assembly_mapping/{$sourceAssembly}/{$targetAssembly}"; 64 | $response = $this->httpClient->get($url); 65 | if ($response->getStatusCode() === 200) { 66 | file_put_contents($filepath, $response->getBody()->getContents()); 67 | } else { 68 | throw new GuzzleException("Failed to download assembly mapping data."); 69 | } 70 | } 71 | } -------------------------------------------------------------------------------- /phpconvcount.py: -------------------------------------------------------------------------------- 1 | import re 2 | import ast 3 | 4 | pycodefile = '../Projects/geneology/snps/tests/test_snps.py' 5 | phpcodefile = 'tests/Snps/SnpsTest.php' 6 | 7 | 8 | def normalize_function_name(name): 9 | # Check if the name is already in camelCase with mixed case 10 | if any(c.islower() and name[i+1:i+2].isupper() for i, c in enumerate(name[:-1])): 11 | return name 12 | # Handle snake_case to camelCase conversion 13 | name_parts = name.split('_') 14 | name = name_parts[0] + ''.join(word.strip().capitalize() for word in name_parts[1:]) 15 | return name 16 | 17 | def get_function_names_in_class(python_code, class_name): 18 | # Parse the Python code using the ast module 19 | parsed_code = ast.parse(python_code) 20 | 21 | # Initialize variables to track function names 22 | function_names = [] 23 | 24 | # Helper function to extract function names from a class node 25 | def extract_function_names(class_node): 26 | names = [] 27 | for node in ast.walk(class_node): 28 | if isinstance(node, ast.FunctionDef): 29 | names.append(node.name) 30 | return names 31 | 32 | # Traverse the parsed code and extract function names within the specified class 33 | for node in ast.walk(parsed_code): 34 | if isinstance(node, 
# Step 1: Read the reference Python test module.
# The encoding is explicit so the result does not depend on the platform default.
with open(pycodefile, 'r', encoding='utf-8') as python_file:
    python_code = python_file.read()

# Step 2: Collect the names of all functions defined inside the TestSnps class.
python_functions = get_function_names_in_class(python_code, "TestSnps")

# Step 3: Normalize the Python names to camelCase and drop duplicates.
normalized_python_functions = list(set(normalize_function_name(func) for func in python_functions))

# Step 4: Read the PHP test file that is being compared against the reference.
with open(phpcodefile, 'r', encoding='utf-8') as php_file:
    php_code = php_file.read()

# Step 5: Extract PHP method names.
# The optional `static` keyword is accepted so that static methods are counted too.
php_functions = re.findall(
    r'(?:public|private|protected)\s+(?:static\s+)?function\s+([a-zA-Z_][a-zA-Z0-9_]*)\s*\(',
    php_code,
)

# Step 6: Normalize the PHP names the same way as the Python names.
normalized_php_functions = [normalize_function_name(func) for func in php_functions]

# Step 7: Compare both sides; sorting makes the report order stable between runs.
missing_functions = sorted(set(normalized_python_functions) - set(normalized_php_functions))
extra_functions = sorted(set(normalized_php_functions) - set(normalized_python_functions))

# Count of functions in Python and PHP
python_function_count = len(normalized_python_functions)
php_function_count = len(normalized_php_functions)

# Print the count of functions
print("Number of Functions in Python:", python_function_count)
print("Number of Functions in PHP:", php_function_count)

# Print the functions present in Python but absent from PHP
print("\nMissing Functions in PHP:")
for func in missing_functions:
    print(func)


print("\nExtra Functions in PHP:")
/**
 * Find the SNPs whose keys are present in every supplied SNP list.
 *
 * The returned entries (keys and values) are taken from the first list, as
 * array_intersect_key() does.
 *
 * @param array $snpsLists Array of SNP lists
 * @return array Common SNPs; empty when no lists are supplied
 */
private static function findCommonSnps(array $snpsLists): array {
    if ($snpsLists === []) {
        // array_intersect_key() requires at least one array argument.
        return [];
    }

    // array_values() discards string keys, which argument unpacking would
    // otherwise pass on as named arguments.
    return array_intersect_key(...array_values($snpsLists));
}
/**
 * Utility helpers for file handling and small common tasks: gzipping and zipping
 * files, creating directories, fetching the current UTC time, saving rows as CSV
 * and cleaning strings.
 */
final class Utils
{
    /**
     * Gzip a file.
     *
     * @param string $src Path to file to gzip
     * @param string $dest Path to output gzip file
     *
     * @return string Path to gzipped file
     * @throws Exception If the source cannot be read, the destination cannot be opened, or the copy fails.
     */
    public static function gzipFile(string $src, string $dest): string
    {
        if (!is_readable($src)) {
            throw new Exception("Cannot read source file: {$src}");
        }

        $srcFile = fopen($src, "rb");
        if ($srcFile === false) {
            throw new Exception("Cannot read source file: {$src}");
        }

        $gzFile = gzopen($dest, "wb9"); // Maximum compression
        if ($gzFile === false) {
            // Release the already-open source handle before reporting the failure.
            fclose($srcFile);
            throw new Exception("Cannot open destination file: {$dest}");
        }

        try {
            if (stream_copy_to_stream($srcFile, $gzFile) === false) {
                throw new Exception("Failed to gzip {$src} to {$dest}");
            }
            return $dest;
        } finally {
            fclose($srcFile);
            gzclose($gzFile);
        }
    }

    /**
     * Creates a directory if it doesn't exist.
     *
     * Nothing is done when anything (file or directory) already exists at the path.
     *
     * @param string $path Path to the directory to create.
     * @return void
     */
    public static function createDir(string $path): void
    {
        if (!file_exists($path)) {
            mkdir($path, 0755, true);
        }
    }

    /**
     * Gets the current UTC time.
     *
     * @return string Current UTC time in 'Y-m-d H:i:s' format.
     */
    public static function getUtcNow(): string
    {
        return (new \DateTimeImmutable('now', new \DateTimeZone('UTC')))
            ->format('Y-m-d H:i:s');
    }

    /**
     * Saves data as a CSV file, one fputcsv() record per element of $data.
     *
     * @param array $data Rows to save; each row is passed to fputcsv() as-is.
     * @param string $filename Path to the CSV file.
     * @return void
     * @throws Exception If the file cannot be opened for writing.
     */
    public static function saveDfAsCsv(array $data, string $filename): void
    {
        $fp = fopen($filename, 'w');
        if ($fp === false) {
            throw new Exception("Cannot open file for writing: {$filename}");
        }

        try {
            foreach ($data as $row) {
                fputcsv($fp, $row);
            }
        } finally {
            // Close the handle even when a row cannot be written.
            fclose($fp);
        }
    }

    /**
     * Cleans a string to be used as a variable name by removing every character
     * other than ASCII letters, digits and underscore.
     *
     * @param string $str String to clean.
     * @return string Cleaned string.
     */
    public static function cleanStr(string $str): string
    {
        // preg_replace() returns null on a PCRE error; fall back to an empty string
        // so the declared string return type is always honoured.
        return preg_replace('/[^A-Za-z0-9_]/', '', $str) ?? '';
    }

    /**
     * Zips a file. The entry is stored under the source file's base name.
     *
     * @param string $src Path to the file to zip.
     * @param string $dest Path to the output zip file.
     * @return void
     * @throws Exception If the source cannot be read or the archive cannot be opened or written.
     */
    public static function zipFile(string $src, string $dest): void
    {
        if (!is_readable($src)) {
            throw new Exception("Cannot read source file: {$src}");
        }

        $zip = new ZipArchive();
        if ($zip->open($dest, ZipArchive::CREATE) !== true) {
            throw new Exception("Cannot open zip archive: {$dest}");
        }

        $added = $zip->addFile($src, basename($src));
        // close() is what actually writes the archive, so its result matters too.
        $closed = $zip->close();

        if (!$added || !$closed) {
            throw new Exception("Failed to add {$src} to zip archive: {$dest}");
        }
    }
}
/**
 * Throttle outgoing requests: sleep for whatever part of the minimum interval
 * between requests (1 / reqsPerSec seconds) has not yet elapsed since the last
 * request, then record the current time as the last request time.
 */
private function rateLimit(): void
{
    $elapsed = microtime(true) - $this->lastRequestTime;
    $remaining = (1.0 / $this->reqsPerSec) - $elapsed;

    if ($remaining > 0) {
        // usleep() takes whole microseconds
        usleep((int)($remaining * 1000000));
    }

    $this->lastRequestTime = microtime(true);
}
class SNPAnalysis {
    /**
     * Calculate the relative frequency of every allele character found in the
     * 'genotype' field of the given rows.
     *
     * Rows whose genotype is missing, not a string, or empty are skipped so that
     * they never contribute an allele.
     *
     * @param DataFrame $snps Rows are read from $snps->data (assumed publicly readable — confirm).
     * @return array Map of allele character => fraction of all counted alleles; empty when nothing was counted.
     */
    public function calculateAlleleFrequencies(DataFrame $snps) {
        $alleleCounts = [];

        foreach ($snps->data as $snp) {
            $genotype = $snp['genotype'] ?? '';
            if (!is_string($genotype) || $genotype === '') {
                continue;
            }

            foreach (str_split($genotype) as $allele) {
                $alleleCounts[$allele] = ($alleleCounts[$allele] ?? 0) + 1;
            }
        }

        $totalAlleles = array_sum($alleleCounts);

        // Build a fresh array instead of rewriting the counts by reference, which
        // would leave the loop variable bound to the last element.
        $alleleFrequencies = [];
        foreach ($alleleCounts as $allele => $count) {
            $alleleFrequencies[$allele] = $count / $totalAlleles;
        }

        return $alleleFrequencies;
    }

    /**
     * Detect the build as the most frequent 'build' value among the given rows.
     *
     * Rows without a 'build' value are ignored. When several builds share the
     * highest count, the one encountered first wins.
     *
     * @param DataFrame $snps Rows are read from $snps->data (assumed publicly readable — confirm).
     * @return int|string|null The most frequent build, or null when no row carries one.
     */
    public function detectSNPBuild(DataFrame $snps) {
        $buildCounts = [];

        foreach ($snps->data as $snp) {
            if (!isset($snp['build'])) {
                continue;
            }
            $build = $snp['build'];
            $buildCounts[$build] = ($buildCounts[$build] ?? 0) + 1;
        }

        if ($buildCounts === []) {
            return null;
        }

        // array_search() returns the first key holding the maximum, i.e. the build
        // that reached that count first in encounter order.
        return array_search(max($buildCounts), $buildCounts, true);
    }
}
/**
 * Check the visualization arguments and reject the first invalid one.
 *
 * Checks run in order: data present, filename present, format supported.
 *
 * @param array $data The SNP data to plot
 * @param string $filename The filename for the generated plot
 * @param string $format The image format for the plot (compared case-insensitively)
 * @throws Exception If input is invalid
 */
private function validateInput(array $data, string $filename, string $format): void
{
    $problem = null;

    if ($data === []) {
        $problem = "No data provided for visualization.";
    } elseif ($filename === '' || $filename === '0') {
        // '0' is rejected as well, mirroring PHP's notion of an "empty" string
        $problem = "Filename is required for visualization output.";
    } elseif (!in_array(strtolower($format), ['png', 'jpeg', 'jpg'], true)) {
        $problem = "Invalid image format. Please use png, jpeg, or jpg.";
    }

    if ($problem !== null) {
        throw new Exception($problem);
    }
}
/**
 * Patch chromosomal features for visualization
 *
 * For every chromosome a background feature spanning 0..(largest SNP position)
 * with stain 'gneg' is emitted first, followed by one single-base 'snp' feature
 * per SNP that carries a position.
 *
 * @param array $data The SNP data, keyed by chromosome
 * @return array Patched chromosomal features, keyed by chromosome
 */
private function patchChromosomalFeatures(array $data): array
{
    $features = [];

    foreach ($data as $chromosome => $snps) {
        // Tolerate groups that are not lists of SNP rows.
        $snps = is_array($snps) ? $snps : [];
        $positions = array_column($snps, 'pos');

        // max() throws on an empty array, so a chromosome without any usable
        // position gets a zero-length background feature instead.
        $chromosomeLength = $positions === [] ? 0 : max($positions);
        $features[$chromosome][] = ['start' => 0, 'end' => $chromosomeLength, 'gie_stain' => 'gneg'];

        foreach ($snps as $snp) {
            if (!is_array($snp) || !isset($snp['pos'])) {
                // No position to plot for this entry.
                continue;
            }

            $features[$chromosome][] = [
                'start' => $snp['pos'],
                'end' => $snp['pos'] + 1,
                'gie_stain' => 'snp'
            ];
        }
    }

    return $features;
}
/**
 * Write the SNPs to disk in the configured format.
 *
 * A filename is generated when none was configured. When there are no SNPs,
 * nothing is written and an empty path with no extra data is returned.
 *
 * @return array [path, extra_data]
 */
public function write(): array
{
    if (empty($this->filename)) {
        $this->filename = $this->generateFilename();
    }

    $records = $this->snps->getSnps();
    if (!$records) {
        return ['', []];
    }

    return $this->vcf
        ? $this->writeVcf($records)
        : $this->writeCsv($records);
}
/**
 * Format a single VCF line
 *
 * This is a simplified encoding that does not consult a reference genome: the
 * first genotype character is used as REF and a differing second character as
 * ALT. The sample's GT field is '0/1' for a heterozygous call, '0/0' for a
 * homozygous call and '0' for a single-allele genotype.
 *
 * @param string $rsid Identifier written to the ID column
 * @param array $snp SNP row; 'chrom', 'pos' and 'genotype' are read
 * @return string|null Tab-separated VCF record, or null when chromosome, position or genotype is missing
 */
private function formatVcfLine(string $rsid, array $snp): ?string
{
    $rawChrom = $snp['chrom'] ?? '';
    $pos = $snp['pos'] ?? '';
    $genotype = $snp['genotype'] ?? '';

    // Test the chromosome before the prefix is prepended: with a non-empty
    // prefix the combined string is never empty, so a missing chromosome would
    // otherwise go undetected.
    if (empty($rawChrom) || empty($pos) || empty($genotype)) {
        return null;
    }

    $chrom = $this->vcfChromPrefix . $rawChrom;
    $genotype = (string) $genotype;

    $ref = $genotype[0];
    $secondAllele = strlen($genotype) > 1 ? $genotype[1] : null;
    $isHeterozygous = $secondAllele !== null && $secondAllele !== $ref;

    // No alternate allele is available when the call is homozygous or has a single allele.
    $alt = $isHeterozygous ? $secondAllele : $this->vcfAltUnavailable;

    // Decide GT from the alleles themselves; comparing REF with the
    // already-substituted ALT placeholder can never detect a homozygous call.
    if ($secondAllele === null) {
        $sample = '0';
    } elseif ($isHeterozygous) {
        $sample = '0/1';
    } else {
        $sample = '0/0';
    }

    $qual = '.';
    $filter = $this->vcfQcFilter ? 'PASS' : '.';
    $info = '.';
    $format = 'GT';

    return implode("\t", [$chrom, $pos, $rsid, $ref, $alt, $qual, $filter, $info, $format, $sample]);
}
/**
 * get_count() reports the number of SNPs on a given chromosome and, when called
 * without an argument, the number of SNPs overall.
 */
public function testChromosomeCounting(): void
{
    $makeSnp = static function (string $rsid, string $chrom, int $pos, string $genotype): array {
        return ['rsid' => $rsid, 'chrom' => $chrom, 'pos' => $pos, 'genotype' => $genotype];
    };

    $this->snps->setSnps([
        'rs1' => $makeSnp('rs1', '1', 1000, 'AA'),
        'rs2' => $makeSnp('rs2', '1', 2000, 'AT'),
        'rs3' => $makeSnp('rs3', 'X', 3000, 'AA'),
    ]);

    // Pairs rather than a map: a '1' array key would be converted to the integer 1.
    foreach ([['1', 2], ['X', 1]] as [$chromosome, $expectedCount]) {
        $this->assertEquals($expectedCount, $this->snps->get_count($chromosome));
    }

    $this->assertEquals(3, $this->snps->get_count());
}
/**
 * getAssembly() reports the assembly name that corresponds to the build set
 * through setBuild().
 */
public function testGetAssembly(): void
{
    $expectedAssemblies = [
        37 => 'GRCh37',
        38 => 'GRCh38',
        36 => 'NCBI36',
    ];

    foreach ($expectedAssemblies as $build => $assembly) {
        $this->snps->setBuild($build);
        $this->assertEquals($assembly, $this->snps->getAssembly());
    }
}
/**
 * Match the loaded DNA kits
 *
 * Compares the first two loaded kits and stores, in matchedData, every SNP of
 * the first kit for which the second kit holds a SNP with an identical position
 * and genotype. A first-kit SNP is stored once per matching second-kit SNP.
 *
 * @throws Exception If less than two kits are loaded
 */
public function matchKits(): void
{
    if (count($this->kitsData) < 2) {
        throw new Exception("At least two DNA kits are required for matching.");
    }

    $this->matchedData = []; // Reset matched data

    try {
        // Index the second kit once so each first-kit SNP needs a single lookup
        // instead of a scan over the whole second kit. serialize() keeps the value
        // types in the key, so 1000 and '1000' stay distinct as with ===.
        $occurrences = [];
        foreach ($this->kitsData[1]->getSnps() as $snp2) {
            $key = serialize([$snp2['pos'], $snp2['genotype']]);
            $occurrences[$key] = ($occurrences[$key] ?? 0) + 1;
        }

        foreach ($this->kitsData[0]->getSnps() as $snp1) {
            $key = serialize([$snp1['pos'], $snp1['genotype']]);
            $matches = $occurrences[$key] ?? 0;

            for ($i = 0; $i < $matches; $i++) {
                $this->matchedData[] = $snp1;
            }
        }
    } catch (Exception $e) {
        // Keep the original exception attached so its trace is not lost.
        throw new Exception("Error matching DNA kits: " . $e->getMessage(), 0, $e);
    }
}
// Interactive command-line entry point.
// It runs only when this file is the script being executed, so that merely
// including the file from another CLI process does not start prompting for input.
if (
    PHP_SAPI === 'cli'
    && isset($_SERVER['SCRIPT_FILENAME'])
    && realpath($_SERVER['SCRIPT_FILENAME']) === __FILE__
) {
    try {
        // Print a prompt and return the trimmed answer; fail on end of input
        // instead of continuing with empty values.
        $ask = static function (string $prompt): string {
            echo $prompt;
            $line = fgets(STDIN);
            if ($line === false) {
                throw new Exception("Unexpected end of input.");
            }
            return trim($line);
        };

        $visualization = new Visualization();
        $triangulation = new Triangulation();
        $matchKits = new MatchKits($visualization, $triangulation);

        $numKits = intval($ask("Enter the number of kits to compare (2 for matching, 3 or more for triangulation): "));

        if ($numKits < 2) {
            throw new Exception("At least two kits are required for comparison.");
        }

        $kitPaths = [];
        for ($i = 0; $i < $numKits; $i++) {
            $path = $ask("Enter file path for Kit " . ($i + 1) . ": ");
            if (!file_exists($path)) {
                throw new Exception("File not found: $path");
            }
            $kitPaths[] = $path;
        }

        $kitsData = array_map(function($path) {
            return new SNPs($path);
        }, $kitPaths);

        $matchKits->setKitsData($kitsData);

        // Announce the step before running it so the message reflects what is happening.
        if ($numKits === 2) {
            echo "Matching kits...\n";
            $matchKits->matchKits();
        } else {
            echo "Triangulating kits...\n";
            $matchKits->triangulateKits();
        }

        $filename = $ask("Enter filename for the visualization (e.g., matched_data.png): ");
        $title = $ask("Enter title for the plot: ");
        $build = $ask("Enter genome build version (e.g., GRCh37): ");
        $format = strtolower($ask("Enter image format (png, jpeg, or jpg): "));

        if (!in_array($format, ['png', 'jpeg', 'jpg'], true)) {
            throw new Exception("Invalid image format. Please use png, jpeg, or jpg.");
        }

        $matchKits->visualizeMatchedData($filename, $title, $build, $format);

        echo "Matched data visualization has been generated: $filename\n";
    } catch (Exception $e) {
        echo "Error: " . $e->getMessage() . "\n";
        exit(1);
    }
}
"/dbsnp_151_37_reverse.txt.gz"
        );
    }

    /**
     * Counterpart of setupGsaTest(): builds a Resources object rooted at the
     * default "resources" directory and re-runs init_resource_attributes() on it.
     */
    public static function teardownGsaTest()
    {
        $r = new Resources(
            resources_dir: "resources",
        );
        $r->init_resource_attributes();
    }

    public function testRead23AndMe()
    {
        $this->run_parse_tests("tests/input/23andme.txt", "23andMe");
    }

    public function testRead23AndMeAllele()
    {
        $this->run_parse_tests("tests/input/23andme_allele.txt", "23andMe");
    }

    public function testRead23AndMeWin()
    {
        $this->run_parse_tests("tests/input/23andme_win.txt", "23andMe");
    }

    /**
     * Run a parsing test against a copy of $file that carries a build comment.
     *
     * The copy is identical to $file except that one extra line,
     * sprintf($comment_str, $build_str), is inserted directly after the line
     * at 0-based index $insertion_line. The copy is written to a temporary
     * file which is removed again once the parsing test has finished,
     * whether it passed or not.
     *
     * @param callable $run_parsing_tests_func Invoked as
     *        ($path, $source, $build_int, true); the meaning of the final
     *        flag is defined by the callee (presumably "build detected" -
     *        confirm against run_parse_tests)
     * @param string $build_str Text substituted into $comment_str
     * @param int $build_int Build number passed to the parsing test
     * @param string $file Input file to copy
     * @param string $source Source name passed to the parsing test
     * @param string $comment_str sprintf() template for the inserted line
     * @param int $insertion_line 0-based index of the line after which the
     *        comment is inserted
     * @throws \RuntimeException If $file cannot be read or the temporary
     *         file cannot be created
     */
    protected function run_build_detection_test(
        $run_parsing_tests_func,
        $build_str,
        $build_int,
        $file="tests/input/testvcf.vcf",
        $source="vcf",
        $comment_str="##%s\n",
        $insertion_line=1
    ) {
        $lines = file($file);
        if ($lines === false) {
            throw new \RuntimeException("Unable to read input file: " . $file);
        }

        $s = "";
        foreach ($lines as $i => $line) {
            $s .= $line;
            if ($i == $insertion_line) {
                $s .= sprintf($comment_str, $build_str);
            }
        }

        $file_build_comment = tempnam(sys_get_temp_dir(), basename($file));
        if ($file_build_comment === false) {
            throw new \RuntimeException("Unable to create a temporary file for: " . $file);
        }

        try {
            file_put_contents($file_build_comment, $s);

            call_user_func(
                $run_parsing_tests_func,
                $file_build_comment,
                $source,
                $build_int,
                true
            );
        } finally {
            // tempnam() creates the file on disk; without this every test run
            // would leave another copy behind in the system temp directory.
            if (file_exists($file_build_comment)) {
                unlink($file_build_comment);
            }
        }
    }

    public function testRead23AndMeBuild36()
    {
        $this->run_build_detection_test(
            array($this, 'run_parse_tests'),
            "build 36",
            36,
            "tests/input/23andme.txt",
            "23andMe",
            "# %s\n"
        );
    }


    public function testRead23AndMeBuild37()
    {
        $this->run_build_detection_test(
            array($this, 'run_parse_tests'),
            "build 37",
            37,
            "tests/input/23andme.txt",
            "23andMe",
            "# %s\n"
        );
    }

    public function testRead23AndMeBuild38()
    {
        $this->run_build_detection_test(
            array($this, 'run_parse_tests'),
            "build 38",
            38,
            "tests/input/23andme.txt",
"23andMe", 119 | "# %s\n" 120 | ); 121 | } 122 | 123 | public function testReadAncestry() 124 | { 125 | // https://www.ancestry.com 126 | $this->run_parse_tests("tests/input/ancestry.txt", "AncestryDNA"); 127 | } 128 | 129 | public function testReadAncestryExtraTab() 130 | { 131 | $total_snps = 100; 132 | $s = "#AncestryDNA\r\n"; 133 | $s .= "rsid\tchromosome\tposition\tallele1\tallele2\r\n"; 134 | // add extra tab separator in first line 135 | $s .= "rs1\t1\t101\t\tA\tA\r\n"; 136 | // generate remainder of lines 137 | for ($i = 1; $i < $total_snps; $i++) { 138 | $s .= "rs" . (1 + $i) . "\t1\t" . (101 + $i) . "\tA\tA\r\n"; 139 | } 140 | 141 | $snps_df = $this->create_snp_df( 142 | array_map(function ($i) { 143 | return "rs" . (1 + $i); 144 | }, range(0, $total_snps - 1)), 145 | "1", 146 | array_map(function ($i) { 147 | return 101 + $i; 148 | }, range(0, $total_snps - 1)), 149 | "AA" 150 | ); 151 | 152 | // echo "snps_df: \n"; 153 | // print_r($snps_df); 154 | 155 | $path = tempnam(sys_get_temp_dir(), "ancestry_extra_tab.txt"); 156 | file_put_contents($path, $s); 157 | 158 | $this->run_parse_tests($path, "AncestryDNA", snps_df: $snps_df); 159 | } 160 | 161 | public function testReadAncestryMultiSep() 162 | { 163 | // https://www.ancestry.com 164 | $this->run_parse_tests("tests/input/ancestry_multi_sep.txt", "AncestryDNA"); 165 | } 166 | 167 | public function testReadCodigo46() 168 | { 169 | // https://codigo46.com.mx 170 | static::setupGsaTest(sys_get_temp_dir()); 171 | // $this->run_parse_tests("tests/input/codigo46.txt", "Codigo46"); 172 | static::teardownGsaTest(); 173 | } 174 | 175 | // def test_read_tellmeGen(self): 176 | // # https://www.tellmegen.com/ 177 | // with tempfile.TemporaryDirectory() as tmpdir: 178 | // self._setup_gsa_test(tmpdir) 179 | // self.run_parsing_tests("tests/input/tellmeGen.txt", "tellmeGen") 180 | // self._teardown_gsa_test() 181 | 182 | // def test_read_DNALand(self): 183 | // # https://dna.land/ 184 | // 
self.run_parsing_tests("tests/input/DNALand.txt", "DNA.Land") 185 | 186 | // def test_read_circledna(self): 187 | // # https://circledna.com/ 188 | // df = self.generic_snps() 189 | // df.drop("rs5", inplace=True) # only called genotypes 190 | // self.run_parsing_tests("tests/input/circledna.txt", "CircleDNA", snps_df=df) 191 | 192 | // def test_read_ftdna(self): 193 | // # https://www.familytreedna.com 194 | // self.run_parsing_tests("tests/input/ftdna.csv", "FTDNA") 195 | 196 | // def test_read_ftdna_concat_gzip_extra_data(self): 197 | // # https://www.familytreedna.com 198 | 199 | // total_snps1 = 10 200 | // total_snps2 = 10 201 | // # generate content of first file 202 | // s1 = "RSID,CHROMOSOME,POSITION,RESULT\r\n" 203 | // for i in range(0, total_snps1): 204 | // s1 += f'"rs{1 + i}","1","{101 + i}","AA"\r\n' 205 | 206 | // # generate content of second file 207 | // s2 = "RSID,CHROMOSOME,POSITION,RESULT\r\n" 208 | // for i in range(0, total_snps2): 209 | // s2 += f'"rs{total_snps1 + 1 + i}","1","{ total_snps1 + 101 + i}","AA"\r\n' 210 | // snps_df = self.create_snp_df( 211 | // rsid=[f"rs{1 + i}" for i in range(0, total_snps1 + total_snps2)], 212 | // chrom="1", 213 | // pos=[101 + i for i in range(0, total_snps1 + total_snps2)], 214 | // genotype="AA", 215 | // ) 216 | 217 | // with tempfile.TemporaryDirectory() as tmpdir: 218 | // file1 = os.path.join(tmpdir, "ftdna_concat_gzip1.csv") 219 | // file1_gz = f"{file1}.gz" 220 | // file2 = os.path.join(tmpdir, "ftdna_concat_gzip2.csv") 221 | // file2_gz = f"{file2}.gz" 222 | // path = os.path.join(tmpdir, "ftdna_concat_gzip.csv.gz") 223 | 224 | // # write individual files 225 | // with open(file1, "w") as f: 226 | // f.write(s1) 227 | // with open(file2, "w") as f: 228 | // f.write(s2) 229 | 230 | // # compress files 231 | // gzip_file(file1, file1_gz) 232 | // gzip_file(file2, file2_gz) 233 | 234 | // # concatenate gzips 235 | // with open(file1_gz, "rb") as f: 236 | // data = f.read() 237 | // with 
open(file2_gz, "rb") as f: 238 | // data += f.read() 239 | 240 | // # add extra data 241 | // data += b"extra data" 242 | 243 | // # write file with concatenated gzips and extra data 244 | // with open(path, "wb") as f: 245 | // f.write(data) 246 | 247 | // self.make_parsing_assertions( 248 | // self.parse_file(path), "FTDNA", False, 37, False, snps_df 249 | // ) 250 | // self.make_parsing_assertions( 251 | // self.parse_bytes(path), "FTDNA", False, 37, False, snps_df 252 | // ) 253 | 254 | // def test_read_ftdna_famfinder(self): 255 | // # https://www.familytreedna.com 256 | // self.run_parsing_tests("tests/input/ftdna_famfinder.csv", "FTDNA") 257 | 258 | // def test_read_ftdna_second_header(self): 259 | // # https://www.familytreedna.com 260 | 261 | // total_snps1 = 100 262 | // total_snps2 = 10 263 | // s = "RSID,CHROMOSOME,POSITION,RESULT\n" 264 | // # generate first chunk of lines 265 | // for i in range(0, total_snps1): 266 | // s += f'"rs{1 + i}","1","{101 + i}","AA"\n' 267 | // # add second header 268 | // s += "RSID,CHROMOSOME,POSITION,RESULT\n" 269 | // # generate second chunk of lines 270 | // for i in range(0, total_snps2): 271 | // s += f'"rs{total_snps1 + 1 + i}","1","{total_snps1 + 101 + i}","AA"\n' 272 | 273 | // snps_df = self.create_snp_df( 274 | // rsid=[f"rs{1 + i}" for i in range(0, total_snps1 + total_snps2)], 275 | // chrom="1", 276 | // pos=[101 + i for i in range(0, total_snps1 + total_snps2)], 277 | // genotype="AA", 278 | // ) 279 | 280 | // with tempfile.TemporaryDirectory() as tmpdir: 281 | // path = os.path.join(tmpdir, "ftdna_second_header.txt") 282 | // with open(path, "w") as f: 283 | // f.write(s) 284 | 285 | // self.run_parsing_tests(path, "FTDNA", snps_df=snps_df) 286 | 287 | 288 | 289 | public function testReadGenericCsv() 290 | { 291 | $this->run_parse_tests("tests/input/generic.csv", "generic"); 292 | } 293 | 294 | public function testReadGenericTsv() 295 | { 296 | $this->run_parse_tests("tests/input/generic.tsv", "generic"); 
297 | } 298 | 299 | public function testReadGenericExtraColumnTsv() 300 | { 301 | $this->run_parse_tests("tests/input/generic_extra_column.tsv", "generic"); 302 | } 303 | 304 | public function testReadGenericHeaderComment() 305 | { 306 | $this->run_parse_tests("tests/input/generic_header_comment.tsv", "generic"); 307 | } 308 | 309 | public function testReadGenericMultiRsidTsv() 310 | { 311 | $this->run_parse_tests("tests/input/generic_multi_rsid.tsv", "generic"); 312 | } 313 | 314 | public function testReadGenericNoHeader() 315 | { 316 | $this->run_parse_tests("tests/input/generic_no_header.tsv", "generic"); 317 | } 318 | 319 | public function testReadGenericNonStandardColumns() 320 | { 321 | $this->run_parse_tests( 322 | "tests/input/generic_non_standard_columns.tsv", 323 | "generic" 324 | ); 325 | } 326 | 327 | } -------------------------------------------------------------------------------- /tests/Snps/IO/WriterTes.php: -------------------------------------------------------------------------------- 1 | _reference_sequences["GRCh37"] = []; 27 | 28 | $output = $tmpdir1 . '/' . $outputFile; 29 | $tmpdir2 = sys_get_temp_dir() . '/' . uniqid(); 30 | mkdir($tmpdir2); 31 | 32 | $dest = $tmpdir2 . '/generic.fa.gz'; 33 | gzip_file("tests/input/generic.fa", $dest); 34 | 35 | $seq = new ReferenceSequence( 36 | "1", 37 | "", 38 | $dest 39 | ); 40 | 41 | $r->_reference_sequences["GRCh37"]["1"] = $seq; 42 | 43 | if (!$filename) { 44 | $result = $s->{$funcStr}($kwargs); 45 | } else { 46 | $result = $s->{$funcStr}($filename, $kwargs); 47 | } 48 | 49 | $this->assertSame($result, $output); 50 | 51 | if ($expectedOutput) { 52 | // Read result 53 | $actual = file_get_contents($output); 54 | 55 | // Read expected result 56 | $expected = file_get_contents($expectedOutput); 57 | 58 | $this->assertStringContainsString($expected, $actual); 59 | } 60 | 61 | 62 | $this->runParsingTestsVcf($output); 63 | } else { 64 | $tmpdir = sys_get_temp_dir() . '/' . 
uniqid(); 65 | mkdir($tmpdir); 66 | 67 | $snps = new SNPs("tests/input/generic.csv", output_dir: $tmpdir); 68 | $dest = $tmpdir . '/' . $outputFile; 69 | 70 | if (!$filename) { 71 | $this->assertSame($snps->{$funcStr}(), $dest); 72 | } else { 73 | $this->assertSame($snps->{$funcStr}($filename), $dest); 74 | } 75 | 76 | 77 | $this->run_parse_tests($dest, "generic"); 78 | } 79 | } 80 | 81 | public function testToCsv() 82 | { 83 | $this->runWriterTest("to_csv", outputFile: "generic_GRCh37.csv"); 84 | } 85 | 86 | public function testToCsvFilename() 87 | { 88 | $this->runWriterTest("to_csv", filename: "generic.csv", outputFile: "generic.csv"); 89 | } 90 | 91 | public function testToTsv() 92 | { 93 | $this->runWriterTest("to_tsv", outputFile: "generic_GRCh37.txt"); 94 | } 95 | 96 | public function testToTsvFilename() 97 | { 98 | $this->runWriterTest("to_tsv", filename: "generic.txt", outputFile: "generic.txt"); 99 | } 100 | 101 | public function testToVcf() 102 | { 103 | $this->runWriterTest( 104 | "to_vcf", 105 | outputFile: "vcf_GRCh37.vcf", 106 | expectedOutput: "tests/output/vcf_generic.vcf" 107 | ); 108 | } 109 | 110 | public function testToVcfFilename() 111 | { 112 | $this->runWriterTest("to_vcf", filename: "vcf.vcf", outputFile: "vcf.vcf"); 113 | } 114 | 115 | public function testToVcfChromPrefix() 116 | { 117 | $this->runWriterTest( 118 | "to_vcf", 119 | outputFile: "vcf_GRCh37.vcf", 120 | expectedOutput: "tests/output/vcf_chrom_prefix_chr.vcf", 121 | kwargs: ["chrom_prefix" => "chr"] 122 | ); 123 | } 124 | 125 | public function testSaveSnpsFalsePositiveBuild() 126 | { 127 | // Create a temporary directory 128 | $tmpdir = tempnam(sys_get_temp_dir(), 'tmp'); 129 | unlink($tmpdir); 130 | mkdir($tmpdir); 131 | 132 | // Instantiate SNPs with input file and output directory 133 | $snps = new SNPs("tests/input/generic.csv", ["output_dir" => $tmpdir]); 134 | 135 | // Define output file path 136 | $output = $tmpdir . 
"/generic_GRCh37.txt"; 137 | 138 | // Save SNPs to TSV 139 | $this->assertEquals($output, $snps->toTsv()); 140 | 141 | // Modify the output file to add version information 142 | $s = ""; 143 | $lines = file($output); 144 | foreach ($lines as $line) { 145 | if (strpos($line, "snps v") !== false) { 146 | $s .= "# Generated by snps v1.2.3.post85.dev0+gb386302, https://pypi.org/project/snps/\n"; 147 | } else { 148 | $s .= $line; 149 | } 150 | } 151 | 152 | file_put_contents($output, $s); 153 | 154 | // Run parsing tests on the modified output 155 | $this->runParsingTests($output, "generic"); 156 | 157 | // Clean up 158 | unlink($output); 159 | rmdir($tmpdir); 160 | } 161 | 162 | public function testSaveSnpsVcfFalsePositiveBuild() 163 | { 164 | $tmpdir1 = sys_get_temp_dir() . '/tmpdir1'; 165 | mkdir($tmpdir1); 166 | 167 | // Instantiate SNPs with input file and output directory 168 | $snps = new SNPs("tests/input/testvcf.vcf", ["output_dir" => $tmpdir1]); 169 | 170 | $r = new Resources(); 171 | $r->_reference_sequences["GRCh37"] = []; 172 | 173 | $output = $tmpdir1 . "/vcf_GRCh37.vcf"; 174 | $tmpdir2 = sys_get_temp_dir() . '/tmpdir2'; 175 | mkdir($tmpdir2); 176 | 177 | $dest = $tmpdir2 . "/generic.fa.gz"; 178 | gzip_file("tests/input/generic.fa", $dest); 179 | 180 | $seq = new ReferenceSequence(["ID" => "1", "path" => $dest]); 181 | 182 | $r->_reference_sequences["GRCh37"]["1"] = $seq; 183 | 184 | $this->assertEquals($output, $snps->toVcf()); 185 | 186 | $s = ""; 187 | $lines = file($output); 188 | foreach ($lines as $line) { 189 | if (strpos($line, "snps v") !== false) { 190 | $s .= '##source="vcf; snps v1.2.3.post85.dev0+gb386302; https://pypi.org/project/snps/"' . 
"\n"; 191 | } else { 192 | $s .= $line; 193 | } 194 | } 195 | 196 | file_put_contents($output, $s); 197 | 198 | $this->runParsingTestsVcf($output); 199 | 200 | // Clean up 201 | unlink($output); 202 | rmdir($tmpdir1); 203 | unlink($dest); 204 | rmdir($tmpdir2); 205 | } 206 | 207 | 208 | public function testSaveSnpsVcfPhased() 209 | { 210 | $tmpdir1 = sys_get_temp_dir() . '/tmpdir1'; 211 | mkdir($tmpdir1); 212 | 213 | // Instantiate SNPs with input phased VCF file and output directory 214 | $s = new SNPs("tests/input/testvcf_phased.vcf", ["output_dir" => $tmpdir1]); 215 | 216 | // Setup resource to use test FASTA reference sequence 217 | $r = new Resources(); 218 | $r->_reference_sequences["GRCh37"] = []; 219 | 220 | $output = $tmpdir1 . "/vcf_GRCh37.vcf"; 221 | $tmpdir2 = sys_get_temp_dir() . '/tmpdir2'; 222 | mkdir($tmpdir2); 223 | 224 | $dest = $tmpdir2 . "/generic.fa.gz"; 225 | gzip_file("tests/input/generic.fa", $dest); 226 | 227 | $seq = new ReferenceSequence(["ID" => "1", "path" => $dest]); 228 | 229 | $r->_reference_sequences["GRCh37"]["1"] = $seq; 230 | 231 | // Save phased data to VCF 232 | $this->assertEquals($output, $s->toVcf()); 233 | 234 | // Read saved VCF with phased data 235 | $this->runParsingTestsVcf($output, true); 236 | 237 | // Clean up 238 | unlink($output); 239 | rmdir($tmpdir1); 240 | unlink($dest); 241 | rmdir($tmpdir2); 242 | } 243 | 244 | 245 | public function testSaveSnpsPhased() 246 | { 247 | $tmpdir = sys_get_temp_dir() . '/tmpdir'; 248 | mkdir($tmpdir); 249 | 250 | // Instantiate SNPs with input phased VCF file and output directory 251 | $s = new SNPs("tests/input/testvcf_phased.vcf", ["output_dir" => $tmpdir]); 252 | 253 | $dest = $tmpdir . 
"/vcf_GRCh37.txt"; 254 | 255 | // Save phased data to TSV 256 | $this->assertEquals($dest, $s->toTsv()); 257 | 258 | // Read saved TSV with phased data 259 | $this->runParsingTestsVcf($dest, true); 260 | 261 | // Clean up 262 | unlink($dest); 263 | rmdir($tmpdir); 264 | } 265 | 266 | 267 | public function runVcfQcTest($expectedOutput, $vcfQcOnly, $vcfQcFilter, $cluster = "c1") 268 | { 269 | $tmpdir1 = sys_get_temp_dir() . '/tmpdir1'; 270 | mkdir($tmpdir1); 271 | 272 | // Instantiate SNPs with input CSV file and output directory 273 | $s = new SNPs("tests/input/generic.csv", ["output_dir" => $tmpdir1]); 274 | 275 | // Setup resource to use test FASTA reference sequence 276 | $r = new Resources(); 277 | $r->setReferenceSequences(["GRCh37" => []]); 278 | 279 | $output = $tmpdir1 . "/generic_GRCh37.vcf"; 280 | 281 | $tmpdir2 = sys_get_temp_dir() . '/tmpdir2'; 282 | mkdir($tmpdir2); 283 | $dest = $tmpdir2 . "/generic.fa.gz"; 284 | gzipFile("tests/input/generic.fa", $dest); 285 | 286 | $seq = new ReferenceSequence(ID: "1", path: $dest); 287 | $r->getReferenceSequences("GRCh37")["1"] = $seq; 288 | 289 | // Save data to VCF with quality control settings 290 | $options = ["qc_only" => $vcfQcOnly, "qc_filter" => $vcfQcFilter]; 291 | $this->assertEquals($output, $s->toVcf($options)); 292 | 293 | // Read result 294 | $actual = file_get_contents($output); 295 | 296 | // Read expected result 297 | $expected = file_get_contents($expectedOutput); 298 | 299 | $this->assertStringContainsString($expected, $actual); 300 | 301 | if (!$vcfQcFilter || !$cluster) { 302 | $this->assertStringNotContainsString("##FILTER=runVcfQcTest( 315 | "tests/output/vcf_qc/qc_only_F_qc_filter_F.vcf", 316 | false, 317 | false 318 | ); 319 | } 320 | 321 | public function testSaveVcfQcOnlyFalseQcFilterTrue() 322 | { 323 | $this->runVcfQcTest( 324 | "tests/output/vcf_qc/qc_only_F_qc_filter_T.vcf", 325 | false, 326 | true 327 | ); 328 | } 329 | 330 | public function testSaveVcfQcOnlyTrueQcFilterFalse() 331 | 
{ 332 | $this->runVcfQcTest( 333 | "tests/output/vcf_qc/qc_only_T_qc_filter_F.vcf", 334 | true, 335 | false 336 | ); 337 | } 338 | 339 | public function testSaveVcfQcOnlyTrueQcFilterTrue() 340 | { 341 | $this->runVcfQcTest( 342 | "tests/output/vcf_qc/qc_only_T_qc_filter_T.vcf", 343 | true, 344 | true 345 | ); 346 | } 347 | 348 | public function testSaveVcfNoClusterQcOnlyFalseQcFilterFalse() 349 | { 350 | $this->runVcfQcTest( 351 | "tests/output/vcf_qc/qc_only_F_qc_filter_F.vcf", 352 | false, 353 | false, 354 | "" 355 | ); 356 | } 357 | 358 | public function testSaveVcfNoClusterQcOnlyFalseQcFilterTrue() 359 | { 360 | $this->runVcfQcTest( 361 | "tests/output/vcf_qc/qc_only_F_qc_filter_T.vcf", 362 | false, 363 | true, 364 | "" 365 | ); 366 | } 367 | 368 | public function testSaveVcfNoClusterQcOnlyTrueQcFilterFalse() 369 | { 370 | $this->runVcfQcTest( 371 | "tests/output/vcf_qc/qc_only_T_qc_filter_F.vcf", 372 | true, 373 | false, 374 | "" 375 | ); 376 | } 377 | 378 | public function testSaveVcfNoClusterQcOnlyTrueQcFilterTrue() 379 | { 380 | $this->runVcfQcTest( 381 | "tests/output/vcf_qc/qc_only_T_qc_filter_T.vcf", 382 | true, 383 | true, 384 | "" 385 | ); 386 | } 387 | } 388 | -------------------------------------------------------------------------------- /src/Resources.php: -------------------------------------------------------------------------------- 1 | 9 | * @copyright Copyright (c) 2020-2023, Liberu Software Ltd 10 | * @license MIT 11 | * 12 | * @link http://github.com/laravel-liberu/php-dna 13 | */ 14 | 15 | namespace Dna; 16 | 17 | use Exception; 18 | 19 | /** 20 | * Class Resources. 
21 | */ 22 | class Resources extends \Dna\Snps\SNPsResources { 23 | 24 | protected $_genetic_map = '{}'; 25 | protected $_genetic_map_name = ''; 26 | protected $_cytoBand_hg19 = []; 27 | protected $_knownGene_hg19 = []; 28 | protected $_kgXref_hg19 = []; 29 | 30 | public function __construct($resources_dir = 'resources') 31 | { 32 | parent::__construct($resources_dir = $resources_dir); 33 | } 34 | 35 | 36 | // { 37 | // // Check if the current genetic map is already HapMap2 38 | // if ($this->_genetic_map_name !== "HapMap2") { 39 | // // If not already HapMap2, load the HapMap2 genetic map and set it as the current genetic map 40 | // $this->_genetic_map = $this->_load_genetic_map_HapMapII_GRCh37( 41 | // $this->_get_path_genetic_map_HapMapII_GRCh37() 42 | // ); 43 | // $this->_genetic_map_name = "HapMap2"; 44 | // } 45 | 46 | // // Return the HapMap2 genetic map in GRCh37 format 47 | // return $this->_genetic_map; 48 | // } 49 | 50 | // /** 51 | // * Returns the genetic map for a given population in the 1000 Genomes Project GRCh37 reference genome. 52 | // * 53 | // * @param string $pop The population code (e.g. "CEU", "YRI", "CHB") for which to retrieve the genetic map. 54 | // * @return array The genetic map for the specified population. 55 | // */ 56 | // public function get_genetic_map_1000G_GRCh37(string $pop): array 57 | // { 58 | // // Check if the requested genetic map is already loaded 59 | // if ($this->_genetic_map_name !== $pop) { 60 | // // If not, load the genetic map from file 61 | // $this->_genetic_map = $this->_load_genetic_map_1000G_GRCh37( 62 | // $this->_get_path_genetic_map_1000G_GRCh37(pop: $pop) 63 | // ); 64 | // // Update the name of the loaded genetic map 65 | // $this->_genetic_map_name = $pop; 66 | // } 67 | 68 | // // Return the loaded genetic map 69 | // return $this->_genetic_map; 70 | // } 71 | 72 | /** 73 | * Returns the cytogenetic banding information for the hg19 reference genome. 
74 | * 75 | * @return array The cytogenetic banding information for hg19. 76 | */ 77 | public function getCytoBandHg19(): array 78 | { 79 | // Check if the cytogenetic banding information for hg19 is already loaded 80 | if (empty($this->_cytoBand_hg19)) { 81 | // If not, load the cytogenetic banding information from file 82 | $this->_cytoBand_hg19 = $this->loadCytoBand($this->getPathCytoBandHg19()); 83 | } 84 | 85 | return $this->_cytoBand_hg19; 86 | } 87 | 88 | // /** 89 | // * Returns the knownGene_hg19 data. 90 | // * 91 | // * @return array The knownGene_hg19 data. 92 | // */ 93 | // public function get_knownGene_hg19() { 94 | // // Check if the _knownGene_hg19 property is empty. 95 | // if ($this->_knownGene_hg19->empty()) { 96 | // // If it is empty, load the knownGene_hg19 data from the file path. 97 | // $this->_knownGene_hg19 = $this->_load_knownGene( 98 | // $this->_get_path_knownGene_hg19() 99 | // ); 100 | // } 101 | // // Return the knownGene_hg19 data. 102 | // return $this->_knownGene_hg19; 103 | // } 104 | 105 | /** 106 | * Returns the kgXref data for the hg19 reference genome. 107 | * 108 | * @return array The kgXref data for hg19. 109 | */ 110 | public function getKgXrefHg19(): array 111 | { 112 | // Check if the _kgXref_hg19 property is empty. 113 | if (empty($this->_kgXref_hg19)) { 114 | // If it is empty, load the kgXref_hg19 data from the file path. 
$this->_kgXref_hg19 = $this->loadKgXref(
                $this->getPathKgXrefHg19()
            );
        }

        return $this->_kgXref_hg19;
    }

    // public function _load_genetic_map_HapMapII_GRCh37($filename)
    // {
    // $genetic_map = array( );
    // $archive = new PharData($filename);
    // foreach($archive as $file) {
    // if (strpos("genetic_map",$file["name"])===true){
    // $df = array( );
    // if (($handle = fopen($file, "r")) !== FALSE) {
    // while (($data = fgetcsv($handle,"\t")) !== FALSE) {
    // $df["Position(bp)"]=$data["pos"];
    // $df["Rate(cM/Mb)"]=$data["rate"];
    // $df["Map(cM)"]=$data["map"];
    // }
    // fclose($handle);
    // }
    // $start_pos = strpos($file["name"],"chr") + 3;
    // $end_pos = strpos($file["name"],".");
    // $genetic_map[substr($file["name"],$start_pos,$end_pos)] = $df;
    // }
    // }
    // $genetic_map["X"] = array_merge(
    // $genetic_map["X_par1"], $genetic_map["X"], $genetic_map["X_par2"]
    // );
    // $genetic_map["X_par1"]=array( );
    // $genetic_map["X_par2"]=array( );
    // return $genetic_map;
    // }

    /**
     * Loads a genetic map from a file in the 1000 Genomes Project format (GRCh37).
     *
     * Every member of the archive is handed to processGeneticMapFile(), which
     * adds one entry per member to the result.
     *
     * @param string $filename The path to the archive to load.
     * @return array An associative array of genetic maps, keyed by chromosome.
     */
    public function loadGeneticMap1000GGRCh37($filename)
    {
        $geneticMap = [];

        // Fully qualified on purpose: this file is in namespace Dna and does
        // not import PharData, so the bare name would resolve to Dna\PharData.
        $archive = new \PharData($filename);

        foreach ($archive as $member) {
            $geneticMap = $this->processGeneticMapFile($member->getPathname(), $geneticMap);
        }

        return $geneticMap;
    }

    /**
     * Processes a single genetic map file and adds the data to the $geneticMap array.
     *
     * The file is read as tab-separated text through gzopen(). The first row
     * is the header; each following row whose column count matches the header
     * becomes ["pos" => ..., "rate" => ..., "map" => ...], taken from the
     * "Position(bp)", "Rate(cM/Mb)" and "Map(cM)" columns. Rows with a
     * different column count are skipped.
     *
     * @param string $filepath The path to the genetic map file.
     * @param array $geneticMap The array to add the genetic map data to.
     * @return array The updated $geneticMap array.
     * @throws Exception If the file cannot be opened.
     */
    public function processGeneticMapFile($filepath, $geneticMap)
    {
        $handle = gzopen($filepath, 'r');
        if ($handle === false) {
            throw new Exception("Unable to open genetic map file: " . $filepath);
        }

        $df = [];
        try {
            $header = fgetcsv($handle, 0, "\t");

            // An empty file has no header row; it yields an empty map entry.
            if (is_array($header)) {
                $columnCount = count($header);

                while (($data = fgetcsv($handle, 0, "\t")) !== false) {
                    if (count($data) !== $columnCount) {
                        continue;
                    }

                    $row = array_combine($header, $data);
                    $df[] = [
                        "pos" => $row["Position(bp)"],
                        "rate" => $row["Rate(cM/Mb)"],
                        "map" => $row["Map(cM)"],
                    ];
                }
            }
        } finally {
            // Release the stream even if parsing fails part-way through.
            gzclose($handle);
        }

        // The chromosome is the second "-"-separated token of the file name.
        // NOTE(review): assumes member names contain at least one "-" - confirm
        // against the archive layout.
        $chrom = explode("-", basename($filepath))[1];
        $geneticMap[$chrom] = $df;

        return $geneticMap;
    }

    // // public function downloadFile($url, $filename, $compress=False, $timeout=30)
    // // {
    // // if(strpos($url, "ftp://") !== false) {
    // // $url=str_replace($url,"ftp://", "http://");
    // // }
    // // if ($compress && substr($filename,strlen($filename)-3,strlen($filename)) != ".gz"){
    // // $filename = $filename+".gz";
    // // }
    // // $destination = join($this->resources_dir, $filename);

    // // if (!mkdir($destination)){
    // // return "";
    // // }
    // // if (file_exists($destination)) {
    // // $file_url = $destination;
    // // header('Content-Type: application/octet-stream');
    // // header('Content-Description: File Transfer');
    // // header('Content-Disposition: attachment; filename=' . $filename);
    // // header('Expires: 0');
    // // header('Cache-Control: must-revalidate');
    // // header('Pragma: public');
    // // header('Content-Length: ' . filesize($file_url));
    // // readfile($file_url);

    // // // if $compress
    // // // $this->_write_data_to_gzip(f, data)
    // // // else
    // // // f.write(data)
    // // }
    // // return $destination;
    // // }

    // /**
    // * Load UCSC knownGene table.
239 | // * 240 | // * @param string $filename Path to knownGene file 241 | // * 242 | // * @return array KnownGene table (associative array) 243 | // */ 244 | // public static function loadKnownGene(string $filename): array 245 | // { 246 | // $file = fopen($filename, 'r'); 247 | // $headers = [ 248 | // 'name', 249 | // 'chrom', 250 | // 'strand', 251 | // 'txStart', 252 | // 'txEnd', 253 | // 'cdsStart', 254 | // 'cdsEnd', 255 | // 'exonCount', 256 | // 'exonStarts', 257 | // 'exonEnds', 258 | // 'proteinID', 259 | // 'alignID', 260 | // ]; 261 | // $knownGene = []; 262 | 263 | // while (($row = fgetcsv($file, 0, "\t")) !== false) { 264 | // $rowData = array_combine($headers, $row); 265 | // $rowData['chrom'] = substr($rowData['chrom'], 3); 266 | // $knownGene[$rowData['name']] = $rowData; 267 | // } 268 | 269 | // fclose($file); 270 | 271 | // return $knownGene; 272 | // } 273 | 274 | // /** 275 | // * Load UCSC kgXref table. 276 | // * 277 | // * @param string $filename Path to kgXref file 278 | // * 279 | // * @return array kgXref table (associative array) 280 | // */ 281 | // public static function loadKgXref(string $filename): array 282 | // { 283 | // $file = fopen($filename, 'r'); 284 | // $headers = [ 285 | // 'kgID', 286 | // 'mRNA', 287 | // 'spID', 288 | // 'spDisplayID', 289 | // 'geneSymbol', 290 | // 'refseq', 291 | // 'protAcc', 292 | // 'description', 293 | // 'rfamAcc', 294 | // 'tRnaName', 295 | // ]; 296 | // $kgXref = []; 297 | 298 | // while (($row = fgetcsv($file, 0, "\t")) !== false) { 299 | // $rowData = array_combine($headers, $row); 300 | // $kgXref[$rowData['kgID']] = $rowData; 301 | // } 302 | 303 | // fclose($file); 304 | 305 | // return $kgXref; 306 | // } 307 | 308 | /** 309 | * Get local path to cytoBand file for hg19 / GRCh37 from UCSC, downloading if necessary. 
310 | * 311 | * @return string Path to cytoBand_hg19.txt.gz 312 | */ 313 | public function getPathCytoBandHg19(): string 314 | { 315 | return $this->downloadFile( 316 | 'ftp://hgdownload.cse.ucsc.edu/goldenPath/hg19/database/cytoBand.txt.gz', 317 | 'cytoBand_hg19.txt.gz' 318 | ); 319 | } 320 | 321 | // /** 322 | // * Download file from a given URL if not exists and return its local path. 323 | // * 324 | // * @param string $url URL of the file to download 325 | // * @param string $filename Local name for the downloaded file 326 | // * 327 | // * @return string Local path to the downloaded file 328 | // */ 329 | // protected function downloadFile(string $url, string $filename): string 330 | // { 331 | // $path = __DIR__ . '/' . $filename; 332 | // if (!file_exists($path)) { 333 | // $parsedUrl = parse_url($url); 334 | // $host = $parsedUrl['host']; 335 | // $remotePath = $parsedUrl['path']; 336 | 337 | // $conn = ftp_connect($host); 338 | // if ($conn) { 339 | // $loggedIn = ftp_login($conn, 'anonymous', ''); 340 | // if ($loggedIn) { 341 | // ftp_pasv($conn, true); 342 | // $downloaded = ftp_get($conn, $path, $remotePath, FTP_BINARY); 343 | // if (!$downloaded) { 344 | // throw new Exception("Failed to download the file '{$url}'."); 345 | // } 346 | // ftp_close($conn); 347 | // } else { 348 | // throw new Exception("Failed to log in to the FTP server '{$host}'."); 349 | // } 350 | // } else { 351 | // throw new Exception("Failed to connect to the FTP server '{$host}'."); 352 | // } 353 | // } 354 | 355 | // return $path; 356 | // } 357 | 358 | // /** 359 | // * Get local path to HapMap Phase II genetic map for hg19 / GRCh37 (HapMapII), downloading if necessary 360 | // * 361 | // * @return string Path to genetic_map_HapMapII_GRCh37.tar.gz 362 | // */ 363 | // public function getPathGeneticMapHapMapIIGRCh37(): string 364 | // { 365 | // return $this->downloadFile( 366 | // 
'ftp://ftp.ncbi.nlm.nih.gov/hapmap/recombination/2011-01_phaseII_B37/genetic_map_HapMapII_GRCh37.tar.gz', 367 | // 'genetic_map_HapMapII_GRCh37.tar.gz' 368 | // ); 369 | // } 370 | 371 | // /** 372 | // * Get local path to population-specific 1000 Genomes Project genetic map, 373 | // * downloading if necessary. 374 | // * 375 | // * @param string $pop 376 | // * @return string path to {pop}_omni_recombination_20130507.tar 377 | // */ 378 | // public function getGeneticMap1000G_GRCh37($pop) 379 | // { 380 | // $filename = "{$pop}_omni_recombination_20130507.tar"; 381 | // return $this->downloadFile( 382 | // "ftp://ftp.1000genomes.ebi.ac.uk/vol1/ftp/technical/working/20130507_omni_recombination_rates/{$filename}", 383 | // $filename 384 | // ); 385 | // } 386 | 387 | // /** 388 | // * Downloads the knownGene.txt.gz file for the hg19 genome assembly from the UCSC Genome Browser FTP server. 389 | // * 390 | // * @return string The path to the downloaded file. 391 | // */ 392 | // public function get_path_knownGene_hg19(): string { 393 | // // Download the file from the UCSC Genome Browser FTP server. 394 | // // The file is located at ftp://hgdownload.cse.ucsc.edu/goldenPath/hg19/database/knownGene.txt.gz 395 | // // and will be saved as knownGene_hg19.txt.gz in the current directory. 396 | // return $this->download_file( 397 | // "ftp://hgdownload.cse.ucsc.edu/goldenPath/hg19/database/knownGene.txt.gz", 398 | // "knownGene_hg19.txt.gz" 399 | // ); 400 | // } 401 | 402 | // /** 403 | // * Downloads the kgXref.txt.gz file for the hg19 genome assembly from the UCSC Genome Browser FTP server. 404 | // * 405 | // * @return string The path to the downloaded file. 406 | // */ 407 | // public function get_path_kgXref_hg19(): string { 408 | // // Download the file from the UCSC Genome Browser FTP server. 
409 | // // The file is located at ftp://hgdownload.cse.ucsc.edu/goldenPath/hg19/database/kgXref.txt.gz 410 | // // and will be saved as kgXref_hg19.txt.gz in the current directory. 411 | // return $this->download_file( 412 | // "ftp://hgdownload.cse.ucsc.edu/goldenPath/hg19/database/kgXref.txt.gz", 413 | // "kgXref_hg19.txt.gz" 414 | // ); 415 | // } 416 | 417 | 418 | 419 | // public function download_example_datasets() : array 420 | // { 421 | // return [ 422 | // $this->downloadFile( 423 | // "https://opensnp.org/data/662.23andme.340", 424 | // "662.23andme.340.txt.gz", 425 | // $compress=True 426 | // ), 427 | // $this->downloadFile( 428 | // "https://opensnp.org/data/662.ftdna-illumina.341", 429 | // "662.ftdna-illumina.341.csv.gz", 430 | // $compress=True 431 | // ), 432 | // $this->downloadFile( 433 | // "https://opensnp.org/data/663.23andme.305", 434 | // "663.23andme.305.txt.gz", 435 | // $compress=True 436 | // ), 437 | // $this->downloadFile( 438 | // "https://opensnp.org/data/4583.ftdna-illumina.3482", 439 | // "4583.ftdna-illumina.3482.csv.gz" 440 | // ), 441 | // $this->downloadFile( 442 | // "https://opensnp.org/data/4584.ftdna-illumina.3483", 443 | // "4584.ftdna-illumina.3483.csv.gz" 444 | // ), 445 | // ]; 446 | // } 447 | } 448 | 449 | ?> -------------------------------------------------------------------------------- /tests/Snps/SnpsMergeTest.php: -------------------------------------------------------------------------------- 1 | assertEquals(count($results), count($expectedResults)); 17 | 18 | foreach ($results as $i => $result) { 19 | $expectedResult = $expectedResults[$i]; 20 | 21 | $this->assertEquals( 22 | [ 23 | "common_rsids", 24 | "discrepant_genotype_rsids", 25 | "discrepant_position_rsids", 26 | "merged", 27 | ], 28 | sort(array_keys($result)) 29 | ); 30 | 31 | if (array_key_exists("merged", $expectedResult)) { 32 | if ($expectedResult["merged"]) { 33 | $this->assertTrue($result["merged"]); 34 | } else { 35 | 
$this->assertFalse($result["merged"]); 36 | } 37 | } else { 38 | $this->assertFalse($result["merged"]); 39 | } 40 | 41 | foreach (["common_rsids", "discrepant_position_rsids", "discrepant_genotype_rsids"] as $key) { 42 | if (array_key_exists($key, $expectedResult)) { 43 | $this->assertEquals( 44 | $result[$key], 45 | $expectedResult[$key], 46 | true, 47 | true 48 | ); 49 | } else { 50 | $this->assertTrue($result[$key]->isEmpty()); 51 | $this->assertEquals($result[$key]->getName(), "rsid"); 52 | } 53 | } 54 | } 55 | } 56 | 57 | public function testSourceSNPs() 58 | { 59 | $tmpdir = sys_get_temp_dir(); 60 | 61 | $initial = new SNPs("tests/input/GRCh37.csv", output_dir: $tmpdir); 62 | $this->assertEquals($initial->getSource(), "generic"); 63 | $initial->merge([new SNPs("tests/input/23andme.txt")]); 64 | 65 | $this->assertEquals($initial->getSource(), "generic, 23andMe"); 66 | 67 | $this->assertEquals($initial->getAllSources(), ["generic", "23andMe"]); 68 | $mergedFile = $tmpdir . "/generic__23andMe_GRCh37.txt"; 69 | $this->assertEquals($initial->toTsv(), $mergedFile); 70 | 71 | $fromFile = new SNPs($mergedFile); 72 | 73 | 74 | $this->assertEquals($initial->getSnps(), $fromFile->getSnps()); 75 | $this->assertResults($fromFile, [["merged" => true]]); 76 | } 77 | 78 | public function testMergeList() 79 | { 80 | $s = new SNPs(); 81 | $results = $s->merge([new SNPs("tests/input/GRCh37.csv"), new SNPs("tests/input/GRCh37.csv")]); 82 | $this->assertEquals($s->getSnps(), self::snps_GRCh37()); 83 | $this->assertEquals($s->getSource(), "generic, generic"); 84 | $this->assertEquals($s->getAllSources(), ["generic", "generic"]); 85 | 86 | $expectedResults = [ 87 | ["merged" => true], 88 | [ 89 | "merged" => true, 90 | "common_rsids" => [ 91 | "rs3094315", 92 | "rs2500347", 93 | "rsIndelTest", 94 | "rs11928389", 95 | ], 96 | ], 97 | ]; 98 | $this->assertResults($results, $expectedResults); 99 | } 100 | 101 | public function testMergeRemapping() 102 | { 103 | $s = new 
SNPs("tests/input/NCBI36.csv"); 104 | 105 | $results = $s->merge([new SNPs("tests/input/GRCh37.csv")]); 106 | 107 | // Check that there are no discrepancies in merge positions and genotypes 108 | $this->assertCount(0, $s->getDiscrepantMergePositions()); 109 | $this->assertCount(0, $s->getDiscrepantMergeGenotypes()); 110 | 111 | // Compare the 'snps' attribute of 's' with the expected array directly 112 | $this->assertEquals($s->getSnps(), self::snps_NCBI36()); 113 | 114 | // Check the results of the merge operation 115 | $expectedResults = [ 116 | [ 117 | "merged" => true, 118 | "common_rsids" => [ 119 | "rs3094315", 120 | "rs2500347", 121 | "rsIndelTest", 122 | "rs11928389", 123 | ], 124 | ], 125 | ]; 126 | $this->assertResults($results, $expectedResults); 127 | } 128 | 129 | public function testMergeRemapFalse() 130 | { 131 | $s = new SNPs("tests/input/NCBI36.csv"); 132 | 133 | $results = $s->merge([new SNPs("tests/input/GRCh37.csv")], false); 134 | 135 | // Check the count of discrepancies in merge positions 136 | $this->assertCount(4, $s->getDiscrepantMergePositions()); 137 | // Compare the discrepancies in merge positions with the expected results 138 | $this->assertSame( 139 | $s->getDiscrepantMergePositions(), 140 | $results[0]["discrepant_position_rsids"] 141 | ); 142 | 143 | // Check the count of discrepancies in merge genotypes 144 | $this->assertCount(1, $s->getDiscrepantMergeGenotypes()); 145 | // Compare the discrepancies in merge genotypes with the expected results 146 | $this->assertSame( 147 | $s->getDiscrepantMergeGenotypes(), 148 | $results[0]["discrepant_genotype_rsids"] 149 | ); 150 | 151 | // Check the count of discrepancies in both positions and genotypes 152 | $this->assertCount(4, $s->getDiscrepantMergePositionsGenotypes()); 153 | // Compare the discrepancies in both positions and genotypes with the expected results 154 | $this->assertSame( 155 | $s->getDiscrepantMergePositionsGenotypes(), 156 | $results[0]["discrepant_position_rsids"] 157 | 
); 158 | 159 | // Define the expected array for snps_NCBI36 with the discrepant genotype set to null/NA 160 | $expected = self::snps_NCBI36(); 161 | $expected["rs11928389"]["genotype"] = null; 162 | 163 | // Compare the 'snps' attribute of 's' with the expected array directly 164 | $this->assertEquals($s->getSnps(), $expected); 165 | 166 | // Check the results of the merge operation 167 | $expectedResults = [ 168 | [ 169 | "merged" => true, 170 | "common_rsids" => [ 171 | "rs3094315", 172 | "rs2500347", 173 | "rsIndelTest", 174 | "rs11928389", 175 | ], 176 | "discrepant_position_rsids" => [ 177 | "rs3094315", 178 | "rs2500347", 179 | "rsIndelTest", 180 | "rs11928389", 181 | ], 182 | "discrepant_genotype_rsids" => ["rs11928389"], 183 | ], 184 | ]; 185 | $this->assertResults($results, $expectedResults); 186 | } 187 | 188 | 189 | public function testMergePhased() 190 | { 191 | $s1 = new SNPs("tests/input/generic.csv"); 192 | $s2 = new SNPs("tests/input/generic.csv"); 193 | $s1->setPhased(true); 194 | $s2->setPhased(true); 195 | 196 | $results = $s1->merge([$s2]); 197 | 198 | // Check if 's1' is marked as phased 199 | $this->assertTrue($s1->isPhased()); 200 | 201 | // Compare the 'snps' attribute of 's1' with the expected array directly 202 | $this->assertEquals($s1->getSnps(), self::genericSnps()); 203 | 204 | // Check the results of the merge operation 205 | $expectedResults = [ 206 | [ 207 | "merged" => true, 208 | "common_rsids" => [ 209 | "rs1", "rs2", "rs3", "rs4", 210 | "rs5", "rs6", "rs7", "rs8" 211 | ], 212 | ], 213 | ]; 214 | $this->assertResults($results, $expectedResults); 215 | } 216 | 217 | public function testMergeUnphased() 218 | { 219 | $s1 = new SNPs("tests/input/generic.csv"); 220 | $s2 = new SNPs("tests/input/generic.csv"); 221 | $s1->setPhased(true); 222 | 223 | $results = $s1->merge([$s2]); 224 | 225 | // Check if 's1' is marked as unphased (not phased) 226 | $this->assertFalse($s1->isPhased()); 227 | 228 | // Compare the 'snps' attribute of 's1' 
with the expected array directly 229 | $this->assertEquals($s1->getSnps(), self::genericSnps()); 230 | 231 | // Check the results of the merge operation 232 | $expectedResults = [ 233 | [ 234 | "merged" => true, 235 | "common_rsids" => [ 236 | "rs1", "rs2", "rs3", "rs4", 237 | "rs5", "rs6", "rs7", "rs8" 238 | ], 239 | ], 240 | ]; 241 | $this->assertResults($results, $expectedResults); 242 | } 243 | 244 | public function testMergeNonExistentFile() 245 | { 246 | $s = new SNPs(); 247 | $results = $s->merge([ 248 | new SNPs("tests/input/non_existent_file.csv"), 249 | new SNPs("tests/input/GRCh37.csv") 250 | ]); 251 | 252 | // Compare the 'snps' attribute of 's' with the expected array directly 253 | $this->assertEquals($s->getSnps(), self::snps_GRCh37()); 254 | 255 | // Check the results of the merge operation 256 | $expectedResults = [ 257 | [], // No merge for the non-existent file 258 | ["merged" => true], 259 | ]; 260 | $this->assertResults($results, $expectedResults); 261 | } 262 | 263 | public function testMergeInvalidFile() 264 | { 265 | $s = new SNPs(); 266 | $results = $s->merge([ 267 | new SNPs("tests/input/GRCh37.csv"), 268 | new SNPs("tests/input/empty.txt") 269 | ]); 270 | 271 | // Compare the 'snps' attribute of 's' with the expected array directly 272 | $this->assertEquals($s->getSnps(), self::snps_GRCh37()); 273 | 274 | // Check the results of the merge operation 275 | $expectedResults = [ 276 | ["merged" => true], // Merge with the valid file 277 | [], // No merge for the invalid file 278 | ]; 279 | $this->assertResults($results, $expectedResults); 280 | } 281 | 282 | public function testMergeExceedDiscrepantPositionsThreshold() 283 | { 284 | $s1 = new SNPs("tests/input/generic.csv"); 285 | $s2 = new SNPs("tests/input/generic.csv"); 286 | $s2->getSnps()["rs1"]["pos"] = 100; 287 | 288 | $results = $s1->merge([$s2], ["discrepant_positions_threshold" => 0]); 289 | $this->assertCount(0, $s1->getDiscrepantMergePositions()); 290 | $this->assertCount(0, 
$s1->getDiscrepantMergeGenotypes()); 291 | $this->assertCount(0, $s1->getDiscrepantMergePositionsGenotypes()); 292 | 293 | // Compare the 'snps' attribute of 's1' with the expected array directly 294 | $this->assertEquals($s1->getSnps(), self::genericSnps()); 295 | 296 | // Check the results of the merge operation 297 | $expectedResults = [[]]; 298 | $this->assertResults($results, $expectedResults); 299 | } 300 | 301 | public function testMergeExceedDiscrepantGenotypesThreshold() 302 | { 303 | $s1 = new SNPs("tests/input/generic.csv"); 304 | $s2 = new SNPs("tests/input/generic.csv"); 305 | $s2->getSnps()["rs1"]["genotype"] = "CC"; 306 | 307 | $results = $s1->merge([$s2], ["discrepant_genotypes_threshold" => 0]); 308 | $this->assertCount(0, $s1->getDiscrepantMergePositions()); 309 | $this->assertCount(0, $s1->getDiscrepantMergeGenotypes()); 310 | $this->assertCount(0, $s1->getDiscrepantMergePositionsGenotypes()); 311 | 312 | // Compare the 'snps' attribute of 's1' with the expected array directly 313 | $this->assertEquals($s1->getSnps(), self::genericSnps()); 314 | 315 | // Check the results of the merge operation 316 | $expectedResults = [[]]; 317 | $this->assertResults($results, $expectedResults); 318 | } 319 | 320 | public function testMergingFilesDiscrepantSnps() 321 | { 322 | $tmpDir = sys_get_temp_dir(); 323 | $dest1 = $tmpDir . "/discrepant_snps1.csv"; 324 | $dest2 = $tmpDir . 
"/discrepant_snps2.csv"; 325 | 326 | // Read the CSV file 327 | $csv = Reader::createFromPath("tests/input/discrepant_snps.csv", "r"); 328 | $csv->setHeaderOffset(1); 329 | $records = $csv->getRecords(); 330 | 331 | // Create arrays for the first and second CSV files 332 | $file1Data = []; 333 | $file2Data = []; 334 | foreach ($records as $record) { 335 | $file1Data[] = [ 336 | "chromosome" => $record["chrom"], 337 | "position" => $record["pos_file1"], 338 | "genotype" => $record["genotype_file1"], 339 | ]; 340 | $file2Data[] = [ 341 | "chromosome" => $record["chrom"], 342 | "position" => $record["pos_file2"], 343 | "genotype" => $record["genotype_file2"], 344 | ]; 345 | } 346 | 347 | // Write arrays to CSV files 348 | $file1Writer = Writer::createFromPath($dest1, "w"); 349 | $file1Writer->insertOne(["chromosome", "position", "genotype"]); 350 | $file1Writer->insertAll($file1Data); 351 | 352 | $file2Writer = Writer::createFromPath($dest2, "w"); 353 | $file2Writer->insertOne(["chromosome", "position", "genotype"]); 354 | $file2Writer->insertAll($file2Data); 355 | 356 | $s = new SNPs(); 357 | $s->merge([new SNPs($dest1), new SNPs($dest2)]); 358 | 359 | // Expected data 360 | $expected = []; 361 | foreach ($records as $record) { 362 | $expected[] = [ 363 | "chromosome" => $record["chrom"], 364 | "discrepant_position" => $record["discrepant_position"], 365 | "discrepant_genotype" => $record["discrepant_genotype"], 366 | "pos" => $record["expected_position"], 367 | "genotype" => $record["expected_genotype"], 368 | ]; 369 | } 370 | 371 | // Create an SNPs object from the expected data 372 | $expectedSNPs = new SNPs(); 373 | $expectedSNPs->setSnps($expected); 374 | $expectedSNPs->sort(); 375 | $expected = $expectedSNPs->getSnps(); 376 | 377 | // Assert results 378 | $this->assertCount(count($expected), $s->getDiscrepantMergePositions()); 379 | $this->assertCount(count($expected), $s->getDiscrepantMergeGenotypes()); 380 | $this->assertArrayHasKey("pos", $s->getSnps()); 381 
| $this->assertArrayHasKey("genotype", $s->getSnps()); 382 | 383 | // Perform comparisons 384 | foreach ($expected as $key => $value) { 385 | $this->assertEquals($value["discrepant_position"], $s->getDiscrepantMergePositions()[$key]); 386 | $this->assertEquals($value["discrepant_genotype"], $s->getDiscrepantMergeGenotypes()[$key]); 387 | $this->assertEquals($value["pos"], $s->getSnps()[$key]["pos"]); 388 | $this->assertEquals($value["genotype"], $s->getSnps()[$key]["genotype"]); 389 | } 390 | } 391 | 392 | public function testAppendingDfs() 393 | { 394 | $s = new SNPs(); 395 | $s->setSnps([ 396 | ["rsid" => "rs1", "chrom" => "1", "pos" => 1, "genotype" => "AA"], 397 | ]); 398 | $s->setDuplicate([ 399 | ["rsid" => "rs1", "chrom" => "1", "pos" => 1, "genotype" => "AA"], 400 | ]); 401 | $s->setDiscrepantXY([ 402 | ["rsid" => "rs1", "chrom" => "1", "pos" => 1, "genotype" => "AA"], 403 | ]); 404 | 405 | $s->merge([$s]); 406 | 407 | $df = [ 408 | ["rsid" => "rs1", "chrom" => "1", "pos" => 1, "genotype" => "AA"], 409 | ["rsid" => "rs1", "chrom" => "1", "pos" => 1, "genotype" => "AA"], 410 | ]; 411 | 412 | $this->assertEquals($df, $s->getDuplicate()); 413 | $this->assertEquals($df, $s->getDiscrepantXY()); 414 | $this->assertEquals([], $s->getHeterozygousMT()); 415 | $this->assertEquals([], $s->getDiscrepantVcfPosition()); 416 | } 417 | 418 | public function testMergeChrom() 419 | { 420 | $s1 = new SNPs("tests/input/generic.csv"); 421 | $s2 = new SNPs(); 422 | $s2->setBuild(37); 423 | 424 | $snpData = [ 425 | ["rsid" => "rs100", "chrom" => "Y", "pos" => 100, "genotype" => "A"], 426 | ["rsid" => "rs101", "chrom" => "Y", "pos" => 101, "genotype" => null], 427 | ["rsid" => "rs102", "chrom" => "Y", "pos" => 102, "genotype" => "A"], 428 | ["rsid" => "rs103", "chrom" => "Y", "pos" => 103, "genotype" => "A"], 429 | ]; 430 | 431 | $s1->setSnps(array_merge($s1->getSnps(), $snpData)); 432 | $s2->setSnps(array_merge($s2->getSnps(), $snpData)); 433 | 434 | // Set values for chrom that 
will be ignored 435 | $s2->setSnpsValue("rs3", "pos", 1003); // Discrepant position 436 | $s2->setSnpsValue("rs4", "genotype", "AA"); // Discrepant genotype 437 | $s2->setSnpsValue("rs5", "genotype", "AA"); 438 | 439 | // Set values for chrom to be merged 440 | $s2->setSnpsValue("rs100", "genotype", "T"); // Discrepant genotype 441 | $s2->setSnpsValue("rs101", "genotype", "A"); 442 | $s2->setSnpsValue("rs102", "pos", 1002); // Discrepant position 443 | 444 | // Set expected values for merge result 445 | $s1->setSnpsValue("rs100", "genotype", null); // Discrepant genotype sets to null 446 | $s1->setSnpsValue("rs101", "genotype", "A"); // Updates null 447 | 448 | $results = $s1->merge([$s2], "Y"); 449 | 450 | $this->assertEquals($s1->getSnps(), $s1->getSnps()); 451 | 452 | $expectedResults = [ 453 | [ 454 | "merged" => true, 455 | "common_rsids" => ["rs100", "rs101", "rs102", "rs103"], 456 | "discrepant_position_rsids" => ["rs102"], 457 | "discrepant_genotype_rsids" => ["rs100"], 458 | ] 459 | ]; 460 | 461 | $this->assertEquals($expectedResults, $results); 462 | 463 | $this->assertEquals(count($s1->getDiscrepantMergePositions()), 1); 464 | $this->assertEquals(count($s1->getDiscrepantMergeGenotypes()), 1); 465 | } 466 | } 467 | -------------------------------------------------------------------------------- /tests/Snps/SnpsTest.php: -------------------------------------------------------------------------------- 1 | assertEquals(count($s), 8); 30 | } 31 | 32 | public function test_len_empty() 33 | { 34 | foreach (self::empty_snps() as $s) { 35 | $this->assertEquals(count($s), 0); 36 | } 37 | } 38 | 39 | public function test__toString() 40 | { 41 | $s = new SNPs("tests/input/GRCh37.csv"); 42 | $this->assertEquals("SNPs('GRCh37.csv')", $s->__toString()); 43 | } 44 | 45 | public function test__toString_bytes() 46 | { 47 | $data = file_get_contents("tests/input/GRCh37.csv"); 48 | $s = new SNPs($data); 49 | $this->assertEquals("SNPs()", $s->__toString()); 50 | } 51 | 52 | 
public function testAssembly() 53 | { 54 | $s = new SNPs("tests/input/GRCh38.csv"); 55 | $this->assertEquals($s->getAssembly(), "GRCh38"); 56 | } 57 | 58 | public function testAssemblyNoSnps() 59 | { 60 | $emptySnps = $this->empty_snps(); 61 | 62 | foreach ($emptySnps as $snps) { 63 | $this->assertEmpty($snps->getAssembly()); 64 | } 65 | } 66 | 67 | public function testBuild() 68 | { 69 | $s = new SNPs("tests/input/NCBI36.csv"); 70 | $this->assertEquals($s->getBuild(), 36); 71 | $this->assertEquals($s->getAssembly(), "NCBI36"); 72 | } 73 | 74 | public function testBuildDetectedNoSnps() 75 | { 76 | $emptySnps = $this->empty_snps(); 77 | 78 | foreach ($emptySnps as $snps) { 79 | $this->assertFalse($snps->isBuildDetected()); 80 | } 81 | } 82 | 83 | public function testBuildNoSnps() 84 | { 85 | $emptySnps = $this->empty_snps(); 86 | 87 | foreach ($emptySnps as $snps) { 88 | $this->assertEmpty($snps->getBuild()); 89 | } 90 | } 91 | 92 | public function testBuildDetectedPARSnps() 93 | { 94 | $snps = $this->loadAssignPARSnps('tests/input/GRCh37_PAR.csv'); 95 | $this->assertEquals(37, $snps->getBuild()); 96 | $this->assertTrue($snps->isBuildDetected()); 97 | $expectedSnps = $this->snps_GRCh37_PAR(); 98 | $actualSnps = $snps->getSnps(); 99 | $this->assertEquals($expectedSnps, $actualSnps); 100 | } 101 | 102 | public function test_notnull() 103 | { 104 | $s = new SNPs("tests/input/generic.csv"); 105 | $snps = $this->generic_snps(); 106 | unset($snps["rs5"]); 107 | 108 | $this->assertEquals($s->notnull(), $snps, "Frames are not equal!"); 109 | } 110 | 111 | public function test_heterozygous() 112 | { 113 | $s = new SNPs("tests/input/generic.csv"); 114 | 115 | $expected = $this->create_snp_df( 116 | rsid: ["rs6", "rs7", "rs8"], 117 | chrom: ["1", "1", "1"], 118 | pos: [106, 107, 108], 119 | genotype: ["GC", "TC", "AT"] 120 | ); 121 | 122 | $this->assertEquals($expected, $s->heterozygous(), "Frames are not equal!"); 123 | } 124 | 125 | public function test_homozygous() 126 | { 
127 | $s = new SNPs("tests/input/generic.csv"); 128 | 129 | $expected = $this->create_snp_df( 130 | rsid: ["rs1", "rs2", "rs3", "rs4"], 131 | chrom: ["1", "1", "1", "1"], 132 | pos: [101, 102, 103, 104], 133 | genotype: ["AA", "CC", "GG", "TT"], 134 | ); 135 | 136 | $this->assertEquals($expected, $s->homozygous(), "Frames are not equal!"); 137 | } 138 | 139 | public function test_valid_False() 140 | { 141 | foreach ($this->empty_snps() as $snps) { 142 | $this->assertFalse($snps->isValid()); 143 | } 144 | } 145 | 146 | public function test_valid_True() 147 | { 148 | $s = new SNPs("tests/input/generic.csv"); 149 | $this->assertTrue($s->isValid()); 150 | } 151 | 152 | public function test_only_detect_source() 153 | { 154 | $s = new SNPs("tests/input/generic.csv", true); 155 | $this->assertEquals($s->getSource(), "generic"); 156 | $this->assertEquals(count($s), 0); 157 | } 158 | 159 | public function test_summary() 160 | { 161 | $s = new SNPs("tests/input/GRCh38.csv"); 162 | $this->assertEquals( 163 | $s->getSummary(), 164 | [ 165 | "source" => "generic", 166 | "assembly" => "GRCh38", 167 | "build" => 38, 168 | "build_detected" => true, 169 | "count" => 4, 170 | "chromosomes" => "1, 3", 171 | "sex" => "", 172 | ] 173 | ); 174 | } 175 | 176 | public function test_summary_no_snps() 177 | { 178 | foreach ($this->empty_snps() as $snps) { 179 | $this->assertEquals($snps->getSummary(), []); 180 | } 181 | } 182 | 183 | public function test_chromosomes() 184 | { 185 | $s = new SNPs("tests/input/chromosomes.csv"); 186 | $this->assertEquals(["1", "2", "3", "5", "PAR", "MT"], $s->getChromosomes()); 187 | } 188 | 189 | public function test_chromosomes_no_snps() 190 | { 191 | foreach ($this->empty_snps() as $snps) { 192 | $this->assertEmpty($snps->getChromosomes()); 193 | } 194 | } 195 | 196 | public function test_sex_Female_X_chrom() 197 | { 198 | $s = $this->simulate_snps( 199 | chrom: "X", 200 | pos_start: 1, 201 | pos_max: 155270560, 202 | pos_step: 10000, 203 | genotype: "AC" 
204 | ); 205 | $this->assertEquals("Female", $s->getSex()); 206 | } 207 | 208 | public function test_sex_Female_Y_chrom() 209 | { 210 | $s = $this->simulate_snps( 211 | chrom: "Y", 212 | pos_start: 1, 213 | pos_max: 59373566, 214 | pos_step: 10000, 215 | null_snp_step: 1 216 | ); 217 | $this->assertEquals("Female", $s->getSex()); 218 | } 219 | 220 | public function test_sex_Male_X_chrom() 221 | { 222 | $s = $this->simulate_snps( 223 | chrom: "X", 224 | pos_start: 1, 225 | pos_max: 155270560, 226 | pos_step: 10000, 227 | genotype: "AA" 228 | ); 229 | $this->assertEquals(15528, $s->count()); 230 | $s->deduplicateXYChrom(); 231 | $this->assertEquals(15528, $s->count()); 232 | $this->assertEquals(0, count($s->getDiscrepantXY())); 233 | $this->assertEquals("Male", $s->getSex()); 234 | } 235 | 236 | public function test_sex_Male_X_chrom_discrepant_XY() 237 | { 238 | $s = $this->simulate_snps( 239 | chrom: "X", 240 | pos_start: 1, 241 | pos_max: 155270560, 242 | pos_step: 10000, 243 | genotype: "AA" 244 | ); 245 | $this->assertEquals(15528, $s->count()); 246 | $s->setValue("rs8001", "genotype", "AC"); 247 | $s->deduplicateXYChrom(); 248 | $this->assertEquals(15527, $s->count()); 249 | $result = $this->create_snp_df( 250 | rsid: ["rs8001"], 251 | chrom: ["X"], 252 | pos: [80000001], 253 | genotype: ["AC"] 254 | ); 255 | $this->assertEquals($result, $s->getDiscrepantXY()); 256 | $this->assertEquals("Male", $s->getSex()); 257 | } 258 | 259 | public function test_sex_male_Y_chrom() 260 | { 261 | $s = $this->simulate_snps( 262 | chrom: "Y", 263 | pos_start: 1, 264 | pos_max: 59373566, 265 | pos_step: 10000 266 | ); 267 | 268 | $this->assertEquals("Male", $s->getSex()); 269 | } 270 | 271 | public function test_sex_not_determined() 272 | { 273 | $s = $this->simulate_snps( 274 | chrom: "1", 275 | pos_start: 1, 276 | pos_max: 249250621, 277 | pos_step: 10000 278 | ); 279 | 280 | $this->assertEquals("", $s->getSex()); 281 | } 282 | 283 | public function test_sex_no_snps() 284 | { 
285 | foreach ($this->empty_snps() as $snps) { 286 | $this->assertEmpty($snps->getSex()); 287 | } 288 | } 289 | 290 | public function test_source() 291 | { 292 | $s = new SNPs("tests/input/generic.csv"); 293 | $this->assertEquals("generic", $s->getSource()); 294 | $this->assertEquals(["generic"], $s->getAllSources()); 295 | } 296 | 297 | public function test_source_no_snps() 298 | { 299 | foreach ($this->empty_snps() as $snps) { 300 | $this->assertEmpty($snps->getSource()); 301 | } 302 | } 303 | 304 | public function test_count() 305 | { 306 | $s = new SNPs("tests/input/NCBI36.csv"); 307 | $this->assertEquals(4, $s->count()); 308 | } 309 | 310 | public function test_count_no_snps() 311 | { 312 | foreach ($this->empty_snps() as $snps) { 313 | $this->assertEquals(0, $snps->count()); 314 | $this->assertEmpty($snps->getSnps()); 315 | } 316 | } 317 | 318 | public function testDeduplicateFalse() 319 | { 320 | $snps = new SNPs("tests/input/duplicate_rsids.csv", deduplicate: false); 321 | $result = $this->create_snp_df(["rs1", "rs1", "rs1"], ["1", "1", "1"], [101, 102, 103], ["AA", "CC", "GG"]); 322 | $this->assertEquals($result, $snps->getSnps()); 323 | } 324 | 325 | public function testDeduplicateMTChrom() 326 | { 327 | $snps = new SNPs("tests/input/ancestry_mt.txt"); 328 | $result = $this->create_snp_df(["rs1", "rs2"], ["MT", "MT"], [101, 102], ["A", null]); 329 | $this->assertEquals($result, $snps->getSnps()); 330 | 331 | $heterozygousMTSnps = $this->create_snp_df(["rs3"], ["MT"], [103], ["GC"]); 332 | $this->assertEquals($heterozygousMTSnps, $snps->getHeterozygousMT()); 333 | } 334 | 335 | public function testDeduplicateMTChromFalse() 336 | { 337 | $snps = new SNPs("tests/input/ancestry_mt.txt", deduplicate: false); 338 | $result = $this->create_snp_df(["rs1", "rs2", "rs3"], ["MT", "MT", "MT"], [101, 102, 103], ["AA", null, "GC"]); 339 | $this->assertEquals($result, $snps->getSnps()); 340 | } 341 | 342 | public function testDuplicateRsids() 343 | { 344 | $snps = new 
SNPs("tests/input/duplicate_rsids.csv"); 345 | $result = $this->create_snp_df(["rs1"], ["1"], [101], ["AA"]); 346 | $duplicate = $this->create_snp_df(["rs1", "rs1"], ["1", "1"], [102, 103], ["CC", "GG"]); 347 | $this->assertEquals($result, $snps->getSnps()); 348 | $this->assertEquals($duplicate, $snps->getDuplicate()); 349 | } 350 | 351 | public function testRemap36to37() 352 | { 353 | $this->_run_remap_test(function () { 354 | $s = new SNPs("tests/input/NCBI36.csv"); 355 | list($chromosomes_remapped, $chromosomes_not_remapped) = $s->remap(37); 356 | $this->assertEquals(37, $s->getBuild()); 357 | $this->assertEquals("GRCh37", $s->getAssembly()); 358 | $this->assertCount(2, $chromosomes_remapped); 359 | $this->assertCount(0, $chromosomes_not_remapped); 360 | $this->assertEquals($this->snps_GRCh37(), $s->getSnps()); 361 | }, $this->NCBI36_GRCh37()); 362 | } 363 | 364 | public function testRemap37to36() 365 | { 366 | $this->_run_remap_test(function () { 367 | $s = new SNPs("tests/input/GRCh37.csv"); 368 | list($chromosomes_remapped, $chromosomes_not_remapped) = $s->remap(36); 369 | $this->assertEquals(36, $s->getBuild()); 370 | $this->assertEquals("NCBI36", $s->getAssembly()); 371 | $this->assertCount(2, $chromosomes_remapped); 372 | $this->assertCount(0, $chromosomes_not_remapped); 373 | $this->assertEquals($this->snps_NCBI36(), $s->getSnps()); 374 | }, $this->GRCh37_NCBI36()); 375 | } 376 | 377 | public function testRemap37to38() 378 | { 379 | $this->_run_remap_test(function () { 380 | $s = new SNPs("tests/input/GRCh37.csv"); 381 | list($chromosomes_remapped, $chromosomes_not_remapped) = $s->remap(38); 382 | $this->assertEquals(38, $s->getBuild()); 383 | $this->assertEquals("GRCh38", $s->getAssembly()); 384 | $this->assertCount(2, $chromosomes_remapped); 385 | $this->assertCount(0, $chromosomes_not_remapped); 386 | $this->assertEquals($this->snps_GRCh38(), $s->getSnps()); 387 | }, $this->GRCh37_GRCh38()); 388 | } 389 | 390 | public function testRemap37to37() 391 | { 
392 | $s = new SNPs("tests/input/GRCh37.csv"); 393 | list($chromosomes_remapped, $chromosomes_not_remapped) = $s->remap(37); 394 | $this->assertEquals(37, $s->getBuild()); 395 | $this->assertEquals("GRCh37", $s->getAssembly()); 396 | $this->assertCount(0, $chromosomes_remapped); 397 | $this->assertCount(2, $chromosomes_not_remapped); 398 | $this->assertEquals($this->snps_GRCh37(), $s->getSnps()); 399 | } 400 | 401 | public function testRemapInvalidAssembly() 402 | { 403 | $s = new SNPs("tests/input/GRCh37.csv"); 404 | list($chromosomes_remapped, $chromosomes_not_remapped) = $s->remap(-1); 405 | $this->assertEquals(37, $s->getBuild()); 406 | $this->assertEquals("GRCh37", $s->getAssembly()); 407 | $this->assertCount(0, $chromosomes_remapped); 408 | $this->assertCount(2, $chromosomes_not_remapped); 409 | } 410 | 411 | public function testRemapNoSnps() 412 | { 413 | $s = new SNPs(); 414 | list($chromosomes_remapped, $chromosomes_not_remapped) = $s->remap(38); 415 | $this->assertFalse($s->getBuild()); 416 | $this->assertCount(0, $chromosomes_remapped); 417 | $this->assertCount(0, $chromosomes_not_remapped); 418 | } 419 | 420 | public function testSaveToTsv() 421 | { 422 | $s = new SNPs("tests/input/generic.csv"); 423 | $tempFile = tempnam(sys_get_temp_dir(), 'snps_test'); 424 | $s->toTsv($tempFile); 425 | $content = file_get_contents($tempFile); 426 | $this->assertStringStartsWith("# Generated by snps", $content); 427 | unlink($tempFile); 428 | } 429 | 430 | public function testSaveNoSNPs() 431 | { 432 | $s = new SNPs(); 433 | $this->assertFalse($s->toTsv()); 434 | } 435 | 436 | public function testSaveNoSNPsVCF() 437 | { 438 | $s = new SNPs(); 439 | $this->assertFalse($s->toVcf()); 440 | } 441 | 442 | public function testSaveSource() 443 | { 444 | $tmpdir = sys_get_temp_dir(); 445 | $s = new SNPs("tests/input/GRCh38.csv", outputDir: $tmpdir); 446 | $dest = $tmpdir . DIRECTORY_SEPARATOR . 
"generic_GRCh38.txt"; 447 | $this->assertEquals($s->toTsv(), $dest); 448 | $snps = new SNPs($dest); 449 | $this->assertEquals($snps->getBuild(), 38); 450 | $this->assertTrue($snps->isBuildDetected()); 451 | $this->assertEquals($snps->getSource(), "generic"); 452 | $this->assertEquals($snps->getAllSources(), ["generic"]); 453 | $this->assertEquals($this->snps_GRCh38(), $snps->getSnps()); 454 | } 455 | 456 | public function testCluster() 457 | { 458 | $this->runClusterTest(function ($mock) { 459 | $s = new SNPs("tests/input/23andme.txt", resources: $mock); 460 | $this->assertEquals($s->getCluster(), "c1"); 461 | }, $this->getChipClusters()); 462 | } 463 | 464 | public function testChip() 465 | { 466 | $this->runClusterTest(function ($mock) { 467 | $s = new SNPs("tests/input/23andme.txt", resources: $mock); 468 | $this->assertEquals($s->getChip(), "HTS iSelect HD"); 469 | }, $this->_getChipClusters()); 470 | } 471 | 472 | public function testChipVersion() 473 | { 474 | $this->runClusterTest(function ($mock) { 475 | $s = new SNPs("tests/input/23andme.txt", resources: $mock); 476 | $this->assertEquals($s->getChipVersion(), "v4"); 477 | }, $this->getChipClusters()); 478 | } 479 | 480 | public function testComputeClusterOverlap() 481 | { 482 | $this->runClusterTest(function ($mock) { 483 | $s = new SNPs("tests/input/23andme.txt", resources: $mock); 484 | $result = $s->computeClusterOverlap(); 485 | $this->assertEquals($s->getCluster(), "c1"); 486 | $this->assertEquals($s->getChip(), "HTS iSelect HD"); 487 | $this->assertEquals($s->getChipVersion(), "v4"); 488 | $this->assertArrayHasKey("c1", $result); 489 | }, $this->_getChipClusters()); 490 | } 491 | 492 | public function testSnpsQc() 493 | { 494 | $s = new SNPs("tests/input/generic.csv"); 495 | $snpsQc = $s->getSnpsQc(); 496 | $expectedQcSnps = $this->genericSnps(); 497 | unset($expectedQcSnps['rs4']); 498 | unset($expectedQcSnps['rs6']); 499 | $this->assertEquals($expectedQcSnps, $snpsQc); 500 | } 501 | 502 | public 
function testLowQuality() 503 | { 504 | $s = new SNPs("tests/input/generic.csv"); 505 | $lowQualitySnps = $s->getLowQualitySnps(); 506 | $expectedLowQualitySnps = $this->genericSnps(); 507 | $this->assertEquals($expectedLowQualitySnps, $lowQualitySnps); 508 | } 509 | 510 | // Add more tests for SNPData and SNPAnalyzer classes 511 | public function testSNPData() 512 | { 513 | $snpData = new SNPData($this->genericSnps()); 514 | $this->assertEquals(8, $snpData->count()); 515 | $this->assertEquals(["1"], $snpData->getChromosomes()); 516 | } 517 | 518 | public function testSNPAnalyzer() 519 | { 520 | $buildDetector = $this->createMock(BuildDetector::class); 521 | $buildDetector->method('detectBuild')->willReturn(37); 522 | 523 | $clusterOverlapCalculator = $this->createMock(ClusterOverlapCalculator::class); 524 | $clusterOverlapCalculator->method('computeClusterOverlap')->willReturn(['cluster' => 'c1']); 525 | 526 | $snpAnalyzer = new SNPAnalyzer($buildDetector, $clusterOverlapCalculator); 527 | $snpData = new SNPData($this->genericSnps()); 528 | 529 | $this->assertEquals(37, $snpAnalyzer->detectBuild($snpData)); 530 | $this->assertEquals(['cluster' => 'c1'], $snpAnalyzer->computeClusterOverlap($snpData)); 531 | $this->assertEquals('Female', $snpAnalyzer->determineSex($snpData)); 532 | } 533 | } 534 | -------------------------------------------------------------------------------- /src/Snps/VariedicInherit.php: -------------------------------------------------------------------------------- 1 | config = $config; 42 | $required = [ 43 | self::KEY_CALLBACK, 44 | self::KEY_REMOVED, 45 | self::KEY_MAGIC, 46 | self::KEY_RESOURCE, 47 | ]; 48 | 49 | foreach ($required as $key) { 50 | if (!isset($this->config[$key])) { 51 | $message = sprintf(self::ERR_MISSING_KEY, $key); 52 | throw new InvalidArgumentException($message); 53 | } 54 | } 55 | } 56 | 57 | /** 58 | * Get the contents of a file. 
59 | * 60 | * @param string $filePath Path to the file to scan 61 | * @return string The file contents with line breaks replaced by spaces 62 | * @throws InvalidArgumentException If the file is not found 63 | */ 64 | public function getFileContents(string $filePath): string 65 | { 66 | if (!file_exists($filePath)) { 67 | $this->contents = ''; 68 | throw new InvalidArgumentException( 69 | sprintf(self::ERR_FILE_NOT_FOUND, $filePath) 70 | ); 71 | } 72 | 73 | $this->clearMessages(); 74 | $this->contents = file_get_contents($filePath); 75 | $this->contents = str_replace(["\r", "\n"], ['', ' '], $this->contents); 76 | 77 | return $this->contents; 78 | } 79 | 80 | /** 81 | * Extracts the value immediately following the supplied word up until the supplied end 82 | * 83 | * @param string $contents : text to search (usually $this->contents) 84 | * @param string $key : starting keyword or set of characters 85 | * @param string $delim : ending delimiter 86 | * @return string $name : classnames 87 | */ 88 | /** 89 | * Get the value of a key from a string. 90 | * 91 | * @param string $contents The string to search 92 | * @param string $key The key to search for 93 | * @param string $delimiter The delimiter to use 94 | * @return string The value of the key, or an empty string if not found 95 | */ 96 | public static function getKeyValue( 97 | string $contents, 98 | string $key, 99 | string $delimiter 100 | ): string { 101 | $position = strpos($contents, $key); 102 | 103 | if ($position === false) { 104 | return ''; 105 | } 106 | 107 | $end = strpos($contents, $delimiter, $position + strlen($key) + 1); 108 | $value = substr( 109 | $contents, 110 | $position + strlen($key), 111 | $end - $position - strlen($key) 112 | ); 113 | 114 | return is_string($value) ? 
trim($value) : ''; 115 | } 116 | 117 | /** 118 | * Clears messages 119 | * 120 | * @return void 121 | */ 122 | public function clearMessages() : void 123 | { 124 | $this->messages = []; 125 | $this->magic = []; 126 | } 127 | 128 | /** 129 | * Returns messages 130 | * 131 | * @param bool $clear : If TRUE, reset messages to [] 132 | * @return array $messages : accumulated messages 133 | */ 134 | public function getMessages(bool $clear = FALSE) : array 135 | { 136 | $messages = $this->messages; 137 | if ($clear) $this->clearMessages(); 138 | return $messages; 139 | } 140 | 141 | /** 142 | * Returns 0 and adds OK message 143 | * 144 | * @param string $function 145 | * @return int 0 146 | */ 147 | public function passedOK(string $function) : int 148 | { 149 | $this->messages[] = sprintf(self::OK_PASSED, $function); 150 | return 0; 151 | } 152 | 153 | /** 154 | * Runs all scans 155 | * 156 | * @return int $found : number of potential BC breaks found 157 | */ 158 | public function runAllScans() : int 159 | { 160 | $found = 0; 161 | $found += $this->scanRemovedFunctions(); 162 | $found += $this->scanIsResource(); 163 | $found += $this->scanMagicSignatures(); 164 | echo __METHOD__ . ':' . var_export($this->messages, TRUE) . "\n"; 165 | $found += $this->scanFromCallbacks(); 166 | return $found; 167 | } 168 | /** 169 | * Check for removed functions 170 | * 171 | * @return int $found : number of BC breaks detected 172 | */ 173 | public function scanRemovedFunctions() : int 174 | { 175 | $found = 0; 176 | $config = $this->config[self::KEY_REMOVED] ?? NULL; 177 | // we add this extra safety check in case this method is called separately 178 | if (empty($config)) { 179 | $message = sprintf(self::ERR_MISSING_KEY, self::KEY_REMOVED); 180 | throw new Exception($message); 181 | } 182 | foreach ($config as $func => $replace) { 183 | $search1 = ' ' . $func . '('; 184 | $search2 = ' ' . $func . 
' ('; 185 | if (strpos($this->contents, $search1) !== FALSE 186 | || strpos($this->contents, $search2) !== FALSE) { 187 | $this->messages[] = sprintf(self::ERR_REMOVED, $func, $replace); 188 | $found++; 189 | } 190 | } 191 | return ($found === 0) ? $this->passedOK(__FUNCTION__) : $found; 192 | } 193 | /** 194 | * Check for is_resource usage 195 | * If "is_resource" found, check against list of functions 196 | * that no longer produce resources in PHP 8 197 | * 198 | * @return int $found : number of BC breaks detected 199 | */ 200 | public function scanIsResource() : int 201 | { 202 | $found = 0; 203 | $search = 'is_resource'; 204 | // if "is_resource" not found discontinue search 205 | if (strpos($this->contents, $search) === FALSE) return $this->passedOK(__FUNCTION__); 206 | // pull list of functions that now return objects instead of resources 207 | $config = $this->config[self::KEY_RESOURCE] ?? NULL; 208 | // we add this extra safety check in case this method is called separately 209 | if (empty($config)) { 210 | $message = sprintf(self::ERR_MISSING_KEY, self::KEY_RESOURCE); 211 | throw new Exception($message); 212 | } 213 | foreach ($config as $func) { 214 | if ((strpos($this->contents, $func) !== FALSE)) { 215 | $this->messages[] = sprintf(self::ERR_IS_RESOURCE, $func); 216 | $found++; 217 | } 218 | } 219 | return ($found === 0) ? $this->passedOK(__FUNCTION__) : $found; 220 | } 221 | /** 222 | * Scan for magic method signatures 223 | * NOTE: doesn't check inside parentheses. 224 | * only checks for return data type + displays found and correct signatures for manual comparison 225 | * 226 | * @return int $found : number of invalid return data types 227 | */ 228 | public function scanMagicSignatures() : int 229 | { 230 | // locate all magic methods 231 | $found = 0; 232 | $matches = []; 233 | 234 | if (!empty($matches[1])) { 235 | $this->messages[] = self::MAGIC_METHODS; 236 | $config = $this->config[self::KEY_MAGIC] ?? 
NULL; 237 | // we add this extra safety check in case this method is called separately 238 | if (empty($config)) { 239 | $message = sprintf(self::ERR_MISSING_KEY, self::KEY_MAGIC); 240 | throw new Exception($message); 241 | } 242 | foreach ($matches[1] as $name) { 243 | $key = '__' . $name; 244 | // skip if key not found. must not be a defined magic method 245 | if (!isset($config[$key])) continue; 246 | // record official signature 247 | $this->messages[] = 'Signature: ' . ($config[$key]['signature'] ?? 'Signature not found'); 248 | $sub = $this->getKeyValue($this->contents, $key, '{'); 249 | if ($sub) { 250 | $sub = $key . $sub; 251 | // record found signature 252 | $this->messages[] = 'Actual : ' . $sub; 253 | // look for return type 254 | if (strpos($sub, ':')) { 255 | $ptn = '/.*?\(.*?\)\s*:\s*' . $config[$key]['return'] . '/'; 256 | // test for a match 257 | if (!preg_match($ptn, $sub)) { 258 | $this->messages[] = sprintf(self::ERR_MAGIC_SIGNATURE, $key); 259 | $found++; 260 | } 261 | } 262 | } 263 | } 264 | } 265 | //echo __METHOD__ . ':' . var_export($this->messages, TRUE) . "\n"; 266 | return ($found === 0) ? $this->passedOK(__FUNCTION__) : $found; 267 | } 268 | /** 269 | * Runs all scans key as defined in $this->config (bc_break_scanner.config.php) 270 | * 271 | * @return int $found : number of potential BC breaks found 272 | */ 273 | public function scanFromCallbacks() 274 | { 275 | $found = 0; 276 | $list = array_keys($this->config[self::KEY_CALLBACK]); 277 | foreach ($list as $key) { 278 | $config = $this->config[self::KEY_CALLBACK][$key] ?? NULL; 279 | if (empty($config['callback']) || !is_callable($config['callback'])) { 280 | $message = sprintf(self::ERR_INVALID_KEY, self::KEY_CALLBACK . ' => ' . $key . 
' => callback'); 281 | throw new InvalidArgumentException($message); 282 | } 283 | if ($config['callback']($this->contents)) { 284 | $this->messages[] = $config['msg']; 285 | $found++; 286 | } 287 | } 288 | return $found; 289 | } 290 | 291 | /** 292 | * Get homozygous SNPs for a given chromosome. 293 | * 294 | * @param string $chromosome The chromosome to get homozygous SNPs for 295 | * @return mixed The result of the homozygous() method 296 | * @deprecated Use the homozygous() method instead 297 | */ 298 | public function homozygous_snps(string $chromosome = '') 299 | { 300 | trigger_error( 301 | 'This method has been renamed to `homozygous`.', 302 | E_USER_DEPRECATED 303 | ); 304 | 305 | return $this->homozygous($chromosome); 306 | } 307 | 308 | /** 309 | * Check if the object is valid. 310 | * 311 | * @return bool The value of the "valid" property 312 | * @deprecated Use the "valid" property instead 313 | */ 314 | public function is_valid(): bool 315 | { 316 | trigger_error( 317 | 'This method has been renamed to `valid` and is now a property.', 318 | E_USER_DEPRECATED 319 | ); 320 | 321 | return $this->valid; 322 | } 323 | 324 | /** 325 | * Predict ancestry using the ezancestry package. 
326 | * 327 | * @param string|null $outputDirectory The output directory for predictions 328 | * @param bool $writePredictions Whether to write the predictions to files 329 | * @param string|null $modelsDirectory The directory containing the models 330 | * @param string|null $aisnpsDirectory The directory containing the AIsnps 331 | * @param int|null $nComponents The number of components for the model 332 | * @param int|null $k The number of nearest neighbors to use 333 | * @param string|null $thousandGenomesDirectory The directory containing the 1000 Genomes data 334 | * @param string|null $samplesDirectory The directory containing the samples 335 | * @param string|null $algorithm The algorithm to use for prediction 336 | * @param string|null $aisnpsSet The set of AIsnps to use 337 | * @return array The predicted ancestry values 338 | * @throws Exception If the ezancestry package is not installed 339 | */ 340 | public function predict_ancestry( 341 | ?string $outputDirectory = null, 342 | bool $writePredictions = false, 343 | ?string $modelsDirectory = null, 344 | ?string $aisnpsDirectory = null, 345 | ?int $nComponents = null, 346 | ?int $k = null, 347 | ?string $thousandGenomesDirectory = null, 348 | ?string $samplesDirectory = null, 349 | ?string $algorithm = null, 350 | ?string $aisnpsSet = null 351 | ): array { 352 | return $this->getPredictions( 353 | $outputDirectory, 354 | $writePredictions, 355 | $modelsDirectory, 356 | $aisnpsDirectory, 357 | $nComponents, 358 | $k, 359 | $thousandGenomesDirectory, 360 | $samplesDirectory, 361 | $algorithm, 362 | $aisnpsSet 363 | ); 364 | } 365 | 366 | /** 367 | * Get ancestry predictions using the ezancestry package. 
368 | * 369 | * @param string|null $outputDirectory The output directory for predictions 370 | * @param bool $writePredictions Whether to write the predictions to files 371 | * @param string|null $modelsDirectory The directory containing the models 372 | * @param string|null $aisnpsDirectory The directory containing the AIsnps 373 | * @param int|null $nComponents The number of components for the model 374 | * @param int|null $k The number of nearest neighbors to use 375 | * @param string|null $thousandGenomesDirectory The directory containing the 1000 Genomes data 376 | * @param string|null $samplesDirectory The directory containing the samples 377 | * @param string|null $algorithm The algorithm to use for prediction 378 | * @param string|null $aisnpsSet The set of AIsnps to use 379 | * @return array The predicted ancestry values 380 | * @throws Exception If the ezancestry package is not installed or the object is not valid 381 | */ 382 | public function getPredictions( 383 | ?string $outputDirectory = null, 384 | bool $writePredictions = false, 385 | ?string $modelsDirectory = null, 386 | ?string $aisnpsDirectory = null, 387 | ?int $nComponents = null, 388 | ?int $k = null, 389 | ?string $thousandGenomesDirectory = null, 390 | ?string $samplesDirectory = null, 391 | ?string $algorithm = null, 392 | ?string $aisnpsSet = null 393 | ): array { 394 | if (!$this->valid) { 395 | return []; 396 | } 397 | 398 | if (!class_exists('ezancestry\commands\Predict')) { 399 | throw new Exception( 400 | 'Ancestry prediction requires the ezancestry package; please install it' 401 | ); 402 | } 403 | 404 | $predict = new ezancestry\commands\Predict(); 405 | 406 | $predictions = $predict->predict( 407 | $this->snps, 408 | $outputDirectory, 409 | $writePredictions, 410 | $modelsDirectory, 411 | $aisnpsDirectory, 412 | $nComponents, 413 | $k, 414 | $thousandGenomesDirectory, 415 | $samplesDirectory, 416 | $algorithm, 417 | $aisnpsSet 418 | ); 419 | 420 | $maxPopValues = 
$this->maxPop($predictions[0]); 421 | $maxPopValues['ezancestry_df'] = $predictions; 422 | 423 | return $maxPopValues; 424 | } 425 | 426 | /** 427 | * Get the maximum population values from a prediction row. 428 | * 429 | * @param array $row The prediction row 430 | * @return array The maximum population values 431 | */ 432 | private function maxPop(array $row): array 433 | { 434 | $populationCode = $row['predicted_population_population']; 435 | $populationDescription = $row['population_description']; 436 | $populationPercent = $row[$populationCode]; 437 | $superpopulationCode = $row['predicted_population_superpopulation']; 438 | $superpopulationDescription = $row['superpopulation_name']; 439 | $superpopulationPercent = $row[$superpopulationCode]; 440 | 441 | return [ 442 | 'population_code' => $populationCode, 443 | 'population_description' => $populationDescription, 444 | '_percent' => $populationPercent, 445 | 'superpopulation_code' => $superpopulationCode, 446 | 'superpopulation_description' => $superpopulationDescription, 447 | 'population_percent' => $superpopulationPercent, 448 | ]; 449 | } 450 | 451 | /** 452 | * Compute cluster overlap based on a given threshold. 
453 | * 454 | * @param float $clusterOverlapThreshold The threshold for cluster overlap 455 | * @return DataFrame The computed cluster overlap DataFrame 456 | */ 457 | public function computeClusterOverlap(float $clusterOverlapThreshold = 0.95): DataFrame 458 | { 459 | $data = [ 460 | 'cluster_id' => ['c1', 'c3', 'c4', 'c5', 'v5'], 461 | 'company_composition' => [ 462 | '23andMe-v4', 463 | 'AncestryDNA-v1, FTDNA, MyHeritage', 464 | '23andMe-v3', 465 | 'AncestryDNA-v2', 466 | '23andMe-v5, LivingDNA', 467 | ], 468 | 'chip_base_deduced' => [ 469 | 'HTS iSelect HD', 470 | 'OmniExpress', 471 | 'OmniExpress plus', 472 | 'OmniExpress plus', 473 | 'Illumina GSAs', 474 | ], 475 | 'snps_in_cluster' => array_fill(0, 5, 0), 476 | 'snps_in_common' => array_fill(0, 5, 0), 477 | ]; 478 | 479 | $df = new DataFrame($data); 480 | $df->setIndex('cluster_id'); 481 | 482 | $toRemap = null; 483 | 484 | if ($this->build !== 37) { 485 | $toRemap = clone $this; 486 | $toRemap->remap(37); 487 | $selfSnps = $toRemap->snps()->select(['chrom', 'pos'])->dropDuplicates(); 488 | } else { 489 | $selfSnps = $this->snps()->select(['chrom', 'pos'])->dropDuplicates(); 490 | } 491 | 492 | $chipClusters = $this->resources->getChipClusters(); 493 | 494 | foreach ($df->indexValues() as $cluster) { 495 | $clusterSnps = $chipClusters->filter( 496 | function ($row) use ($cluster) { 497 | return strpos($row['clusters'], $cluster) !== false; 498 | } 499 | )->select(['chrom', 'pos']); 500 | 501 | $df->loc[$cluster]['snps_in_cluster'] = count($clusterSnps); 502 | $df->loc[$cluster]['snps_in_common'] = count($selfSnps->merge($clusterSnps, 'inner')); 503 | 504 | $df['overlap_with_cluster'] = $df['snps_in_common'] / $df['snps_in_cluster']; 505 | $df['overlap_with_self'] = $df['snps_in_common'] / count($selfSnps); 506 | 507 | $maxOverlap = array_keys($df['overlap_with_cluster'], max($df['overlap_with_cluster']))[0]; 508 | 509 | if ( 510 | $df['overlap_with_cluster'][$maxOverlap] > $clusterOverlapThreshold 511 | && 
$df['overlap_with_self'][$maxOverlap] > $clusterOverlapThreshold 512 | ) { 513 | $this->cluster = $maxOverlap; 514 | $this->chip = $df['chip_base_deduced'][$maxOverlap]; 515 | 516 | $companyComposition = $df['company_composition'][$maxOverlap]; 517 | 518 | if (strpos($companyComposition, $this->source) !== false) { 519 | if ($this->source === '23andMe' || $this->source === 'AncestryDNA') { 520 | $i = strpos($companyComposition, 'v'); 521 | $this->chip_version = substr($companyComposition, $i, $i + 2); 522 | } 523 | } else { 524 | // Log a warning about the SNPs data source not found 525 | } 526 | } 527 | } 528 | 529 | return $df; 530 | } 531 | } --------------------------------------------------------------------------------