├── tests
├── input
│ ├── empty.txt
│ ├── duplicate_rsids.csv
│ ├── ancestry_mt.txt
│ ├── chromosomes.csv
│ ├── generic_no_header.tsv
│ ├── GRCh37.csv
│ ├── GRCh38.csv
│ ├── NCBI36.csv
│ ├── GRCh37_PAR.csv
│ ├── generic.fa
│ ├── generic.csv
│ ├── generic.tsv
│ ├── tellmeGen.txt
│ ├── generic_header_comment.tsv
│ ├── generic_multi_rsid.tsv
│ ├── generic_non_standard_columns.tsv
│ ├── generic_extra_column.tsv
│ ├── ftdna.csv
│ ├── codigo46.txt
│ ├── ftdna_famfinder.csv
│ ├── circledna.txt
│ ├── DNALand.txt
│ ├── genesforgood.txt
│ ├── myheritage.csv
│ ├── livingdna.csv
│ ├── sano.txt
│ ├── 23andme_allele.txt
│ ├── 23andme.txt
│ ├── ancestry.txt
│ ├── ancestry_multi_sep.txt
│ ├── 23andme_win.txt
│ ├── myheritage_extra_quotes.csv
│ ├── unannotated_testvcf.vcf
│ ├── testvcf.vcf
│ ├── testvcf_phased.vcf
│ ├── testvcf_chr_prefix.vcf
│ ├── testvcf_multi_sample.vcf
│ ├── mapmygenome_alt_header.txt
│ ├── mapmygenome.txt
│ └── discrepant_snps.csv
├── resources
│ ├── dbsnp_151_37_reverse.txt
│ ├── gsa_rsid_map.txt
│ └── gsa_chrpos_map.txt
├── IndividualTest.php
├── SNPsTest.php
└── Snps
│ ├── IO
│ ├── ReaderTest.php
│ └── WriterTes.php
│ ├── SnpsMergeTest.php
│ └── SnpsTest.php
├── .github
├── FUNDING.yml
└── ISSUE_TEMPLATE
│ └── sweep-template.yml
├── .gitignore
├── .vscode
└── settings.json
├── index.php
├── resources
└── fasta
│ └── GRCh37
│ └── Homo_sapiens.GRCh37.dna.chromosome.MT.fa.gz
├── rector.php
├── src
├── Snps
│ ├── IO
│ │ ├── IO.php
│ │ ├── ExtraTabsFilter.php
│ │ ├── AdditionalFile.php
│ │ ├── SnpFileReader.php
│ │ ├── PythonDependency.php
│ │ ├── DataParser.php
│ │ ├── CsvReader.php
│ │ ├── PhpDataFrame.php
│ │ └── Writer.php
│ ├── Singleton.php
│ ├── Analysis
│ │ ├── BuildDetector.php
│ │ └── ClusterOverlapCalculator.php
│ ├── SNPData.php
│ ├── Resources.php
│ ├── SortTest.php
│ ├── ReferenceSequence.php
│ ├── SNPAnalyzer.php
│ ├── Ensembl.php
│ ├── ReferenceSequenceManager.php
│ ├── DatasetDownloader.php
│ ├── DocBlockChecker.php
│ ├── AssemblyMappingManager.php
│ ├── Utils.php
│ ├── EnsemblRestClient.php
│ ├── PythonDependency.php
│ └── VariedicInherit.php
├── Utils
│ └── ColorSchemeGenerator.php
├── KitLoader.php
├── Helpers
│ └── CSVGenerator.php
├── Dna.php
├── Individual.php
├── Triangulation.php
├── Visualization.php
├── MatchKits.php
└── Resources.php
├── phpunit.xml
├── .travis.yml
├── CONTRIBUTE.md
├── composer.json
├── LICENSE
├── sweep.yaml
├── README.md
└── phpconvcount.py
/tests/input/empty.txt:
--------------------------------------------------------------------------------
1 |
--------------------------------------------------------------------------------
/.github/FUNDING.yml:
--------------------------------------------------------------------------------
1 | github: liberu-genealogy
2 |
--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
1 | vendor/*
2 | .idea/*
3 | tmp/*
4 | .phpunit.result.cache
--------------------------------------------------------------------------------
/.vscode/settings.json:
--------------------------------------------------------------------------------
1 | {
2 | "cmake.configureOnOpen": false
3 | }
--------------------------------------------------------------------------------
/index.php:
--------------------------------------------------------------------------------
1 | generic test sequence:1:1:117
2 | NNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNN
3 | NNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNAGGCCGGACNNNNNNNN
4 |
--------------------------------------------------------------------------------
/resources/fasta/GRCh37/Homo_sapiens.GRCh37.dna.chromosome.MT.fa.gz:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/liberu-genealogy/php-dna/HEAD/resources/fasta/GRCh37/Homo_sapiens.GRCh37.dna.chromosome.MT.fa.gz
--------------------------------------------------------------------------------
/tests/input/generic.csv:
--------------------------------------------------------------------------------
1 | rsid,chromosome,position,genotype
2 | rs1,1,101,AA
3 | rs2,1,102,CC
4 | rs3,1,103,GG
5 | rs4,1,104,TT
6 | rs5,1,105,--
7 | rs6,1,106,GC
8 | rs7,1,107,TC
9 | rs8,1,108,AT
10 |
--------------------------------------------------------------------------------
/tests/input/generic.tsv:
--------------------------------------------------------------------------------
1 | rsid chromosome position genotype
2 | rs1 1 101 AA
3 | rs2 1 102 CC
4 | rs3 1 103 GG
5 | rs4 1 104 TT
6 | rs5 1 105 --
7 | rs6 1 106 GC
8 | rs7 1 107 TC
9 | rs8 1 108 AT
10 |
--------------------------------------------------------------------------------
/tests/input/tellmeGen.txt:
--------------------------------------------------------------------------------
1 | rsid Chromosome position genotype
2 | 1:101 1 101 AA
3 | 1:102 1 102 CC
4 | 1:103 1 103 GG
5 | 1:104 1 104 TT
6 | 1:105 1 105 --
7 | 1:106 1 106 GC
8 | 1:107 1 107 TC
9 | 1:108 1 108 AT
--------------------------------------------------------------------------------
/tests/input/generic_header_comment.tsv:
--------------------------------------------------------------------------------
1 | # rsid chromosome position genotype
2 | rs1 1 101 AA
3 | rs2 1 102 CC
4 | rs3 1 103 GG
5 | rs4 1 104 TT
6 | rs5 1 105 --
7 | rs6 1 106 GC
8 | rs7 1 107 TC
9 | rs8 1 108 AT
10 |
--------------------------------------------------------------------------------
/tests/input/generic_multi_rsid.tsv:
--------------------------------------------------------------------------------
1 | rsid chromosome position genotype
2 | rs1 1 101 AA
3 | rs2 1 102 CC
4 | rs3 1 103 GG
5 | rs4 1 104 TT
6 | rs5 1 105 --
7 | rs6,rs9 1 106 GC
8 | rs7 1 107 TC
9 | rs8 1 108 AT
10 |
--------------------------------------------------------------------------------
/tests/input/generic_non_standard_columns.tsv:
--------------------------------------------------------------------------------
1 | rsid chromosome position genotype_other
2 | rs1 1 101 AA
3 | rs2 1 102 CC
4 | rs3 1 103 GG
5 | rs4 1 104 TT
6 | rs5 1 105 --
7 | rs6 1 106 GC
8 | rs7 1 107 TC
9 | rs8 1 108 AT
10 |
--------------------------------------------------------------------------------
/tests/input/generic_extra_column.tsv:
--------------------------------------------------------------------------------
1 | rsid chromosome position genotype extra
2 | rs1 1 101 AA 1
3 | rs2 1 102 CC 1
4 | rs3 1 103 GG 2
5 | rs4 1 104 TT 2
6 | rs5 1 105 -- 2
7 | rs6 1 106 GC 2
8 | rs7 1 107 TC 2
9 | rs8 1 108 AT 2
10 |
--------------------------------------------------------------------------------
/tests/resources/gsa_chrpos_map.txt:
--------------------------------------------------------------------------------
1 | Name Chr MapInfo deCODE(cM)
2 | 1:101 1 101 0.0000
3 | 1:102 1 102 0.0000
4 | 1:103 1 103 0.0000
5 | 1:104 1 104 0.0000
6 | 1:105 1 105 0.0000
7 | rs6 1 106 0.0000
8 | rs7 1 107 0.0000
9 | rs8 1 108 0.0000
10 |
--------------------------------------------------------------------------------
/tests/input/ftdna.csv:
--------------------------------------------------------------------------------
1 | RSID,CHROMOSOME,POSITION,RESULT
2 | "rs1","1","101","AA"
3 | "rs2","1","102","CC"
4 | "rs3","1","103","GG"
5 | "rs4","1","104","TT"
6 | "rs5","1","105","--"
7 | "rs6","1","106","GC"
8 | "rs7","1","107","TC"
9 | "rs8","1","108","AT"
10 |
--------------------------------------------------------------------------------
/tests/input/codigo46.txt:
--------------------------------------------------------------------------------
1 | [Header]
2 | Content CODIGO46.bpm
3 | [Data]
4 | Sample Name SNP Name Allele1 - Plus Allele2 - Plus
5 | 123 1:101 A A
6 | 123 1:102 C C
7 | 123 1:103 G G
8 | 123 1:104 T T
9 | 123 1:105 - -
10 | 123 rs6 G C
11 | 123 rs7 T C
12 | 123 rs8 A T
--------------------------------------------------------------------------------
/rector.php:
--------------------------------------------------------------------------------
1 | sets([LevelSetList::UP_TO_PHP_82]);
10 | };
11 |
--------------------------------------------------------------------------------
/tests/input/ftdna_famfinder.csv:
--------------------------------------------------------------------------------
1 | # famfinder, https://www.familytreedna.com
2 | #
3 | # name,chromosome,position,allele1,allele2
4 | rs1,1,101,A,A
5 | rs2,1,102,C,C
6 | rs3,1,103,G,G
7 | rs4,1,104,T,T
8 | rs5,1,105,-,-
9 | rs6,1,106,G,C
10 | rs7,1,107,T,C
11 | rs8,1,108,A,T
12 |
--------------------------------------------------------------------------------
/tests/input/circledna.txt:
--------------------------------------------------------------------------------
1 | # Circle
2 | #
3 | #
4 | # MARKERNAME CHROM POS GT
5 | chr1:1:A chr1 1 A/A
6 | rs1 chr1 101 A/A
7 | rs2 chr1 102 C/C
8 | rs3 chr1 103 G/G
9 | rs4 chr1 104 T/T
10 | rs6 chr1 106 G/C
11 | rs7 chr1 107 T/C
12 | rs8 chr1 108 A/T
13 | chr1:1001:A chr1 1001 A/A
14 |
--------------------------------------------------------------------------------
/tests/input/DNALand.txt:
--------------------------------------------------------------------------------
1 | # DNA.Land
2 | #
3 | #
4 | #
5 | #
6 | #
7 | #
8 | #
9 | #
10 | #
11 | #
12 | #
13 | #
14 | #
15 | # rsid chromosome position genotype
16 | rs1 1 101 AA
17 | rs2 1 102 CC
18 | rs3 1 103 GG
19 | rs4 1 104 TT
20 | rs5 1 105 --
21 | rs6 1 106 GC
22 | rs7 1 107 TC
23 | rs8 1 108 AT
--------------------------------------------------------------------------------
/tests/input/genesforgood.txt:
--------------------------------------------------------------------------------
1 | # Genes for Good
2 | #
3 | #
4 | #
5 | #
6 | #
7 | #
8 | #
9 | #
10 | # rsid chromosome position genotype
11 | rs1 1 101 AA
12 | rs2 1 102 CC
13 | rs3 1 103 GG
14 | rs4 1 104 TT
15 | rs5 1 105 --
16 | rs6 1 106 GC
17 | rs7 1 107 TC
18 | rs8 1 108 AT
19 |
--------------------------------------------------------------------------------
/tests/input/myheritage.csv:
--------------------------------------------------------------------------------
1 | # MyHeritage, https://www.myheritage.com
2 | RSID,CHROMOSOME,POSITION,RESULT
3 | "rs1","1","101","AA"
4 | "rs2","1","102","CC"
5 | "rs3","1","103","GG"
6 | "rs4","1","104","TT"
7 | "rs5","1","105","--"
8 | "rs6","1","106","GC"
9 | "rs7","1","107","TC"
10 | "rs8","1","108","AT"
11 |
--------------------------------------------------------------------------------
/tests/input/livingdna.csv:
--------------------------------------------------------------------------------
1 | # Living DNA
2 | #
3 | #
4 | #
5 | #
6 | #
7 | #
8 | #
9 | #
10 | #
11 | #
12 | #
13 | #
14 | #
15 | # rsid chromosome position genotype
16 | rs1 1 101 AA
17 | rs2 1 102 CC
18 | rs3 1 103 GG
19 | rs4 1 104 TT
20 | rs5 1 105 --
21 | rs6 1 106 GC
22 | rs7 1 107 TC
23 | rs8 1 108 AT
24 |
--------------------------------------------------------------------------------
/tests/input/sano.txt:
--------------------------------------------------------------------------------
1 | [Header]
2 | Content SANO
3 | [Data]
4 | Sample Name SNP Name Chr Position Allele1 - Forward Allele2 - Forward
5 | 123 1:101 1 101 A A
6 | 123 1:102 1 102 C C
7 | 123 1:103 1 103 G G
8 | 123 1:104 1 104 T T
9 | 123 1:105 1 105 - -
10 | 123 rs6 1 106 G C
11 | 123 rs7 1 107 A G
12 | 123 rs8 1 108 T A
--------------------------------------------------------------------------------
/tests/input/23andme_allele.txt:
--------------------------------------------------------------------------------
1 | # 23andMe
2 | #
3 | #
4 | #
5 | #
6 | #
7 | #
8 | #
9 | #
10 | #
11 | #
12 | #
13 | #
14 | #
15 | # rsid chromosome position allele1 allele2
16 | rs1 1 101 A A
17 | rs2 1 102 C C
18 | rs3 1 103 G G
19 | rs4 1 104 T T
20 | rs5 1 105 - -
21 | rs6 1 106 G C
22 | rs7 1 107 T C
23 | rs8 1 108 A T
24 |
--------------------------------------------------------------------------------
/tests/input/23andme.txt:
--------------------------------------------------------------------------------
1 | # 23andMe
2 | #
3 | #
4 | #
5 | #
6 | #
7 | #
8 | #
9 | #
10 | #
11 | #
12 | #
13 | #
14 | #
15 | # rsid chromosome position genotype
16 | rs1 1 101 AA
17 | rs2 1 102 CC
18 | rs3 1 103 GG
19 | rs4 1 104 TT
20 | rs5 1 105 --
21 | rs6 1 106 GC
22 | rs7 1 107 TC
23 | rs8 1 108 AT
24 | rs9 -- 109 AT
25 | rs10 -- -- AT
26 |
--------------------------------------------------------------------------------
/tests/input/ancestry.txt:
--------------------------------------------------------------------------------
1 | #AncestryDNA
2 | #
3 | #
4 | #
5 | #
6 | #
7 | #
8 | #
9 | #
10 | #
11 | #
12 | #
13 | #
14 | #
15 | #
16 | #
17 | #
18 | #
19 | rsid chromosome position allele1 allele2
20 | rs1 1 101 A A
21 | rs2 1 102 C C
22 | rs3 1 103 G G
23 | rs4 1 104 T T
24 | rs5 1 105 0 0
25 | rs6 1 106 G C
26 | rs7 1 107 T C
27 | rs8 1 108 A T
28 |
--------------------------------------------------------------------------------
/src/Snps/IO/IO.php:
--------------------------------------------------------------------------------
1 | null, "chrom" => null, "pos" => null, "genotype" => null);
10 | // $df = array();
11 | // $df[] = $columns;
12 | return [];
13 | }
14 | }
15 |
--------------------------------------------------------------------------------
/tests/input/ancestry_multi_sep.txt:
--------------------------------------------------------------------------------
1 | #AncestryDNA
2 | #
3 | #
4 | #
5 | #
6 | #
7 | #
8 | #
9 | #
10 | #
11 | #
12 | #
13 | #
14 | #
15 | #
16 | #
17 | #
18 | #
19 | rsid chromosome position allele1 allele2
20 | rs1 1 101 A A
21 | rs2 1 102 C C
22 | rs3 1 103 G G
23 | rs4 1 104 T T
24 | rs5 1 105 0 0
25 | rs6 1 106 G C
26 | rs7 1 107 T C
27 | rs8 1 108 A T
28 |
--------------------------------------------------------------------------------
/tests/input/23andme_win.txt:
--------------------------------------------------------------------------------
1 | # 23andMe
2 | #
3 | #
4 | #
5 | #
6 | #
7 | #
8 | #
9 | #
10 | #
11 | #
12 | #
13 | #
14 | #
15 | # rsid chromosome position genotype
16 | rs1 1 101 AA
17 | rs2 1 102 CC
18 | rs3 1 103 GG
19 | rs4 1 104 TT
20 | rs5 1 105 --
21 | rs6 1 106 GC
22 | rs7 1 107 TC
23 | rs8 1 108 AT
24 | rs9 -- 109 AT
25 | rs10 -- -- AT
26 |
--------------------------------------------------------------------------------
/tests/input/myheritage_extra_quotes.csv:
--------------------------------------------------------------------------------
1 | # MyHeritage, https://www.myheritage.com
2 | RSID,CHROMOSOME,POSITION,RESULT
3 | "rs1"",""1"",""101"",""AA"
4 | "rs2"",""1"",""102"",""CC"
5 | "rs3"",""1"",""103"",""GG"
6 | "rs4"",""1"",""104"",""TT"
7 | "rs5"",""1"",""105"",""--"
8 | "rs6"",""1"",""106"",""GC"
9 | "rs7"",""1"",""107"",""TC"
10 | "rs8"",""1"",""108"",""AT"
11 |
--------------------------------------------------------------------------------
/phpunit.xml:
--------------------------------------------------------------------------------
1 |
4 |
5 |
6 |
7 | ./tests
8 |
9 |
10 |
11 |
12 |
--------------------------------------------------------------------------------
/.travis.yml:
--------------------------------------------------------------------------------
1 | language: php
2 |
3 | php:
4 | - 5.6
5 | - 5.5
6 | - 5.4
7 | - 7
8 | - hhvm
9 | - hhvm-nightly
10 |
11 | matrix:
12 | fast_finish: true
13 | allow_failures:
14 | - php: 7
15 | - php: hhvm
16 | - php: hhvm-nightly
17 |
18 | before_script:
19 | - composer install --prefer-dist --dev
20 |
21 | script:
22 | - vendor/bin/phpunit -c tests/phpunit.xml tests/
23 | - vendor/bin/phpcs --standard=PSR2 -n library/ tests/
24 |
--------------------------------------------------------------------------------
/src/Utils/ColorSchemeGenerator.php:
--------------------------------------------------------------------------------
1 | namespace src\Utils;
2 |
/**
 * Produces lists of CSS `hsl()` colour strings with evenly spaced hues.
 */
class ColorSchemeGenerator {
    /**
     * Build `$numColors` colour strings of the form "hsl(<hue>, 100%, 50%)".
     *
     * The hue of entry `$i` is `$i * 360 / $numColors` truncated to a whole
     * number of degrees, so the hues start at 0 and stay below 360.
     *
     * @param int $numColors Number of colours to generate (not type-enforced by the signature).
     *
     * @return string[] The generated colour strings; an empty array when `$numColors` is below 1.
     */
    public static function generate($numColors) {
        if ($numColors < 1) {
            return [];
        }

        $colors = [];
        for ($i = 0; $i < $numColors; $i++) {
            // Truncate explicitly before applying `%`: feeding a float with a
            // fractional part to the modulo operator is deprecated since PHP 8.1.
            $hue = ((int) floor($i * 360 / $numColors)) % 360;
            $colors[] = "hsl(" . $hue . ", 100%, 50%)";
        }

        return $colors;
    }
}
16 |
--------------------------------------------------------------------------------
/tests/input/unannotated_testvcf.vcf:
--------------------------------------------------------------------------------
1 | ##fileformat=VCFv4.1
2 | ##fileDate=20190527
3 | ##
4 | ##
5 | ##FORMAT=
6 | #CHROM POS ID REF ALT QUAL FILTER INFO FORMAT SAMPLEID
7 | 1 101 . A G . . . GT 0/0
8 | 1 102 . G C . . . GT 1/1
9 | 1 103 . G T . . . GT 0/0
10 | 1 104 . C T . . . GT 1/1
11 | 1 105 . C . . . . GT ./.
12 | 1 106 . G C . . . GT 0/1
13 | 1 107 . G T,C . . . GT 1/2
14 | 1 108 . A T . . . GT 0/1
15 | 1 109 . C T . . . GT 0/1
16 |
--------------------------------------------------------------------------------
/src/KitLoader.php:
--------------------------------------------------------------------------------
1 |
6 | #CHROM POS ID REF ALT QUAL FILTER INFO FORMAT SAMPLEID
7 | 1 101 rs1 A . . . . GT 0/0
8 | 1 102 rs2 . C . . . GT 1/1
9 | 1 103 rs3 G T . . . GT 0|0
10 | 1 104 rs4 C T . . . GT 1/1
11 | 1 105 rs5 C . . . . GT ./.
12 | 1 106 rs6 G C . . . GT 0/1
13 | 1 107 rs7 G T,C . . . GT 1/2
14 | 1 108 rs8 A T . . . GT 0/1
15 | 1 109 . C T . . . GT 0/1
16 | 1 110 rs10 A AGC . . . GT 0/1
17 | 1 111 rs11 AGC A . . . GT 0/1
18 | 1 112 rs12 . A . . . GT 0/1
19 | 1 113 rs13 . A . . . GT 1/0
20 | 1 114 rs14 A . . . . GT 0/1
21 | 1 115 rs15 A . . . . GT 1/0
22 | 1 116 rs16 A A . . . GT 0/.
23 | 1 117 rs17 A A . . . GT ./0
24 |
--------------------------------------------------------------------------------
/src/Helpers/CSVGenerator.php:
--------------------------------------------------------------------------------
1 |
6 | #CHROM POS ID REF ALT QUAL FILTER INFO FORMAT SAMPLEID
7 | 1 101 rs1 A . . . . GT 0|0
8 | 1 102 rs2 . C . . . GT 1|1
9 | 1 103 rs3 G T . . . GT 0|0
10 | 1 104 rs4 C T . . . GT 1|1
11 | 1 105 rs5 C . . . . GT ./.
12 | 1 106 rs6 G C . . . GT 0|1
13 | 1 107 rs7 G T,C . . . GT 1|2
14 | 1 108 rs8 A T . . . GT 0|1
15 | 1 109 . C T . . . GT 0|1
16 | 1 110 rs10 A AGC . . . GT 0|1
17 | 1 111 rs11 AGC A . . . GT 0|1
18 | 1 112 rs12 . A . . . GT 0|1
19 | 1 113 rs13 . A . . . GT 1|0
20 | 1 114 rs14 A . . . . GT 0|1
21 | 1 115 rs15 A . . . . GT 1|0
22 | 1 116 rs16 A A . . . GT 0|.
23 | 1 117 rs17 A A . . . GT .|0
24 |
--------------------------------------------------------------------------------
/tests/input/testvcf_chr_prefix.vcf:
--------------------------------------------------------------------------------
1 | ##fileformat=VCFv4.1
2 | ##fileDate=20190527
3 | ##
4 | ##
5 | ##FORMAT=
6 | #CHROM POS ID REF ALT QUAL FILTER INFO FORMAT SAMPLEID
7 | chr1 101 rs1 A . . . . GT 0/0
8 | chr1 102 rs2 . C . . . GT 1/1
9 | chr1 103 rs3 G T . . . GT 0|0
10 | chr1 104 rs4 C T . . . GT 1/1
11 | chr1 105 rs5 C . . . . GT ./.
12 | chr1 106 rs6 G C . . . GT 0/1
13 | chr1 107 rs7 G T,C . . . GT 1/2
14 | chr1 108 rs8 A T . . . GT 0/1
15 | chr1 109 . C T . . . GT 0/1
16 | chr1 110 rs10 A AGC . . . GT 0/1
17 | chr1 111 rs11 AGC A . . . GT 0/1
18 | chr1 112 rs12 . A . . . GT 0/1
19 | chr1 113 rs13 . A . . . GT 1/0
20 | chr1 114 rs14 A . . . . GT 0/1
21 | chr1 115 rs15 A . . . . GT 1/0
22 | chr1 116 rs16 A A . . . GT 0/.
23 | chr1 117 rs17 A A . . . GT ./0
24 |
--------------------------------------------------------------------------------
/.github/ISSUE_TEMPLATE/sweep-template.yml:
--------------------------------------------------------------------------------
1 | name: Sweep Issue
2 | title: 'Sweep: '
3 | description: For small bugs, features, refactors, and tests to be handled by Sweep, an AI-powered junior developer.
4 | labels: sweep
5 | body:
6 | - type: textarea
7 | id: description
8 | attributes:
9 | label: Details
10 | description: Tell Sweep where and what to edit and provide enough context for a new developer to the codebase
11 | placeholder: |
12 | Unit Tests: Write unit tests for <FILE>. Test each function in the file. Make sure to test edge cases.
13 | Bugs: The bug might be in <FILE>. Here are the logs: ...
14 | Features: the new endpoint should use the ... class from <FILE> because it contains ... logic.
15 | Refactors: We are migrating this function to ... version because ...
--------------------------------------------------------------------------------
/tests/input/testvcf_multi_sample.vcf:
--------------------------------------------------------------------------------
1 | ##fileformat=VCFv4.1
2 | ##fileDate=20190527
3 | ##
4 | ##
5 | ##FORMAT=
6 | #CHROM POS ID REF ALT QUAL FILTER INFO FORMAT SAMPLEID MULTISAMPLE
7 | 1 101 rs1 A . . . . GT 0/0 ./.
8 | 1 102 rs2 . C . . . GT 1/1 ./.
9 | 1 103 rs3 G T . . . GT 0|0 ./.
10 | 1 104 rs4 C T . . . GT 1/1 ./.
11 | 1 105 rs5 C . . . . GT ./. ./.
12 | 1 106 rs6 G C . . . GT 0/1 ./.
13 | 1 107 rs7 G T,C . . . GT 1/2 ./.
14 | 1 108 rs8 A T . . . GT 0/1 ./.
15 | 1 109 . C T . . . GT 0/1 ./.
16 | 1 110 rs10 A AGC . . . GT 0/1 ./.
17 | 1 111 rs11 AGC A . . . GT 0/1 ./.
18 | 1 112 rs12 . A . . . GT 0/1 ./.
19 | 1 113 rs13 . A . . . GT 1/0 ./.
20 | 1 114 rs14 A . . . . GT 0/1 ./.
21 | 1 115 rs15 A . . . . GT 1/0 ./.
22 | 1 116 rs16 A A . . . GT 0/. ./.
23 | 1 117 rs17 A A . . . GT ./0 ./.
24 |
--------------------------------------------------------------------------------
/src/Snps/IO/ExtraTabsFilter.php:
--------------------------------------------------------------------------------
1 | data = preg_replace('/\t+/', "\t", $bucket->data);
19 |
20 | // Remove trailing tabs at end of lines
21 | $bucket->data = preg_replace('/\t+\n/', "\n", $bucket->data);
22 | $bucket->data = preg_replace('/\t+\r\n/', "\r\n", $bucket->data);
23 |
24 | $consumed += $bucket->datalen;
25 | stream_bucket_append($out, $bucket);
26 | }
27 |
28 | return PSFS_PASS_ON;
29 | }
30 | }
--------------------------------------------------------------------------------
/CONTRIBUTE.md:
--------------------------------------------------------------------------------
1 | # Contributing
2 |
3 | Contributions are **welcome** and will be fully **credited**. We accept contributions via Pull Requests on [GitHub](https://github.com/familytree365/php-dna).
4 |
5 | ## Pull Requests
6 |
7 | - **PSR-4 Coding Standard.** The easiest way to apply the conventions is to install [PHP CS Fixer](https://github.com/FriendsOfPHP/PHP-CS-Fixer).
8 | - **Document any change in behaviour.** Make sure the `README.md` and any other relevant documentation are kept up-to-date.
9 | - **Create feature branches.** Don't ask us to pull from your master branch.
10 | - **One pull request per feature.** If you want to do more than one thing, send multiple pull requests.
11 | - **Send coherent history.** Make sure each individual commit in your pull request is meaningful. If you had to make multiple intermediate commits while developing, please [squash them](http://www.git-scm.com/book/en/v2/Git-Tools-Rewriting-History#Changing-Multiple-Commit-Messages) before submitting.
--------------------------------------------------------------------------------
/composer.json:
--------------------------------------------------------------------------------
1 | {
2 | "name": "liberu-genealogy/php-dna",
3 | "description": "DNA analysis toolkit for PHP 8.4+",
4 | "type": "library",
5 | "keywords": ["dna", "genotype", "genealogy", "bioinformatics"],
6 | "homepage": "http://github.com/liberu-genealogy/php-dna",
7 | "license": "MIT",
8 | "require": {
9 | "php": "^8.4",
10 | "league/csv": "^9.0",
11 | "guzzlehttp/guzzle": "^7.8",
12 | "symfony/http-client": "^7.0",
13 | "ext-json": "*",
14 | "ext-zip": "*"
15 | },
16 | "require-dev": {
17 | "phpunit/phpunit": "^11.0",
18 | "squizlabs/php_codesniffer": "^3.8",
19 | "rector/rector": "^1.0",
20 | "phpstan/phpstan": "^1.10"
21 | },
22 | "autoload": {
23 | "psr-4": {
24 | "Dna\\": "src/"
25 | }
26 | },
27 | "autoload-dev": {
28 | "psr-4": {
29 | "DnaTest\\": "tests/"
30 | }
31 | },
32 | "config": {
33 | "sort-packages": true
34 | }
35 | }
36 |
--------------------------------------------------------------------------------
/src/Snps/Singleton.php:
--------------------------------------------------------------------------------
1 | parseFromText($fileContent);
17 | }
18 |
/**
 * Parse tab-separated SNP text into an SNPs collection.
 *
 * The text is split into lines on "\n". Empty lines, lines whose first
 * character is '#', and lines with fewer than three tab-separated fields are
 * skipped. For every remaining line the first three fields are handed to
 * SNPs::addSNP() as chromosome, position and genotype, in that order.
 *
 * A trailing "\r" is removed from each line, so input with Windows (CRLF)
 * line endings does not leave a carriage return on the last field.
 *
 * @param string $rawData Raw file contents.
 *
 * @return SNPs Collection populated with one entry per accepted line.
 */
public function parseFromText(string $rawData): SNPs
{
    $snps = new SNPs();

    foreach (explode("\n", $rawData) as $line) {
        // Strip the "\r" half of a CRLF ending before inspecting the line.
        $line = rtrim($line, "\r");

        if ($line === '' || $line[0] === '#') {
            continue;
        }

        $parts = explode("\t", $line);
        if (count($parts) < 3) {
            continue;
        }

        // Only the first three fields are used; extra fields are ignored.
        [$chromosome, $position, $genotype] = $parts;
        $snps->addSNP($chromosome, $position, $genotype);
    }

    return $snps;
}
36 | }
37 |
--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
1 | MIT License
2 |
3 | Copyright (c) 2021 Family Tree 365
4 |
5 | Permission is hereby granted, free of charge, to any person obtaining a copy
6 | of this software and associated documentation files (the "Software"), to deal
7 | in the Software without restriction, including without limitation the rights
8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9 | copies of the Software, and to permit persons to whom the Software is
10 | furnished to do so, subject to the following conditions:
11 |
12 | The above copyright notice and this permission notice shall be included in all
13 | copies or substantial portions of the Software.
14 |
15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21 | SOFTWARE.
22 |
--------------------------------------------------------------------------------
/tests/input/mapmygenome_alt_header.txt:
--------------------------------------------------------------------------------
1 | SNP.Name Sample.ID Allele1...Top Allele2...Top GC.Score Sample.Name Sample.Group Sample.Index SNP.Index SNP.Aux Allele1...Forward Allele2...Forward Allele1...Design Allele2...Design Allele1...AB Allele2...AB Allele1...Plus Allele2...Plus Chr Position GT.Score Cluster.Sep SNP ILMN.Strand Customer.Strand Top.Genomic.Sequence Plus.Minus.Strand Theta R X Y X.Raw Y.Raw B.Allele.Freq Log.R.Ratio CNV.Value CNV.Confidence
2 | rs1 0 A A 0 NA NA 0 0 0 A A A A A A A A 1 101 0 1 [A/A] TOP TOP NA + 0 0 0 0 0 0 0 0 NA NA
3 | rs2 0 C C 0 NA NA 0 0 0 A A A A A A C C 1 102 0 1 [A/A] TOP TOP NA + 0 0 0 0 0 0 0 0 NA NA
4 | rs3 0 G G 0 NA NA 0 0 0 A A A A A A G G 1 103 0 1 [A/A] TOP TOP NA + 0 0 0 0 0 0 0 0 NA NA
5 | rs4 0 T T 0 NA NA 0 0 0 A A A A A A T T 1 104 0 1 [A/A] TOP TOP NA + 0 0 0 0 0 0 0 0 NA NA
6 | rs5 0 -- -- 0 NA NA 0 0 0 A A A A A A -- -- 1 105 0 1 [A/A] TOP TOP NA + 0 0 0 0 0 0 0 0 NA NA
7 | rs6 0 G C 0 NA NA 0 0 0 A A A A A A G C 1 106 0 1 [A/A] TOP TOP NA + 0 0 0 0 0 0 0 0 NA NA
8 | rs7 0 T C 0 NA NA 0 0 0 A A A A A A T C 1 107 0 1 [A/A] TOP TOP NA + 0 0 0 0 0 0 0 0 NA NA
9 | rs8 0 A T 0 NA NA 0 0 0 A A A A A A A T 1 108 0 1 [A/A] TOP TOP NA + 0 0 0 0 0 0 0 0 NA NA
10 |
--------------------------------------------------------------------------------
/tests/input/mapmygenome.txt:
--------------------------------------------------------------------------------
1 | SNP Name rsID Sample.ID Allele1...Top Allele2...Top GC.Score Sample.Name Sample.Group Sample.Index SNP.Index SNP.Aux Allele1...Forward Allele2...Forward Allele1...Design Allele2...Design Allele1...AB Allele2...AB Allele1...Plus Allele2...Plus Chr Position GT.Score Cluster.Sep SNP ILMN.Strand Customer.Strand Top.Genomic.Sequence Plus.Minus.Strand Theta R X Y X.Raw Y.Raw B.Allele.Freq Log.R.Ratio CNV.Value CNV.Confidence
2 | rs1 rs1 0 A A 0 NA NA 0 0 0 A A A A A A A A 1 101 0 1 [A/A] TOP TOP NA + 0 0 0 0 0 0 0 0 NA NA
3 | rs2 rs2 0 C C 0 NA NA 0 0 0 A A A A A A C C 1 102 0 1 [A/A] TOP TOP NA + 0 0 0 0 0 0 0 0 NA NA
4 | rs3 rs3 0 G G 0 NA NA 0 0 0 A A A A A A G G 1 103 0 1 [A/A] TOP TOP NA + 0 0 0 0 0 0 0 0 NA NA
5 | rs4 rs4 0 T T 0 NA NA 0 0 0 A A A A A A T T 1 104 0 1 [A/A] TOP TOP NA + 0 0 0 0 0 0 0 0 NA NA
6 | rs5 rs5 0 -- -- 0 NA NA 0 0 0 A A A A A A -- -- 1 105 0 1 [A/A] TOP TOP NA + 0 0 0 0 0 0 0 0 NA NA
7 | rs6 rs6 0 G C 0 NA NA 0 0 0 A A A A A A G C 1 106 0 1 [A/A] TOP TOP NA + 0 0 0 0 0 0 0 0 NA NA
8 | rs7 rs7 0 T C 0 NA NA 0 0 0 A A A A A A T C 1 107 0 1 [A/A] TOP TOP NA + 0 0 0 0 0 0 0 0 NA NA
9 | rs8 rs8 0 A T 0 NA NA 0 0 0 A A A A A A A T 1 108 0 1 [A/A] TOP TOP NA + 0 0 0 0 0 0 0 0 NA NA
10 |
--------------------------------------------------------------------------------
/src/Dna.php:
--------------------------------------------------------------------------------
1 |
9 | * @copyright Copyright (c) 2020-2023, Liberu Software Ltd
10 | * @license MIT
11 | *
12 | * @link http://github.com/laravel-liberu/php-dna
13 | */
14 |
15 | namespace Dna;
16 |
/**
 * Holds the output directory, the resources directory and the shared
 * Resources instance.
 */
class Dna
{
    /** @var string Directory where output files will be written. */
    protected string $_outputDir;

    /** @var string Directory containing resource files used for DNA analysis. */
    protected string $_resourcesDir;

    /** @var \Dna\Resources Accessor for DNA resource files. */
    protected Resources $_resources;

    /**
     * Store the two directory names and obtain the Resources instance.
     *
     * @param string $outputDirectory    Value stored as the output directory.
     * @param string $resourcesDirectory Value stored as the resources directory.
     */
    public function __construct(
        string $outputDirectory = 'output',
        string $resourcesDirectory = 'resources'
    ) {
        [$this->_outputDir, $this->_resourcesDir] = [$outputDirectory, $resourcesDirectory];
        $this->_resources = Resources::getInstance();
    }
}
--------------------------------------------------------------------------------
/src/Snps/IO/SnpFileReader.php:
--------------------------------------------------------------------------------
1 | resources = $resources;
16 | $this->ensemblRestClient = $ensemblRestClient;
17 | }
18 |
/**
 * Read a file through a Reader and return the four fields of its result.
 *
 * @param string $file               Path handed to the Reader.
 * @param bool   $only_detect_source Forwarded unchanged to the Reader.
 * @param array  $rsids              Forwarded unchanged to the Reader.
 *
 * @return array Array with the keys 'snps', 'source', 'phased' and 'build',
 *               copied from the Reader's result.
 */
public function readRawData(string $file, bool $only_detect_source = false, array $rsids = []): array
{
    $parsed = (new Reader($file, $only_detect_source, $this->resources, $rsids))->read();

    // Copy only the documented keys, in a fixed order.
    $result = [];
    foreach (['snps', 'source', 'phased', 'build'] as $field) {
        $result[$field] = $parsed[$field];
    }

    return $result;
}
31 |
/**
 * Read a file and return the result of readRawData() with default options.
 *
 * No post-processing (sorting, deduplication, etc.) is applied yet; this
 * method is the place where such steps would go.
 *
 * @param string $file Path of the file to read.
 *
 * @return array The array produced by readRawData().
 */
public function readFile(string $file): array
{
    return $this->readRawData($file);
}
43 | }
44 |
--------------------------------------------------------------------------------
/src/Snps/Analysis/BuildDetector.php:
--------------------------------------------------------------------------------
1 | [36 => 742429, 37 => 752566, 38 => 817186],
11 | "rs11928389" => [36 => 50908372, 37 => 50927009, 38 => 50889578],
12 | "rs2500347" => [36 => 143649677, 37 => 144938320, 38 => 148946169],
13 | "rs964481" => [36 => 27566744, 37 => 27656823, 38 => 27638706],
14 | "rs2341354" => [36 => 908436, 37 => 918573, 38 => 983193],
15 | "rs3850290" => [36 => 22315141, 37 => 23245301, 38 => 22776092],
16 | "rs1329546" => [36 => 135302086, 37 => 135474420, 38 => 136392261],
17 | ];
18 |
19 | foreach ($snps as $snp) {
20 | foreach ($buildPositions as $rsid => $positions) {
21 | if ($snp['rsid'] === $rsid) {
22 | foreach ($positions as $build => $position) {
23 | if ($snp['pos'] === $position) {
24 | return $build;
25 | }
26 | }
27 | }
28 | }
29 | }
30 |
31 | return 0; // Default or unable to detect
32 | }
33 | }
34 |
--------------------------------------------------------------------------------
/src/Snps/IO/PythonDependency.php:
--------------------------------------------------------------------------------
1 | client = new Client();
38 | }
39 |
40 | public function fetchData(string $url): ?array
41 | {
42 | try {
43 | $response = $this->client->request('GET', $url);
44 | if ($response->getStatusCode() === 200) {
45 | return json_decode($response->getBody()->getContents(), true);
46 | }
47 | return null;
48 | } catch (Exception $e) {
49 | error_log("Failed to fetch data from {$url}: " . $e->getMessage());
50 | return null;
51 | }
52 | }
53 | }
54 |
--------------------------------------------------------------------------------
/src/Snps/Analysis/ClusterOverlapCalculator.php:
--------------------------------------------------------------------------------
1 | $clusterData) {
11 | $snpsInCluster = array_filter($snps, function ($snp) use ($clusterData) {
12 | return in_array($snp['chrom'], $clusterData['chromosomes']) && $snp['pos'] >= $clusterData['start'] && $snp['pos'] <= $clusterData['end'];
13 | });
14 |
15 | $snpsInCommon = count($snpsInCluster);
16 | $totalSnpsInCluster = count($clusterData['snps']);
17 | $overlapWithCluster = $snpsInCommon / $totalSnpsInCluster;
18 | $overlapWithSelf = $snpsInCommon / count($snps);
19 |
20 | if ($overlapWithCluster > $clusterOverlapThreshold && $overlapWithSelf > $clusterOverlapThreshold) {
21 | $overlapResults[$clusterId] = [
22 | 'overlapWithCluster' => $overlapWithCluster,
23 | 'overlapWithSelf' => $overlapWithSelf,
24 | 'snpsInCommon' => $snpsInCommon,
25 | 'totalSnpsInCluster' => $totalSnpsInCluster,
26 | ];
27 | }
28 | }
29 |
30 | return $overlapResults;
31 | }
32 | }
33 |
--------------------------------------------------------------------------------
/src/Snps/IO/DataParser.php:
--------------------------------------------------------------------------------
1 | namespace Dna\Snps\IO;
2 |
3 | class DataParser
4 | {
5 | public function __construct()
6 | {
7 | }
8 |
9 | public function parseFile($filePath)
10 | {
11 | $format = $this->detectFileFormat($filePath);
12 | switch ($format) {
13 | case '23andMe':
14 | return $this->parse23andMe($filePath);
15 | case 'AncestryDNA':
16 | return $this->parseAncestryDNA($filePath);
17 | case 'GSA':
18 | return $this->parseGSA($filePath);
19 | default:
20 | return $this->parseGeneric($filePath);
21 | }
22 | }
23 |
24 | private function detectFileFormat($filePath)
25 | {
26 | // Logic to detect file format based on file content or metadata
27 | }
28 |
29 | private function parse23andMe($filePath)
30 | {
31 | // Parsing logic for 23andMe files
32 | }
33 |
34 | private function parseAncestryDNA($filePath)
35 | {
36 | // Parsing logic for AncestryDNA files
37 | }
38 |
39 | private function parseGSA($filePath)
40 | {
41 | // Parsing logic for Illumina Global Screening Array files
42 | }
43 |
44 | private function parseGeneric($filePath)
45 | {
46 | // Parsing logic for generic CSV/TSV files
47 | }
48 |
49 | private function extractComments($filePath)
50 | {
51 | // Utility method to extract comments from files
52 | }
53 | }
54 |
--------------------------------------------------------------------------------
/src/Snps/SNPData.php:
--------------------------------------------------------------------------------
1 | setSnps($snps);
13 | }
14 |
15 | public function getSnps(): array
16 | {
17 | return $this->snps;
18 | }
19 |
20 | public function setSnps(array $snps): void
21 | {
22 | $this->snps = $snps;
23 | $this->keys = array_keys($snps);
24 | }
25 |
26 | public function count(): int
27 | {
28 | return count($this->snps);
29 | }
30 |
31 | public function filter(callable $callback): array
32 | {
33 | return array_filter($this->snps, $callback);
34 | }
35 |
36 | public function sort(): void
37 | {
38 | ksort($this->snps);
39 | $this->keys = array_keys($this->snps);
40 | }
41 |
42 | public function merge(SNPData $other): void
43 | {
44 | $this->snps = array_merge($this->snps, $other->getSnps());
45 | $this->keys = array_keys($this->snps);
46 | }
47 |
48 | public function getChromosomes(): array
49 | {
50 | return array_unique(array_column($this->snps, 'chrom'));
51 | }
52 |
53 | public function getSnpsByChromosome(string $chromosome): array
54 | {
55 | return array_filter($this->snps, function($snp) use ($chromosome) {
56 | return $snp['chrom'] === $chromosome;
57 | });
58 | }
59 | }
--------------------------------------------------------------------------------
/sweep.yaml:
--------------------------------------------------------------------------------
1 | # Sweep AI turns bugs & feature requests into code changes (https://sweep.dev)
2 | # For details on our config file, check out our docs at https://docs.sweep.dev/usage/config
3 |
4 | # This setting contains a list of rules that Sweep will check for. If any of these rules are broken in a new commit, Sweep will create a pull request to fix the broken rule.
5 | rules:
6 | - "All new business logic should have corresponding unit tests."
7 | - "Refactor large functions to be more modular."
8 | - "Add docstrings to all functions and file headers."
9 |
10 | # This is the branch that Sweep will develop from and make pull requests to. Most people use 'main' or 'master' but some users also use 'dev' or 'staging'.
11 | branch: 'main'
12 |
13 | # By default Sweep will read the logs and outputs from your existing Github Actions. To disable this, set this to false.
14 | gha_enabled: True
15 |
16 | # This is the description of your project. It will be used by sweep when creating PRs. You can tell Sweep what's unique about your project, what frameworks you use, or anything else you want.
17 | #
18 | # Example:
19 | #
20 | # description: sweepai/sweep is a python project. The main api endpoints are in sweepai/api.py. Write code that adheres to PEP8.
21 | description: ''
22 |
23 | # This sets whether to create pull requests as drafts. If this is set to True, then all pull requests will be created as drafts and GitHub Actions will not be triggered.
24 | draft: False
25 |
26 | # This is a list of directories that Sweep will not be able to edit.
27 | blocked_dirs: []
28 |
--------------------------------------------------------------------------------
/src/Snps/Resources.php:
--------------------------------------------------------------------------------
1 | baseUrl = $baseUrl;
16 | $this->localResourceDir = $localResourceDir;
17 | $this->httpClient = new Client();
18 | }
19 |
20 | public function downloadResource(string $url, string $destinationPath): void
21 | {
22 | $response = $this->httpClient->get($url);
23 | file_put_contents($destinationPath, $response->getBody());
24 | }
25 |
26 | public function loadDataFromFile(string $filePath)
27 | {
28 | return file_get_contents($filePath);
29 | }
30 |
31 | public function getReferenceSequence(string $id): ReferenceSequence
32 | {
33 | $filePath = $this->getLocalPathForResource($id);
34 | $sequenceData = $this->loadDataFromFile($filePath);
35 | return new ReferenceSequence($id, $sequenceData);
36 | }
37 |
38 | public function getAssemblyMappingData(string $id)
39 | {
40 | // Implementation for fetching assembly mapping data
41 | }
42 |
43 | public function getExampleDataset(string $id)
44 | {
45 | // Implementation for fetching example datasets
46 | }
47 |
48 | private function getLocalPathForResource(string $resourceId): string
49 | {
50 | return $this->localResourceDir . DIRECTORY_SEPARATOR . $resourceId;
51 | }
52 | }
53 |
--------------------------------------------------------------------------------
/src/Snps/SortTest.php:
--------------------------------------------------------------------------------
1 | valid()) {
28 | for ($x = 0; $x < $cols; $x++) {
29 | $key = $iter->key();
30 | $obj = $iter->current();
31 | $out .= sprintf($patt, $key, $obj->name, $obj->id);
32 | $iter->next();
33 | if (!$iter->valid()) break;
34 | }
35 | }
36 | return $out . "\n";
37 | }
38 | /**
39 | * Builds consistent array of Test instances
40 | *
41 | * @return array $arr
42 | */
43 | public static function build()
44 | {
45 | $arr = [];
46 | $maxNames = count(self::$names);
47 | $pos = 0;
48 | for ($x = 0; $x < self::$max; $x++) {
49 | // note that the ID value == the order assigned
50 | $key = strtoupper(dechex($x));
51 | $id = sprintf('%04d', $x + 1000);
52 | if ($pos >= $maxNames) $pos = 0;
53 | $name = self::$names[$pos++];
54 | $test = new self();
55 | $test->id = $id;
56 | $test->name = $name;
57 | $arr[$key] = $test;
58 | }
59 | return $arr;
60 | }
61 | }
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # php-dna
2 |
3 | ## Running MatchKits from the Command Line
4 |
5 | To run the MatchKits script from the command line, navigate to the root directory of the php-dna project.
6 |
7 | Ensure you have PHP installed on your system. You can check this by running `php -v` in your command line. If PHP is not installed, please install it from the official PHP website.
8 |
9 | Execute the script by running the following command: `php src/MatchKits.php`.
10 |
11 | The script will prompt you to enter the file paths for Kit 1 and Kit 2. Please enter the full path for each file when prompted.
12 |
13 | After entering the file paths, the script will process the data and generate a matched data visualization. The output file named 'matched_data.png' will be saved in the root directory.
14 |
15 | ## Requirements
16 |
17 | * php-dna 1.0+ requires PHP 8.3 (or later).
18 | * php-dna 2.0+ requires PHP 8.4 (or later).
19 |
20 | ## Installation
21 |
22 | There are two ways of installing php-dna.
23 |
24 | ### Composer
25 |
26 | To install php-dna in your project using composer, simply add the following require line to your project's `composer.json` file:
27 |
28 | {
29 | "require": {
30 | "liberu-genealogy/php-dna": "1.0.*"
31 | }
32 | }
33 |
34 | ### Download and __autoload
35 |
36 | If you are not using composer, you can download an archive of the source from GitHub and extract it into your project. You'll need to set up an autoloader for the files, unless you go through the painstaking process of requiring all the needed files one by one. Something like the following should suffice:
37 |
38 | ```php
39 | spl_autoload_register(function ($class) {
40 | $pathToDna = __DIR__ . '/library/'; // TODO FIXME
41 | // Only handle classes in the Dna namespace; leave everything else to other autoloaders.
42 | if (!str_starts_with(ltrim($class, '\\'), 'Dna\\')) {
43 | return;
44 | }
45 |
46 | $class = str_replace('\\', DIRECTORY_SEPARATOR, $class) . '.php';
47 | if (file_exists($pathToDna . $class)) {
48 | require_once($pathToDna . $class);
49 | }
50 | });
51 | ```
52 |
--------------------------------------------------------------------------------
/src/Snps/ReferenceSequence.php:
--------------------------------------------------------------------------------
1 | id;
27 | }
28 |
29 | /**
30 | * Get the URL
31 | */
32 | public function getUrl(): string
33 | {
34 | return $this->url;
35 | }
36 |
37 | /**
38 | * Get the file path
39 | */
40 | public function getPath(): string
41 | {
42 | return $this->path;
43 | }
44 |
45 | /**
46 | * Get the assembly
47 | */
48 | public function getAssembly(): string
49 | {
50 | return $this->assembly;
51 | }
52 |
53 | /**
54 | * Get the species
55 | */
56 | public function getSpecies(): string
57 | {
58 | return $this->species;
59 | }
60 |
61 | /**
62 | * Get the taxonomy
63 | */
64 | public function getTaxonomy(): string
65 | {
66 | return $this->taxonomy;
67 | }
68 |
69 | /**
70 | * Check if the reference sequence file exists
71 | */
72 | public function exists(): bool
73 | {
74 | return file_exists($this->path);
75 | }
76 |
77 | /**
78 | * Get the size of the reference sequence file
79 | */
80 | public function getSize(): int
81 | {
82 | return $this->exists() ? filesize($this->path) : 0;
83 | }
84 |
85 | /**
86 | * String representation
87 | */
88 | public function __toString(): string
89 | {
90 | return "ReferenceSequence(id='{$this->id}', assembly='{$this->assembly}', species='{$this->species}')";
91 | }
92 | }
--------------------------------------------------------------------------------
/src/Snps/SNPAnalyzer.php:
--------------------------------------------------------------------------------
1 | buildDetector = $buildDetector;
18 | $this->clusterOverlapCalculator = $clusterOverlapCalculator;
19 | }
20 |
21 | public function detectBuild(SNPData $snpData): int
22 | {
23 | return $this->buildDetector->detectBuild($snpData->getSnps());
24 | }
25 |
26 | public function computeClusterOverlap(SNPData $snpData, float $threshold = 0.95): array
27 | {
28 | return $this->clusterOverlapCalculator->computeClusterOverlap($snpData->getSnps(), $threshold);
29 | }
30 |
31 | public function determineSex(SNPData $snpData): string
32 | {
33 | $xSnps = $snpData->getSnpsByChromosome('X');
34 | $ySnps = $snpData->getSnpsByChromosome('Y');
35 |
36 | // Returns 'Female', 'Male', or '' when neither threshold is met or there are no X/Y SNPs.
37 | // Each ratio is evaluated only for a non-empty chromosome set: dividing by count() of an
38 | // empty set throws DivisionByZeroError, which previously happened whenever exactly one of
39 | // the two sets was empty.
40 | $xHeterozygous = $this->countHeterozygous($xSnps);
41 | $yNonNull = $this->countNonNull($ySnps);
42 |
43 | if ($xSnps !== [] && $xHeterozygous / count($xSnps) > 0.03) {
44 | return 'Female';
45 | } elseif ($ySnps !== [] && $yNonNull / count($ySnps) > 0.3) {
46 | return 'Male';
47 | }
48 |
49 | return '';
50 | }
51 |
52 | private function countHeterozygous(array $snps): int
53 | {
54 | return count(array_filter($snps, function($snp) {
55 | return strlen($snp['genotype']) === 2 && $snp['genotype'][0] !== $snp['genotype'][1];
56 | }));
57 | }
58 |
59 | private function countNonNull(array $snps): int
60 | {
61 | return count(array_filter($snps, function($snp) {
62 | return $snp['genotype'] !== null;
63 | }));
64 | }
65 | }
--------------------------------------------------------------------------------
/src/Snps/IO/CsvReader.php:
--------------------------------------------------------------------------------
1 | filePath = $filePath;
16 | $this->separator = ',';
17 | $this->header = false;
18 | $this->columnNames = [];
19 | $this->columnTypes = [];
20 | }
21 |
22 | public function setSeparator($separator)
23 | {
24 | $this->separator = $separator;
25 | }
26 |
27 | public function setHeader($header)
28 | {
29 | $this->header = $header;
30 | }
31 |
32 | public function setColumnNames($columnNames)
33 | {
34 | $this->columnNames = $columnNames;
35 | }
36 |
37 | public function setColumnTypes($columnTypes)
38 | {
39 | $this->columnTypes = $columnTypes;
40 | }
41 |
42 | private $enclosure = '"'; // field enclosure passed to fgetcsv(); defaults to the double quote
43 | public function setEnclosure($enclosure)
44 | {
45 | $this->enclosure = $enclosure;
46 | }
47 |
48 | public function read()
49 | {
50 | // Parse the file into a list of rows; a file that cannot be opened yields an empty list.
51 | $data = [];
52 | if (($handle = fopen($this->filePath, "r")) !== false) {
53 | if ($this->header) {
54 | fgetcsv($handle, 0, $this->separator, $this->enclosure ?? '"'); // Skip the header row
55 | }
56 |
57 | while (($row = fgetcsv($handle, 0, $this->separator, $this->enclosure ?? '"')) !== false) {
58 | if (!empty($this->columnNames)) {
59 | // Key the positional fields by the configured column names.
60 | $row = array_combine($this->columnNames, $row);
61 | }
62 |
63 | if (!empty($this->columnTypes)) {
64 | foreach ($this->columnTypes as $column => $type) {
65 | settype($row[$column], $type);
66 | }
67 | }
68 |
69 | $data[] = $row;
70 | }
71 |
72 | fclose($handle);
73 | }
74 |
75 | return $data;
76 | }
77 | }
78 |
--------------------------------------------------------------------------------
/src/Snps/Ensembl.php:
--------------------------------------------------------------------------------
1 | last_req = microtime(true);
23 | }
24 |
25 | public function performRestAction(
26 | string $endpoint,
27 | array $headers = [],
28 | array $params = []
29 | ): ?array {
30 | $headers['Content-Type'] ??= 'application/json';
31 |
32 | $this->rateLimit();
33 |
34 | try {
35 | $response = $this->makeRequest($endpoint, $headers, $params);
36 | return $this->handleResponse($response);
37 | } catch (TransportExceptionInterface $e) {
38 | error_log("Request failed for {$endpoint}: " . $e->getMessage());
39 | return null;
40 | }
41 | }
42 |
43 | private function makeRequest(string $endpoint, array $headers, array $params): ResponseInterface
44 | {
45 | $client = HttpClient::create();
46 | return $client->request('GET', "{$this->server}{$endpoint}", [
47 | 'headers' => $headers,
48 | 'query' => $params,
49 | ]);
50 | }
51 |
52 | private function rateLimit(): void
53 | {
54 | if ($this->req_count >= $this->reqs_per_sec) { // the budget of reqs_per_sec calls is spent
55 | $delta = microtime(true) - $this->last_req; // seconds elapsed since last_req was stamped
56 | if ($delta < 1) {
57 | usleep((int) ceil((1 - $delta) * 1000000)); // usleep() takes whole microseconds (int), not a float
58 | }
59 | $this->last_req = microtime(true);
60 | $this->req_count = 0;
61 | } else {
62 | $this->req_count++;
63 | }
64 | }
65 | }
66 | ?>
67 |
--------------------------------------------------------------------------------
/src/Snps/ReferenceSequenceManager.php:
--------------------------------------------------------------------------------
1 | init_resource_attributes();
15 | }
16 |
17 | private function init_resource_attributes(): void
18 | {
19 | $this->_reference_sequences = [];
20 | }
21 |
22 | public function getReferenceSequences(string $assembly = "GRCh37", array $chroms = ["1", "2", "3", "4", "5", "6", "7", "8", "9", "10", "11", "12", "13", "14", "15", "16", "17", "18", "19", "20", "21", "22", "X", "Y", "MT"]): array
23 | {
24 | if (!in_array($assembly, $this->validAssemblies)) {
25 | error_log("Invalid assembly");
26 | return [];
27 | }
28 |
29 | if (!$this->referenceChromsAvailable($assembly, $chroms)) {
30 | // Placeholder for logic to fetch paths and URLs for reference sequences
31 | $urls = [];
32 | $paths = [];
33 | $this->_reference_sequences[$assembly] = $this->createReferenceSequences($assembly, $chroms, $urls, $paths);
34 | }
35 |
36 | return $this->_reference_sequences[$assembly];
37 | }
38 |
39 | private function referenceChromsAvailable(string $assembly, array $chroms): bool
40 | {
41 | // Placeholder for actual availability check logic
42 | return false;
43 | }
44 |
45 | protected function createReferenceSequences(string $assembly, array $chroms, array $urls, array $paths): array
46 | {
47 | $seqs = [];
48 |
49 | foreach ($paths as $i => $path) {
50 | if (!$path) {
51 | continue;
52 | }
53 |
54 | $seqs[$chroms[$i]] = new ReferenceSequence(
55 | $chroms[$i],
56 | $urls[$i],
57 | realpath($path),
58 | $assembly,
59 | "Homo sapiens",
60 | "x"
61 | );
62 | }
63 |
64 | return $seqs;
65 | }
66 | }
67 |
--------------------------------------------------------------------------------
/tests/input/discrepant_snps.csv:
--------------------------------------------------------------------------------
1 | rsid,chromosome,position_file1,position_file2,genotype_file1,genotype_file2,discrepant_position,discrepant_genotype,expected_position,expected_genotype
2 | rs1,1,1,1,--,--,False,False,1,--
3 | rs2,1,2,2,--,AA,False,False,2,AA
4 | rs3,1,3,3,AA,--,False,False,3,AA
5 | rs4,1,4,4,AA,AA,False,False,4,AA
6 | rs5,1,5,5,--,--,False,False,5,--
7 | rs6,1,6,6,--,AB,False,False,6,AB
8 | rs7,1,7,7,AB,--,False,False,7,AB
9 | rs8,1,8,8,AB,AB,False,False,8,AB
10 | rs9,1,9,9,AB,BA,False,False,9,AB
11 | rs10,1,10,10,BA,AB,False,False,10,BA
12 | rs11,1,11,11,BA,BA,False,False,11,BA
13 | rs12,1,12,12,AB,AC,False,True,12,--
14 | rs13,1,13,13,AB,CA,False,True,13,--
15 | rs14,1,14,14,BA,AC,False,True,14,--
16 | rs15,1,15,15,BA,CA,False,True,15,--
17 | rs16,1,16,16,AB,CD,False,True,16,--
18 | rs17,1,17,17,AB,DC,False,True,17,--
19 | rs18,1,18,18,BA,CD,False,True,18,--
20 | rs19,1,19,19,BA,DC,False,True,19,--
21 | rs20,MT,20,20,--,--,False,False,20,--
22 | rs21,MT,21,21,--,A,False,False,21,A
23 | rs22,MT,22,22,A,--,False,False,22,A
24 | rs23,MT,23,23,A,A,False,False,23,A
25 | rs24,MT,24,24,A,B,False,True,24,--
26 | rs25,MT,25,25,B,A,False,True,25,--
27 | rs26,MT,26,26,B,B,False,False,26,B
28 | rs27,1,27,1,--,--,True,False,27,--
29 | rs28,1,28,2,--,AA,True,False,28,AA
30 | rs29,1,29,3,AA,--,True,False,29,AA
31 | rs30,1,30,4,AA,AA,True,False,30,AA
32 | rs31,1,31,5,--,--,True,False,31,--
33 | rs32,1,32,6,--,AB,True,False,32,AB
34 | rs33,1,33,7,AB,--,True,False,33,AB
35 | rs34,1,34,8,AB,AB,True,False,34,AB
36 | rs35,1,35,9,AB,BA,True,False,35,AB
37 | rs36,1,36,10,BA,AB,True,False,36,BA
38 | rs37,1,37,11,BA,BA,True,False,37,BA
39 | rs38,1,38,12,AB,AC,True,True,38,--
40 | rs39,1,39,13,AB,CA,True,True,39,--
41 | rs40,1,40,14,BA,AC,True,True,40,--
42 | rs41,1,41,15,BA,CA,True,True,41,--
43 | rs42,1,42,16,AB,CD,True,True,42,--
44 | rs43,1,43,17,AB,DC,True,True,43,--
45 | rs44,1,44,18,BA,CD,True,True,44,--
46 | rs45,1,45,19,BA,DC,True,True,45,--
47 | rs46,MT,46,20,--,--,True,False,46,--
48 | rs47,MT,47,21,--,A,True,False,47,A
49 | rs48,MT,48,22,A,--,True,False,48,A
50 | rs49,MT,49,23,A,A,True,False,49,A
51 | rs50,MT,50,24,A,B,True,True,50,--
52 | rs51,MT,51,25,B,A,True,True,51,--
53 | rs52,MT,52,26,B,B,True,False,52,B
54 |
--------------------------------------------------------------------------------
/src/Snps/DatasetDownloader.php:
--------------------------------------------------------------------------------
1 | declare(strict_types=1);
2 |
3 | namespace Dna\Snps;
4 |
5 | use GuzzleHttp\Client;
6 | use GuzzleHttp\Exception\GuzzleException;
7 | use League\Csv\Reader;
8 |
9 | final class DatasetDownloader
10 | {
11 | public function __construct(
12 | private readonly Client $httpClient = new Client(),
13 | private readonly string $cacheDir = __DIR__ . '/../../cache'
14 | ) {
15 | if (!is_dir($this->cacheDir)) {
16 | mkdir($this->cacheDir, 0755, true); // create the cache directory (and parents) if missing
17 | }
18 | }
19 |
20 | /**
21 | * @return array the two download_file() results, in the order listed below
22 | * @throws GuzzleException
23 | */
24 | public function downloadExampleDatasets(): array
25 | {
26 | return [
27 | $this->download_file("https://opensnp.org/data/662.23andme.340", "662.23andme.340.txt.gz"),
28 | $this->download_file("https://opensnp.org/data/662.ftdna-illumina.341", "662.ftdna-illumina.341.csv.gz")
29 | ];
30 | }
31 |
32 | public function getAllResources()
33 | {
34 | $resources = [];
35 | $resources["gsa_resources"] = $this->getGsaResources();
36 | $resources["chip_clusters"] = $this->get_chip_clusters();
37 | $resources["low_quality_snps"] = $this->getLowQualitySNPs();
38 | return $resources;
39 | }
40 |
41 | public function getGsaResources(): array
42 | {
43 | // TODO: unimplemented stub (meant to mirror SNPsResources::getGsaResources); returns nothing, so the declared array return type fails when called
44 | }
45 |
46 | public function get_chip_clusters()
47 | {
48 | // TODO: unimplemented stub (meant to mirror SNPsResources::get_chip_clusters); currently returns null
49 | }
50 |
51 | public function getLowQualitySNPs(): array
52 | {
53 | // TODO: unimplemented stub (meant to mirror SNPsResources::getLowQualitySNPs); returns nothing, so the declared array return type fails when called
54 | }
55 |
56 | public function get_dbsnp_151_37_reverse(): ?array
57 | {
58 | // TODO: unimplemented stub (meant to mirror SNPsResources::get_dbsnp_151_37_reverse); needs an explicit return to satisfy ?array
59 | }
60 |
61 | public function getOpensnpDatadumpFilenames(): array
62 | {
63 | // TODO: unimplemented stub (meant to mirror SNPsResources::getOpensnpDatadumpFilenames); returns nothing, so the declared array return type fails when called
64 | }
65 |
66 | private function download_file(string $url, string $filename, bool $compress = false): string
67 | {
68 | // TODO: unimplemented stub (meant to mirror SNPsResources::download_file); downloadExampleDatasets() depends on it returning a string
69 | }
70 | }
71 |
--------------------------------------------------------------------------------
/src/Snps/DocBlockChecker.php:
--------------------------------------------------------------------------------
1 | target = $target;
15 | $this->reflect = new ReflectionClass($target);
16 | }
17 | /**
18 | * Check methods for docBlocks
19 | * If one exists but no @param, adds them
20 | * If no docBlock, creates one
21 | *
22 | * @return array $methods : method name => docBlock
23 | */
24 | public function check()
25 | {
26 | // get methods
27 | $methods = [];
28 | $list = $this->reflect->getMethods();
29 | foreach ($list as $refMeth) {
30 | // get docbock
31 | $docBlock = $refMeth->getDocComment();
32 | if (!$docBlock) {
33 | $docBlock = "/**\n * " . $refMeth->getName() . "\n";
34 | // get params
35 | $params = $refMeth->getParameters();
36 | if ($params) {
37 | foreach ($params as $refParm) {
38 | $type = $refParm->getType() ?? 'mixed';
39 | $type = (string) $type;
40 | $name = $refParm->getName();
41 | $default = '';
42 | if (!$refParm->isVariadic() && $refParm->isOptional())
43 | $default = $refParm->getDefaultValue();
44 | if ($default === '') $default = "(empty string)";
45 | $docBlock .= " * @param $type \${$name} : $default\n";
46 | }
47 | }
48 | // get return type
49 | if ($refMeth->isConstructor())
50 | $return = 'void';
51 | else
52 | $return = $refMeth->getReturnType() ?? 'mixed';
53 | $docBlock .= " * @return $return\n";
54 | $docBlock .= " */\n";
55 | }
56 | $methods[$refMeth->getName()] = $docBlock;
57 | }
58 | return $methods;
59 | }
60 | }
--------------------------------------------------------------------------------
/src/Snps/IO/PhpDataFrame.php:
--------------------------------------------------------------------------------
1 | columns = array_keys($data[0]);
14 | $this->data = $data;
15 | }
16 | }
17 |
18 | public static function fromFile(string $filePath): self
19 | {
20 | $rows = array_map('str_getcsv', file($filePath));
21 | $columns = array_shift($rows);
22 | $data = array_map(fn($row) => array_combine($columns, $row), $rows);
23 | return new self($data);
24 | }
25 |
26 | public function addRow(array $row): void
27 | {
28 | $this->data[] = $row;
29 | }
30 |
31 | public function removeRow(int $index): void
32 | {
33 | array_splice($this->data, $index, 1);
34 | }
35 |
36 | public function addColumn(string $columnName, array $values): void
37 | {
38 | foreach ($this->data as $index => $row) {
39 | $this->data[$index][$columnName] = $values[$index] ?? null;
40 | }
41 | if (!in_array($columnName, $this->columns)) {
42 | $this->columns[] = $columnName;
43 | }
44 | }
45 |
46 | public function removeColumn(string $columnName): void
47 | {
48 | foreach ($this->data as $index => $row) { // drop the column's value from every row
49 | unset($this->data[$index][$columnName]);
50 | }
51 | $this->columns = array_values(array_filter($this->columns, fn($column) => $column !== $columnName)); // array_filter keeps keys; reindex so columns stays a list
52 | }
53 |
54 | public function filter(callable $callback): self
55 | {
56 | $filteredData = array_filter($this->data, $callback);
57 | return new self(array_values($filteredData));
58 | }
59 |
60 | public function sum(string $columnName): float
61 | {
62 | return array_sum(array_column($this->data, $columnName));
63 | }
64 |
65 | public function average(string $columnName): float
66 | {
67 | $columnData = array_column($this->data, $columnName); // values present under $columnName across all rows
68 | return $columnData === [] ? 0.0 : array_sum($columnData) / count($columnData); // no values: return 0.0 (as sum() does) instead of dividing by zero
69 | }
70 |
71 | public function toArray(): array
72 | {
73 | return $this->data;
74 | }
75 |
76 | public function getRow(int $index): ?array
77 | {
78 | return $this->data[$index] ?? null;
79 | }
80 |
81 | public function getColumn(string $columnName): array
82 | {
83 | return array_column($this->data, $columnName);
84 | }
85 | }
86 |
--------------------------------------------------------------------------------
/src/Individual.php:
--------------------------------------------------------------------------------
1 | declare(strict_types=1);
2 |
3 | namespace Dna;
4 |
5 | use Dna\Snps\SNPs;
6 | use ReflectionMethod;
7 |
8 | final class Individual extends SNPs
9 | {
10 | public function __construct(
11 | private readonly string $name,
12 | private readonly mixed $rawData = [],
13 | private readonly array $kwargs = []
14 | ) {
15 | $snpsConstructorArgs = $this->getDefinedKwargs(
16 | new ReflectionMethod(SNPs::class, '__construct'),
17 | $kwargs
18 | );
19 |
20 | parent::__construct(...array_values($snpsConstructorArgs));
21 |
22 | $this->processRawData();
23 | }
24 |
25 | private function processRawData(): void
26 | {
27 | $rawDataArray = is_array($this->rawData) ? $this->rawData : [$this->rawData];
28 |
29 | foreach ($rawDataArray as $data) {
30 | $snps = $this->createSnpsObject($data);
31 | $this->merge([$snps]);
32 | }
33 | }
34 |
35 | /**
36 | * Get the string representation of the Individual
37 | *
38 | * @return string The string representation
39 | */
40 | public function __toString(): string
41 | {
42 | return sprintf("Individual('%s')", $this->name);
43 | }
44 |
45 | /**
46 | * Get the Individual's name
47 | *
48 | * @return string The name
49 | */
50 | public function getName(): string
51 | {
52 | return $this->name;
53 | }
54 |
55 | /**
56 | * Get a variable-safe version of the Individual's name
57 | *
58 | * @return string The variable-safe name
59 | */
60 | public function getVarName(): string
61 | {
62 | return $this->clean_str($this->name);
63 | }
64 |
65 | /**
66 | * Clean a string to make it variable-safe
67 | *
68 | * @param string $str The string to clean
69 | * @return string The cleaned string
70 | */
71 | private function clean_str(string $str): string
72 | {
73 | // Remove special characters and replace with underscores
74 | $cleaned = preg_replace('/[^a-zA-Z0-9_]/', '_', $str);
75 |
76 | // Remove multiple consecutive underscores
77 | $cleaned = preg_replace('/_+/', '_', $cleaned);
78 |
79 | // Remove leading/trailing underscores
80 | $cleaned = trim($cleaned, '_');
81 |
82 | // Ensure it doesn't start with a number
83 | if (is_numeric(substr($cleaned, 0, 1))) {
84 | $cleaned = 'var_' . $cleaned;
85 | }
86 |
87 | return $cleaned ?: 'unnamed';
88 | }
89 | }
--------------------------------------------------------------------------------
/tests/IndividualTest.php:
--------------------------------------------------------------------------------
1 | assertEquals('John Doe', $individual->getName());
17 | $this->assertEquals("Individual('John Doe')", (string)$individual);
18 | }
19 |
20 | public function testVarNameGeneration(): void
21 | {
22 | $individual = new Individual('John Doe-Smith');
23 | $varName = $individual->getVarName();
24 |
25 | // Should convert special characters to underscores
26 | $this->assertEquals('John_Doe_Smith', $varName);
27 | }
28 |
29 | public function testVarNameWithNumbers(): void
30 | {
31 | $individual = new Individual('123Test');
32 | $varName = $individual->getVarName();
33 |
34 | // Should prefix with 'var_' if starts with number
35 | $this->assertEquals('var_123Test', $varName);
36 | }
37 |
38 | public function testVarNameWithSpecialCharacters(): void
39 | {
40 | $individual = new Individual('Test@#$%Name');
41 | $varName = $individual->getVarName();
42 |
43 | // Should replace special characters with underscores
44 | $this->assertEquals('Test_Name', $varName);
45 | }
46 |
47 | public function testVarNameEmpty(): void
48 | {
49 | $individual = new Individual('');
50 | $varName = $individual->getVarName();
51 |
52 | // Should return 'unnamed' for empty string
53 | $this->assertEquals('unnamed', $varName);
54 | }
55 |
56 | public function testVarNameOnlySpecialChars(): void
57 | {
58 | $individual = new Individual('@#$%');
59 | $varName = $individual->getVarName();
60 |
61 | // Should return 'unnamed' when only special characters
62 | $this->assertEquals('unnamed', $varName);
63 | }
64 |
65 | public function testIndividualWithRawData(): void
66 | {
67 | $rawData = [
68 | 'rs123' => ['rsid' => 'rs123', 'chrom' => '1', 'pos' => 1000, 'genotype' => 'AA'],
69 | 'rs456' => ['rsid' => 'rs456', 'chrom' => '2', 'pos' => 2000, 'genotype' => 'AT'],
70 | ];
71 |
72 | $individual = new Individual('Test Individual', $rawData);
73 |
74 | $this->assertEquals('Test Individual', $individual->getName());
75 | $this->assertTrue($individual->isValid());
76 | $this->assertEquals(2, $individual->count());
77 | }
78 | }
--------------------------------------------------------------------------------
/src/Snps/AssemblyMappingManager.php:
--------------------------------------------------------------------------------
1 | declare(strict_types=1);
2 |
3 | namespace Dna\Snps;
4 |
5 | use PharData;
6 | use GuzzleHttp\Client;
7 | use GuzzleHttp\Exception\GuzzleException;
8 |
9 | final class AssemblyMappingManager
10 | {
11 | public function __construct(
12 | private readonly Client $httpClient = new Client(),
13 | private readonly string $resourcePath = __DIR__ . "/resources"
14 | ) {
15 | if (!is_dir($this->resourcePath)) {
16 | mkdir($this->resourcePath, 0755, true); // create the resource directory (and parents) if missing
17 | }
18 | }
19 |
20 | /**
21 | * @throws GuzzleException|\RuntimeException when the archive has to be downloaded and that fails
22 | */
23 | public function getAssemblyMappingData(string $sourceAssembly, string $targetAssembly): string
24 | {
25 | $filename = "assembly_mapping_{$sourceAssembly}_to_{$targetAssembly}.tar.gz";
26 | $filepath = "{$this->resourcePath}/{$filename}";
27 | // Download the archive only when it is not on disk yet; the local path is returned either way.
28 | if (!file_exists($filepath)) {
29 | $this->downloadMappingData($sourceAssembly, $targetAssembly, $filepath); // returns void, so it cannot be the return value
30 | }
31 |
32 | return $filepath;
33 | }
34 |
35 | public static function loadAssemblyMappingData(string $filename): array
36 | {
37 | $assemblyMappingData = []; // decoded content of each archive entry whose name contains ".json"
38 | try {
39 | $tar = new PharData($filename);
40 | foreach ($tar as $file) {
41 | if (str_contains($file->getFilename(), '.json')) {
42 | $content = file_get_contents($file->getPathname());
43 | $data = json_decode($content, true);
44 | if (json_last_error() === JSON_ERROR_NONE) {
45 | $assemblyMappingData[] = $data;
46 | } else {
47 | throw new \Exception("Error parsing JSON data.");
48 | }
49 | }
50 | }
51 | } catch (\Exception $e) { // fully qualified: a bare Exception would resolve to Dna\Snps\Exception here
52 | throw new \Exception("Error loading assembly mapping data: " . $e->getMessage(), 0, $e);
53 | }
54 |
55 | return $assemblyMappingData;
56 | }
57 |
58 | /**
59 | * @throws GuzzleException|\RuntimeException
60 | */
61 | private function downloadMappingData(string $sourceAssembly, string $targetAssembly, string $filepath): void
62 | {
63 | $url = "http://example.com/assembly_mapping/{$sourceAssembly}/{$targetAssembly}";
64 | $response = $this->httpClient->get($url);
65 | if ($response->getStatusCode() === 200) {
66 | file_put_contents($filepath, $response->getBody()->getContents());
67 | } else {
68 | throw new \RuntimeException("Failed to download assembly mapping data."); // GuzzleException is an interface and cannot be instantiated
69 | }
70 | }
71 | }
--------------------------------------------------------------------------------
/phpconvcount.py:
--------------------------------------------------------------------------------
1 | import re
2 | import ast
3 |
4 | pycodefile = '../Projects/geneology/snps/tests/test_snps.py'
5 | phpcodefile = 'tests/Snps/SnpsTest.php'
6 |
7 |
def normalize_function_name(name):
    """Return `name` in camelCase.

    A name that already contains a lowercase letter immediately followed by
    an uppercase letter is treated as camelCase and returned untouched.
    Otherwise the name is split on underscores; the first part is kept as is
    and every following part is stripped and capitalized before joining.
    """
    for idx in range(len(name) - 1):
        if name[idx].islower() and name[idx + 1].isupper():
            return name

    head, *tail = name.split('_')
    return head + ''.join(part.strip().capitalize() for part in tail)
16 |
def get_function_names_in_class(python_code, class_name):
    """List the names of all functions defined inside a class.

    The source text is parsed with `ast`; every `ClassDef` named `class_name`
    found anywhere in the tree is walked, and the name of each `FunctionDef`
    beneath it (nested ones included) is collected in walk order.
    """
    tree = ast.parse(python_code)
    return [
        inner.name
        for outer in ast.walk(tree)
        if isinstance(outer, ast.ClassDef) and outer.name == class_name
        for inner in ast.walk(outer)
        if isinstance(inner, ast.FunctionDef)
    ]
39 |
40 |
# Step 1: Read the Python test module as text.
# The encoding is explicit so the result does not depend on the locale.
with open(pycodefile, 'r', encoding='utf-8') as python_file:
    python_code = python_file.read()

# Step 2: Extract the function names defined inside the TestSnps class
python_functions = get_function_names_in_class(python_code, "TestSnps")

# Step 3: Normalize Python function names (set() removes duplicates)
normalized_python_functions = list(set(normalize_function_name(func) for func in python_functions))

# Step 4: Read the PHP test class as text
with open(phpcodefile, 'r', encoding='utf-8') as php_file:
    php_code = php_file.read()

# Step 5: Extract PHP method names.
# Any run of modifiers is accepted before `function`, so methods declared
# as e.g. `public static function` or `final public function` are counted too.
php_functions = re.findall(
    r'\b(?:(?:public|private|protected|static|final|abstract)\s+)+function\s+([a-zA-Z_][a-zA-Z0-9_]*)\s*\(',
    php_code,
)

# Step 6: Normalize PHP function names
normalized_php_functions = [normalize_function_name(func) for func in php_functions]

# Step 7: Compare Python and PHP function names
missing_functions = set(normalized_python_functions) - set(normalized_php_functions)
extra_functions = set(normalized_php_functions) - set(normalized_python_functions)

# Count of functions in Python and PHP
python_function_count = len(normalized_python_functions)
php_function_count = len(normalized_php_functions)

# Print the count of functions
print("Number of Functions in Python:", python_function_count)
print("Number of Functions in PHP:", php_function_count)

# print(normalized_python_functions)

# Print missing functions in PHP compared to Python.
# Sorted, because set iteration order would make the report differ between runs.
print("\nMissing Functions in PHP:")
for func in sorted(missing_functions):
    print(func)


print("\nExtra Functions in PHP:")
for func in sorted(extra_functions):
    print(func)
--------------------------------------------------------------------------------
/src/Triangulation.php:
--------------------------------------------------------------------------------
1 | getMessage());
25 | }
26 | }
27 |
/**
 * Check that the supplied kits can be triangulated.
 *
 * @param array $kitsData Candidate kits; every element must be an SNPs instance
 * @throws Exception If fewer than three kits are given or an element is not an SNPs object
 */
private static function validateInput(array $kitsData): void
{
    $kitCount = count($kitsData);
    if ($kitCount < 3) {
        throw new Exception("At least three DNA kits are required for triangulation.");
    }

    foreach ($kitsData as $candidate) {
        if ($candidate instanceof SNPs) {
            continue;
        }
        throw new Exception("Invalid input: All elements must be instances of SNPs class.");
    }
}
44 |
/**
 * Collect the SNP list of every kit.
 *
 * @param SNPs[] $kitsData Array of SNPs objects
 * @return array One getSnps() result per kit, under the kit's original key
 */
private static function extractSnpLists(array $kitsData): array
{
    return array_map(
        static fn (SNPs $kit) => $kit->getSnps(),
        $kitsData
    );
}
54 |
/**
 * Find the SNPs whose keys occur in every supplied list.
 *
 * Values are taken from the first list, as array_intersect_key() does.
 *
 * @param array $snpsLists Array of SNP lists (any keys)
 * @return array Entries of the first list whose keys exist in all other lists;
 *               an empty array when no lists are given
 */
private static function findCommonSnps(array $snpsLists): array
{
    if ($snpsLists === []) {
        // array_intersect_key() needs at least one argument.
        return [];
    }

    // array_values(): string keys in the outer array would otherwise be
    // treated as named arguments when the lists are spread.
    return array_intersect_key(...array_values($snpsLists));
}
64 |
/**
 * Keep only the SNPs that every kit confirms.
 *
 * Each candidate is passed to isSnpCommonAcrossAllKits(); original keys
 * are preserved for the entries that pass.
 *
 * @param array $commonSnps Array of candidate SNPs
 * @param SNPs[] $kitsData Array of SNPs objects
 * @return array Candidates accepted by isSnpCommonAcrossAllKits()
 */
private static function filterNonCommonSnps(array $commonSnps, array $kitsData): array
{
    $confirmedByAllKits = static fn ($snp) => self::isSnpCommonAcrossAllKits($snp, $kitsData);

    return array_filter($commonSnps, $confirmedByAllKits);
}
77 |
/**
 * Tell whether every kit holds the given SNP with the same genotype.
 *
 * NOTE(review): the lookup uses $snp['pos'] as the key into getSnps(),
 * while other code in this project appears to key SNP lists by rsid —
 * confirm which key is intended.
 *
 * @param array $snp SNP to check; its 'pos' and 'genotype' entries are read
 * @param SNPs[] $kitsData Array of SNPs objects
 * @return bool True if the SNP matches in all kits, false otherwise
 */
private static function isSnpCommonAcrossAllKits(array $snp, array $kitsData): bool
{
    $position = $snp['pos'];
    $expectedGenotype = $snp['genotype'];

    $agreeingKits = array_filter(
        $kitsData,
        static function (SNPs $kit) use ($position, $expectedGenotype) {
            $kitSnps = $kit->getSnps();
            if (!isset($kitSnps[$position])) {
                return false;
            }
            return $kitSnps[$position]['genotype'] === $expectedGenotype;
        }
    );

    return count($agreeingKits) === count($kitsData);
}
91 | }
92 |
--------------------------------------------------------------------------------
/src/Snps/Utils.php:
--------------------------------------------------------------------------------
1 |
11 | * @copyright Copyright (c) 2020-2023, Devmanateam
12 | * @license MIT
13 | *
14 | * @link http://github.com/familytree365/php-dna
15 | */
16 |
17 | namespace Dna\Snps;
18 |
19 | use ZipArchive;
20 | use Exception;
21 |
/**
 * Utils class provides small static helpers for file manipulation and
 * other common tasks: gzipping files, creating directories, fetching the
 * current UTC time, saving data as CSV, cleaning strings, and zipping files.
 */
final class Utils
{
    /**
     * Gzip a file.
     *
     * @param string $src  Path to file to gzip
     * @param string $dest Path to output gzip file
     *
     * @return string Path to gzipped file
     *
     * @throws Exception If the source cannot be read or the destination
     *                   cannot be opened or written
     */
    public static function gzipFile(string $src, string $dest): string
    {
        if (!is_readable($src)) {
            throw new Exception("Cannot read source file: {$src}");
        }

        $srcFile = fopen($src, "rb");
        if ($srcFile === false) {
            throw new Exception("Cannot open source file: {$src}");
        }

        $gzFile = gzopen($dest, "wb9"); // Maximum compression
        if ($gzFile === false) {
            // Release the already opened source before reporting the failure.
            fclose($srcFile);
            throw new Exception("Cannot open destination file: {$dest}");
        }

        try {
            if (stream_copy_to_stream($srcFile, $gzFile) === false) {
                throw new Exception("Failed to write gzip data to: {$dest}");
            }
            return $dest;
        } finally {
            fclose($srcFile);
            gzclose($gzFile);
        }
    }

    /**
     * Creates a directory (recursively) if nothing exists at the path.
     *
     * @param string $path Path to the directory to create.
     * @return void
     *
     * @throws Exception If the directory cannot be created.
     */
    public static function createDir(string $path): void
    {
        // The trailing is_dir() tolerates another process creating the
        // directory between the existence check and mkdir().
        if (!file_exists($path) && !mkdir($path, 0755, true) && !is_dir($path)) {
            throw new Exception("Cannot create directory: {$path}");
        }
    }

    /**
     * Gets the current UTC time.
     *
     * @return string Current UTC time in 'Y-m-d H:i:s' format.
     */
    public static function getUtcNow(): string
    {
        return (new \DateTimeImmutable('now', new \DateTimeZone('UTC')))
            ->format('Y-m-d H:i:s');
    }

    /**
     * Saves data as a CSV file, one fputcsv() call per row.
     *
     * @param array  $data     Rows to save; each row is an array of fields.
     * @param string $filename Path to the CSV file (overwritten).
     * @return void
     *
     * @throws Exception If the file cannot be opened for writing.
     */
    public static function saveDfAsCsv(array $data, string $filename): void
    {
        $fp = fopen($filename, 'w');
        if ($fp === false) {
            throw new Exception("Cannot open file for writing: {$filename}");
        }

        try {
            foreach ($data as $row) {
                fputcsv($fp, $row);
            }
        } finally {
            fclose($fp);
        }
    }

    /**
     * Cleans a string to be used as a variable name by removing every
     * character other than ASCII letters, digits and underscore.
     *
     * @param string $str String to clean.
     * @return string Cleaned string.
     */
    public static function cleanStr(string $str): string
    {
        // preg_replace() yields null on a PCRE error; fall back to an empty
        // string so the declared return type always holds.
        return preg_replace('/[^A-Za-z0-9_]/', '', $str) ?? '';
    }

    /**
     * Zips a file, storing it in the archive under its base name.
     *
     * @param string $src  Path to the file to zip.
     * @param string $dest Path to the output zip file.
     * @return void
     *
     * @throws Exception If the archive cannot be opened or the file cannot be added.
     */
    public static function zipFile(string $src, string $dest): void
    {
        $zip = new ZipArchive();
        if ($zip->open($dest, ZipArchive::CREATE) !== true) {
            throw new Exception("Cannot open zip archive: {$dest}");
        }

        $added = $zip->addFile($src, basename($src));
        $closed = $zip->close();

        if (!$added || !$closed) {
            throw new Exception("Cannot add {$src} to zip archive: {$dest}");
        }
    }
}
--------------------------------------------------------------------------------
/src/Snps/EnsemblRestClient.php:
--------------------------------------------------------------------------------
1 | server = $server;
25 | $this->reqsPerSec = $reqsPerSec;
26 | $this->client = new Client([
27 | 'base_uri' => $server,
28 | 'timeout' => 30,
29 | 'headers' => [
30 | 'Content-Type' => 'application/json',
31 | 'Accept' => 'application/json'
32 | ]
33 | ]);
34 | }
35 |
/**
 * Perform a rate-limited GET request and decode its JSON body.
 *
 * @param string $endpoint The API endpoint to call
 * @param array $params Query parameters
 * @return array|null The decoded JSON response, or null when the request
 *                    fails, the status is not 200, or the body does not
 *                    decode to an array
 */
public function perform_rest_action(string $endpoint, array $params = []): ?array
{
    $this->rateLimit();

    try {
        $response = $this->client->get($endpoint, [
            'query' => $params
        ]);

        if ($response->getStatusCode() !== 200) {
            return null;
        }

        $decoded = json_decode($response->getBody()->getContents(), true);

        // A valid scalar JSON document (e.g. `"ok"` or `42`) would break
        // the ?array return type, so anything but an array is an error.
        if (!is_array($decoded)) {
            error_log("REST API error: response body is not a JSON object or array");
            return null;
        }

        return $decoded;
    } catch (GuzzleException $e) {
        error_log("REST API error: " . $e->getMessage());
    }

    return null;
}
62 |
/**
 * Sleep as long as needed to keep at least 1 / reqsPerSec seconds between
 * consecutive requests, then record the current time as the last request.
 */
private function rateLimit(): void
{
    $elapsed = microtime(true) - $this->lastRequestTime;
    $requiredGap = 1.0 / $this->reqsPerSec;

    if ($elapsed < $requiredGap) {
        // usleep() takes whole microseconds.
        usleep((int)(($requiredGap - $elapsed) * 1000000));
    }

    $this->lastRequestTime = microtime(true);
}
79 |
/**
 * Request the mapping of a region from one assembly to another.
 *
 * The arguments are placed into the path
 * "/map/{species}/{fromAssembly}/{region}/{toAssembly}" as given.
 *
 * @param string $species Species name (e.g., 'human')
 * @param string $fromAssembly Source assembly
 * @param string $toAssembly Target assembly
 * @param string $region Genomic region
 * @return array|null Mapping data or null on error
 */
public function getAssemblyMapping(
    string $species,
    string $fromAssembly,
    string $toAssembly,
    string $region
): ?array {
    $segments = [$species, $fromAssembly, $region, $toAssembly];

    return $this->perform_rest_action('/map/' . implode('/', $segments));
}
98 |
/**
 * Look up a RefSNP snapshot.
 *
 * Every occurrence of "rs" is removed from the identifier before it is
 * appended to "/variation/v0/refsnp/".
 *
 * @param string $rsid The rs ID, with or without the 'rs' prefix
 * @return array|null RefSNP data or null on error
 */
public function lookupRefsnpSnapshot(string $rsid): ?array
{
    $numericId = strtr($rsid, ["rs" => ""]);
    $endpoint = "/variation/v0/refsnp/" . $numericId;

    return $this->perform_rest_action($endpoint);
}
110 | }
--------------------------------------------------------------------------------
/src/Snps/PythonDependency.php:
--------------------------------------------------------------------------------
1 | data = $data;
14 | $this->columns = $columns;
15 | }
16 |
/**
 * Return a new frame holding only the rows accepted by the callback.
 *
 * Rows keep their original keys (array_filter semantics); the column
 * list is carried over unchanged.
 */
public function filter(callable $callback) {
    return new self(array_filter($this->data, $callback), $this->columns);
}
21 |
/**
 * Combine this frame with another one, pairing rows from both.
 *
 * With $on set, a pair is kept when both rows hold an identical (===)
 * value in that column; with $on null every pair is kept. Paired rows are
 * joined with array_merge(), so the other frame's values win on shared
 * keys. $joinType is accepted but not consulted by this implementation.
 */
public function merge(DataFrame $other, string $joinType = 'inner', ?string $on = null) {
    $pairs = [];

    foreach ($this->data as $left) {
        foreach ($other->data as $right) {
            $keep = $on === null || $left[$on] === $right[$on];
            if ($keep) {
                $pairs[] = array_merge($left, $right);
            }
        }
    }

    $combinedColumns = array_merge($this->columns, $other->columns);

    return new self($pairs, $combinedColumns);
}
41 |
/**
 * Return a new frame restricted to the given columns.
 *
 * Every row is rebuilt with exactly the requested keys, in the requested
 * order; rows are re-indexed from zero.
 */
public function select(array $columns) {
    $projected = [];

    foreach ($this->data as $sourceRow) {
        $narrowed = [];
        foreach ($columns as $name) {
            $narrowed[$name] = $sourceRow[$name];
        }
        $projected[] = $narrowed;
    }

    return new self($projected, $columns);
}
57 |
/**
 * Return a new frame without repeated rows.
 *
 * The first occurrence of each row is kept. Rows are compared with
 * in_array()'s default loose comparison, and the result is re-indexed.
 */
public function dropDuplicates() {
    $kept = [];

    foreach ($this->data as $candidate) {
        if (in_array($candidate, $kept)) {
            continue;
        }
        $kept[] = $candidate;
    }

    return new self($kept, $this->columns);
}
71 | }
72 |
/**
 * Simple statistics over the rows of a DataFrame of SNPs.
 */
class SNPAnalysis {
    /**
     * Compute the share of each allele character across all genotypes.
     *
     * Every row's 'genotype' string is split into single characters and
     * counted; each count is then divided by the total number counted.
     *
     * @return array Map of allele character => relative frequency
     */
    public function calculateAlleleFrequencies(DataFrame $snps) {
        $tally = [];

        foreach ($snps->data as $row) {
            foreach (str_split($row['genotype']) as $symbol) {
                $tally[$symbol] = ($tally[$symbol] ?? 0) + 1;
            }
        }

        $grandTotal = array_sum($tally);

        return array_map(
            static fn ($occurrences) => $occurrences / $grandTotal,
            $tally
        );
    }

    /**
     * Return the most frequent 'build' value among the rows.
     *
     * On a tie the value encountered first wins; null is returned when
     * there are no rows.
     */
    public function detectSNPBuild(DataFrame $snps) {
        $occurrences = [];

        foreach ($snps->data as $row) {
            $label = $row['build'];
            $occurrences[$label] = ($occurrences[$label] ?? 0) + 1;
        }

        $winner = null;
        $best = 0;
        foreach ($occurrences as $label => $seen) {
            if ($seen <= $best) {
                continue;
            }
            $best = $seen;
            $winner = $label;
        }

        return $winner;
    }
}
122 |
/**
 * Thin wrappers that delegate to the Average and MatrixFactory helpers.
 */
class MathOperations {
    /**
     * Delegate to Average::standardDeviation() and return its result.
     */
    public function calculateStandardDeviation(array $data) {
        $deviation = Average::standardDeviation($data);

        return $deviation;
    }

    /**
     * Delegate to MatrixFactory::create() and return its result.
     */
    public function createMatrix(array $data) {
        $matrix = MatrixFactory::create($data);

        return $matrix;
    }
}
132 |
--------------------------------------------------------------------------------
/src/Visualization.php:
--------------------------------------------------------------------------------
1 | validateInput($data, $filename, $format);
34 | $chromosomeCollections = $this->chromosomeCollections($data);
35 | $chromosomalFeatures = $this->patchChromosomalFeatures($data);
36 |
37 | // Visualization code...
38 | // (Implement the actual visualization logic here)
39 |
40 | $this->generateCSV($data, $filename);
41 | } catch (Exception $e) {
42 | throw new Exception("Error generating visualization: " . $e->getMessage());
43 | }
44 | }
45 |
/**
 * Reject unusable visualization input.
 *
 * Checks run in this order: data, filename, format.
 *
 * @param array $data The SNP data to plot
 * @param string $filename The filename for the generated plot
 * @param string $format The image format for the plot (case-insensitive)
 * @throws Exception If data is empty, the filename is empty (or "0"),
 *                   or the format is not png, jpeg or jpg
 */
private function validateInput(array $data, string $filename, string $format): void
{
    if ($data === []) {
        throw new Exception("No data provided for visualization.");
    }

    // empty() on a string is true for '' and for '0'.
    if ($filename === '' || $filename === '0') {
        throw new Exception("Filename is required for visualization output.");
    }

    $supportedFormats = ['png', 'jpeg', 'jpg'];
    $isSupported = in_array(strtolower($format), $supportedFormats, true);
    if (!$isSupported) {
        throw new Exception("Invalid image format. Please use png, jpeg, or jpg.");
    }
}
66 |
/**
 * Build one drawable collection per chromosome.
 *
 * Each collection holds the horizontal ranges of its SNPs (width 1 at the
 * SNP position), the vertical range [y offset, height] and one color per SNP.
 *
 * @param array $data The SNP data, grouped by chromosome
 * @return array Chromosome collections
 */
private function chromosomeCollections(array $data): array
{
    $rowOffsets = $this->calculateYPositions($data);
    $barHeight = 1; // Adjust as needed
    $result = [];

    foreach ($data as $chrom => $group) {
        $verticalRange = [$rowOffsets[$chrom], $barHeight];

        $spans = [];
        foreach ($group as $snp) {
            // Assuming SNP position is a single point
            $spans[] = ['start' => $snp['pos'], 'width' => 1];
        }

        $palette = $this->generateColorScheme(count($group));
        $result[] = ['xranges' => $spans, 'yrange' => $verticalRange, 'colors' => $palette];
    }

    return $result;
}
89 |
/**
 * Assign a vertical offset to every chromosome.
 *
 * Chromosomes are taken in the key order of $data and spaced two units
 * apart, starting at zero.
 *
 * @param array $data The SNP data, keyed by chromosome
 * @return array Y positions for each chromosome
 */
private function calculateYPositions(array $data): array
{
    $offsets = [];

    foreach (array_keys($data) as $rank => $chrom) {
        $offsets[$chrom] = $rank * 2; // Adjust spacing as needed
    }

    return $offsets;
}
106 |
/**
 * Patch chromosomal features for visualization
 *
 * For every chromosome a 'gneg' background feature spanning 0 to the
 * highest SNP position is emitted first, followed by one 'snp' feature of
 * width 1 per SNP.
 *
 * @param array $data The SNP data, grouped by chromosome
 * @return array Patched chromosomal features, keyed by chromosome
 */
private function patchChromosomalFeatures(array $data): array
{
    $features = [];
    foreach ($data as $chromosome => $snps) {
        $positions = array_column($snps, 'pos');

        // max() throws a ValueError on an empty array, so a chromosome
        // without positions gets a zero-length background instead.
        $chromosomeLength = $positions === [] ? 0 : max($positions);

        $features[$chromosome][] = ['start' => 0, 'end' => $chromosomeLength, 'gie_stain' => 'gneg'];
        foreach ($snps as $snp) {
            $features[$chromosome][] = [
                'start' => $snp['pos'],
                'end' => $snp['pos'] + 1,
                'gie_stain' => 'snp'
            ];
        }
    }
    return $features;
}
129 |
/**
 * Generate CSV file from matched data
 *
 * The CSV path is the plot filename with a trailing .png/.jpeg/.jpg
 * extension (any letter case) swapped for .csv. A filename without such
 * an extension is used unchanged.
 *
 * @param array $matchedData The matched SNP data
 * @param string $filename The filename for the generated plot
 */
private function generateCSV(array $matchedData, string $filename): void
{
    // Anchored and case-insensitive: only the real extension is replaced,
    // never the same text inside a directory name, and "plot.PNG" no
    // longer ends up being overwritten with CSV content.
    $csvPath = preg_replace('/\.(?:png|jpe?g)$/i', '.csv', $filename) ?? $filename;
    CSVGenerator::generate($matchedData, $csvPath);
}
141 |
/**
 * Produce evenly spaced HSL colors.
 *
 * Hues are spread over the 360-degree circle in $numColors equal steps,
 * with saturation fixed at 100% and lightness at 50%.
 *
 * @param int $numColors Number of colors to generate
 * @return array Array of color strings; empty when $numColors is not positive
 */
private function generateColorScheme(int $numColors): array
{
    if ($numColors <= 0) {
        return [];
    }

    $step = 360 / $numColors;
    $palette = [];
    $index = 0;
    while ($index < $numColors) {
        $hue = $index * $step;
        $palette[] = "hsl({$hue}, 100%, 50%)";
        $index++;
    }

    return $palette;
}
157 | }
158 |
--------------------------------------------------------------------------------
/src/Snps/IO/Writer.php:
--------------------------------------------------------------------------------
1 | snps = $config['snps'];
27 | $this->filename = $config['filename'] ?? '';
28 | $this->vcf = $config['vcf'] ?? false;
29 | $this->atomic = $config['atomic'] ?? true;
30 | $this->vcfAltUnavailable = $config['vcf_alt_unavailable'] ?? '.';
31 | $this->vcfChromPrefix = $config['vcf_chrom_prefix'] ?? '';
32 | $this->vcfQcOnly = $config['vcf_qc_only'] ?? false;
33 | $this->vcfQcFilter = $config['vcf_qc_filter'] ?? false;
34 | $this->kwargs = $kwargs;
35 | }
36 |
/**
 * Write the SNPs to disk as VCF or CSV/TSV.
 *
 * A filename is generated first when none was configured. When there are
 * no SNPs nothing is written and an empty path is returned.
 *
 * @return array [path, extra_data]
 */
public function write(): array
{
    if (empty($this->filename)) {
        $this->filename = $this->generateFilename();
    }

    $rows = $this->snps->getSnps();
    if (empty($rows)) {
        return ['', []];
    }

    return $this->vcf
        ? $this->writeVcf($rows)
        : $this->writeCsv($rows);
}
60 |
/**
 * Write the SNPs as delimited text with an rsid/chrom/pos/genotype header.
 *
 * The separator comes from the 'sep' kwarg (tab by default); a comma
 * selects the .csv extension, anything else .tsv.
 *
 * @return array [path, []] on success, ['', []] when the file cannot be opened
 */
private function writeCsv(array $snpsData): array
{
    $delimiter = $this->kwargs['sep'] ?? "\t";
    $extension = $delimiter === ',' ? '.csv' : '.tsv';
    $target = $this->ensureExtension($this->filename, $extension);

    $out = fopen($target, 'w');
    if (!$out) {
        return ['', []];
    }

    // Header row
    fputcsv($out, ['rsid', 'chrom', 'pos', 'genotype'], $delimiter);

    // One row per SNP; missing fields are written as empty strings
    foreach ($snpsData as $rsid => $snp) {
        $record = [
            $rsid,
            $snp['chrom'] ?? '',
            $snp['pos'] ?? '',
            $snp['genotype'] ?? '',
        ];
        fputcsv($out, $record, $delimiter);
    }

    fclose($out);

    return [$target, []];
}
90 |
/**
 * Write the SNPs as a VCF file.
 *
 * SNPs for which formatVcfLine() yields no line are not written; they are
 * collected and returned as the second element of the result.
 *
 * @return array [path, skipped SNPs] on success, ['', []] when the file cannot be opened
 */
private function writeVcf(array $snpsData): array
{
    $target = $this->ensureExtension($this->filename, '.vcf');

    $out = fopen($target, 'w');
    if (!$out) {
        return ['', []];
    }

    $this->writeVcfHeader($out);

    $skipped = [];
    foreach ($snpsData as $rsid => $snp) {
        $line = $this->formatVcfLine($rsid, $snp);
        if (!$line) {
            $skipped[] = $snp;
            continue;
        }
        fwrite($out, $line . "\n");
    }

    fclose($out);

    return [$target, $skipped];
}
121 |
/**
 * Write the VCF meta lines (file format, source, assembly) followed by
 * the column header line.
 *
 * @param resource $handle Open, writable file handle
 */
private function writeVcfHeader($handle): void
{
    $headerLines = [
        "##fileformat=VCFv4.2",
        "##source=php-dna",
        "##assembly=" . $this->snps->getAssembly(),
        "#CHROM\tPOS\tID\tREF\tALT\tQUAL\tFILTER\tINFO\tFORMAT\tSAMPLE",
    ];

    foreach ($headerLines as $headerLine) {
        fwrite($handle, $headerLine . "\n");
    }
}
132 |
/**
 * Format a single VCF data line.
 *
 * The first genotype character is used as REF and, for a heterozygous
 * call, the second as ALT. This is a simplification: a full
 * implementation would look REF/ALT up in a reference genome.
 *
 * @param string $rsid Identifier written to the ID column
 * @param array  $snp  SNP row; 'chrom', 'pos' and 'genotype' are read
 * @return string|null Tab-separated VCF line, or null when chromosome,
 *                     position or genotype is missing
 */
private function formatVcfLine(string $rsid, array $snp): ?string
{
    $rawChrom = $snp['chrom'] ?? '';
    $pos = $snp['pos'] ?? '';
    $genotype = $snp['genotype'] ?? '';

    // Test the raw chromosome: once the configured prefix is prepended,
    // a missing chromosome would no longer look empty.
    if (empty($rawChrom) || empty($pos) || empty($genotype)) {
        return null;
    }

    $chrom = $this->vcfChromPrefix . $rawChrom;

    $ref = $genotype[0];
    $isHaploid = strlen($genotype) === 1;
    $isHomozygous = !$isHaploid && $genotype[1] === $ref;

    // Decide the sample genotype BEFORE ALT is replaced by the
    // "unavailable" marker; comparing afterwards can never detect a
    // homozygous call.
    if ($isHaploid) {
        $alt = $this->vcfAltUnavailable;
        $sample = '0';
    } elseif ($isHomozygous) {
        $alt = $this->vcfAltUnavailable;
        $sample = '0/0';
    } else {
        $alt = $genotype[1];
        $sample = '0/1';
    }

    $qual = '.';
    $filter = $this->vcfQcFilter ? 'PASS' : '.';
    $info = '.';
    $format = 'GT';

    return implode("\t", [$chrom, $pos, $rsid, $ref, $alt, $qual, $filter, $info, $format, $sample]);
}
163 |
/**
 * Build a default output filename of the form
 * "snps_output_<Y-m-d_H-i-s><ext>", where the extension is .vcf in VCF
 * mode and .tsv otherwise.
 */
private function generateFilename(): string
{
    $suffix = '.tsv';
    if ($this->vcf) {
        $suffix = '.vcf';
    }

    return 'snps_output_' . date('Y-m-d_H-i-s') . $suffix;
}
173 |
/**
 * Return the filename with the given extension appended unless it
 * already ends with it (case-sensitive comparison).
 */
private function ensureExtension(string $filename, string $extension): string
{
    return str_ends_with($filename, $extension)
        ? $filename
        : $filename . $extension;
}
184 | }
--------------------------------------------------------------------------------
/tests/SNPsTest.php:
--------------------------------------------------------------------------------
1 | snps = new SNPs();
18 | }
19 |
/**
 * A freshly created SNPs object is invalid, has a count of zero and
 * exposes no SNP rows.
 */
public function testEmptySNPs(): void
{
    $subject = $this->snps;

    $this->assertFalse($subject->isValid());
    $this->assertEquals(0, $subject->count());
    $this->assertEmpty($subject->getSnps());
}
26 |
/**
 * After setSnps() the object is valid, counts the supplied rows and
 * returns them unchanged from getSnps().
 */
public function testSetSnps(): void
{
    $rows = [
        'rs123' => ['rsid' => 'rs123', 'chrom' => '1', 'pos' => 1000, 'genotype' => 'AA'],
        'rs456' => ['rsid' => 'rs456', 'chrom' => '2', 'pos' => 2000, 'genotype' => 'AT'],
    ];

    $subject = $this->snps;
    $subject->setSnps($rows);

    $this->assertTrue($subject->isValid());
    $this->assertEquals(2, $subject->count());
    $this->assertEquals($rows, $subject->getSnps());
}
40 |
/**
 * detect_build() is expected to report build 37 for rs3094315 at
 * position 752566 on chromosome 1.
 */
public function testBuildDetection(): void
{
    // Known SNP position used as the build marker
    $this->snps->setSnps([
        'rs3094315' => ['rsid' => 'rs3094315', 'chrom' => '1', 'pos' => 752566, 'genotype' => 'AA'], // Build 37
    ]);

    $this->assertEquals(37, $this->snps->detect_build());
}
53 |
/**
 * determine_sex() is expected to return 'Female' for heterozygous X
 * genotypes and 'Male' for homozygous X genotypes.
 */
public function testSexDetermination(): void
{
    // Heterozygous X chromosome calls
    $heterozygousX = [
        'rs1' => ['rsid' => 'rs1', 'chrom' => 'X', 'pos' => 1000, 'genotype' => 'AT'],
        'rs2' => ['rsid' => 'rs2', 'chrom' => 'X', 'pos' => 2000, 'genotype' => 'CG'],
    ];
    $this->snps->setSnps($heterozygousX);
    $this->assertEquals('Female', $this->snps->determine_sex());

    // Homozygous X chromosome calls
    $homozygousX = [
        'rs3' => ['rsid' => 'rs3', 'chrom' => 'X', 'pos' => 3000, 'genotype' => 'AA'],
        'rs4' => ['rsid' => 'rs4', 'chrom' => 'X', 'pos' => 4000, 'genotype' => 'TT'],
    ];
    $this->snps->setSnps($homozygousX);
    $this->assertEquals('Male', $this->snps->determine_sex());
}
76 |
/**
 * get_count() returns the number of SNPs on one chromosome, or the total
 * when called without an argument.
 */
public function testChromosomeCounting(): void
{
    $this->snps->setSnps([
        'rs1' => ['rsid' => 'rs1', 'chrom' => '1', 'pos' => 1000, 'genotype' => 'AA'],
        'rs2' => ['rsid' => 'rs2', 'chrom' => '1', 'pos' => 2000, 'genotype' => 'AT'],
        'rs3' => ['rsid' => 'rs3', 'chrom' => 'X', 'pos' => 3000, 'genotype' => 'AA'],
    ]);

    $onChromosome1 = $this->snps->get_count('1');
    $this->assertEquals(2, $onChromosome1);

    $onChromosomeX = $this->snps->get_count('X');
    $this->assertEquals(1, $onChromosomeX);

    $overall = $this->snps->get_count();
    $this->assertEquals(3, $overall);
}
91 |
/**
 * heterozygous() and homozygous() split the SNPs of a chromosome by
 * genotype and keep the rsid keys.
 */
public function testHeterozygousHomozygous(): void
{
    $this->snps->setSnps([
        'rs1' => ['rsid' => 'rs1', 'chrom' => '1', 'pos' => 1000, 'genotype' => 'AA'], // Homozygous
        'rs2' => ['rsid' => 'rs2', 'chrom' => '1', 'pos' => 2000, 'genotype' => 'AT'], // Heterozygous
        'rs3' => ['rsid' => 'rs3', 'chrom' => '1', 'pos' => 3000, 'genotype' => 'TT'], // Homozygous
    ]);

    $mixedCalls = $this->snps->heterozygous('1');
    $uniformCalls = $this->snps->homozygous('1');

    $this->assertCount(1, $mixedCalls);
    $this->assertCount(2, $uniformCalls);
    $this->assertArrayHasKey('rs2', $mixedCalls);
    foreach (['rs1', 'rs3'] as $expectedKey) {
        $this->assertArrayHasKey($expectedKey, $uniformCalls);
    }
}
111 |
/**
 * sort() orders SNPs by chromosome, then by position.
 */
public function testSorting(): void
{
    $this->snps->setSnps([
        'rs3' => ['rsid' => 'rs3', 'chrom' => '2', 'pos' => 1000, 'genotype' => 'AA'],
        'rs1' => ['rsid' => 'rs1', 'chrom' => '1', 'pos' => 2000, 'genotype' => 'AT'],
        'rs2' => ['rsid' => 'rs2', 'chrom' => '1', 'pos' => 1000, 'genotype' => 'TT'],
        'rs4' => ['rsid' => 'rs4', 'chrom' => 'X', 'pos' => 500, 'genotype' => 'GG'],
    ]);
    $this->snps->sort();

    $actualOrder = array_keys($this->snps->getSnps());

    // Expected: rs2 (chr1:1000), rs1 (chr1:2000), rs3 (chr2:1000), rs4 (chrX:500)
    $expectedOrder = ['rs2', 'rs1', 'rs3', 'rs4'];
    foreach ($expectedOrder as $index => $rsid) {
        $this->assertEquals($rsid, $actualOrder[$index]);
    }
}
134 |
/**
 * getAssembly() maps build numbers to assembly names:
 * 37 => GRCh37, 38 => GRCh38, 36 => NCBI36.
 */
public function testGetAssembly(): void
{
    $expectedAssemblies = [
        [37, 'GRCh37'],
        [38, 'GRCh38'],
        [36, 'NCBI36'],
    ];

    foreach ($expectedAssemblies as [$build, $assembly]) {
        $this->snps->setBuild($build);
        $this->assertEquals($assembly, $this->snps->getAssembly());
    }
}
146 |
/**
 * getSummary() returns an array with the assembly, build and count, and
 * includes 'chromosomes' and 'sex' entries.
 */
public function testGetSummary(): void
{
    $this->snps->setSnps([
        'rs1' => ['rsid' => 'rs1', 'chrom' => '1', 'pos' => 1000, 'genotype' => 'AA'],
        'rs2' => ['rsid' => 'rs2', 'chrom' => 'X', 'pos' => 2000, 'genotype' => 'AT'],
    ]);
    $this->snps->setBuild(37);

    $report = $this->snps->getSummary();

    $this->assertIsArray($report);
    $this->assertEquals('GRCh37', $report['assembly']);
    $this->assertEquals(37, $report['build']);
    $this->assertEquals(2, $report['count']);
    foreach (['chromosomes', 'sex'] as $requiredKey) {
        $this->assertArrayHasKey($requiredKey, $report);
    }
}
166 | }
--------------------------------------------------------------------------------
/src/MatchKits.php:
--------------------------------------------------------------------------------
1 | visualization = $visualization;
44 | $this->triangulation = $triangulation;
45 | }
46 |
/**
 * Match the first two loaded DNA kits
 *
 * A SNP of the first kit is a match when the second kit holds a SNP with
 * the same chromosome, position and genotype. The result replaces any
 * previously matched data and contains rows of the first kit; a row is
 * repeated once per matching row in the second kit.
 *
 * @throws Exception If less than two kits are loaded or matching fails
 */
public function matchKits(): void
{
    if (count($this->kitsData) < 2) {
        throw new Exception("At least two DNA kits are required for matching.");
    }

    $this->matchedData = []; // Reset matched data

    // Positions are only meaningful within a chromosome, so the
    // chromosome is part of the identity. A missing 'chrom' is treated as
    // an empty chromosome name.
    $identityOf = static fn (array $snp): string =>
        ($snp['chrom'] ?? '') . "\0" . $snp['pos'] . "\0" . $snp['genotype'];

    try {
        // array_values(): do not depend on the kits being stored under
        // the keys 0 and 1.
        [$firstKit, $secondKit] = array_values($this->kitsData);

        // Index the second kit once instead of rescanning it for every
        // SNP of the first kit (linear instead of quadratic work).
        $secondKitIndex = [];
        foreach ($secondKit->getSnps() as $snp2) {
            $identity = $identityOf($snp2);
            $secondKitIndex[$identity] = ($secondKitIndex[$identity] ?? 0) + 1;
        }

        foreach ($firstKit->getSnps() as $snp1) {
            $hits = $secondKitIndex[$identityOf($snp1)] ?? 0;
            for ($i = 0; $i < $hits; $i++) {
                $this->matchedData[] = $snp1;
            }
        }
    } catch (Exception $e) {
        throw new Exception("Error matching DNA kits: " . $e->getMessage(), 0, $e);
    }
}
72 |
/**
 * Return the SNP data collected by the most recent matching or
 * triangulation run.
 *
 * @return array The matched SNP data
 */
public function getMatchedData(): array
{
    return $this->matchedData;
}
80 |
/**
 * Store the DNA kits to work on.
 *
 * The list is accepted only if every element is an SNPs instance; otherwise
 * nothing is stored and an exception is thrown.
 *
 * @param SNPs[] $kitsData The kit data to load
 * @throws Exception If any element is not an SNPs object
 */
public function setKitsData(array $kitsData): void
{
    $invalidKits = array_filter(
        $kitsData,
        static function ($candidate) {
            return !($candidate instanceof SNPs);
        }
    );

    if (count($invalidKits) > 0) {
        throw new Exception("Invalid input: All elements must be instances of SNPs class.");
    }

    $this->kitsData = $kitsData;
}
96 |
/**
 * Triangulate the loaded kits.
 *
 * Delegates to the injected triangulation object's compareMultipleKits() and
 * stores its result as the matched data.
 *
 * @throws Exception If less than three kits are loaded, or if triangulation
 *                   fails (the original failure is attached as the previous exception)
 */
public function triangulateKits(): void
{
    if (count($this->kitsData) < 3) {
        throw new Exception("At least three DNA kits are required for triangulation.");
    }

    try {
        $this->matchedData = $this->triangulation->compareMultipleKits($this->kitsData);
    } catch (Exception $e) {
        // Keep the original exception reachable through getPrevious().
        throw new Exception("Error triangulating DNA kits: " . $e->getMessage(), 0, $e);
    }
}
114 |
/**
 * Visualize the matched data.
 *
 * Passes the matched data and the plot settings to the injected visualization
 * object's plotChromosomes().
 *
 * @param string $filename The filename for the generated plot
 * @param string $title The title for the plot
 * @param string $build The genome build version
 * @param string $format The image format for the plot
 * @throws Exception If there is no matched data, or if visualization fails
 *                   (the original failure is attached as the previous exception)
 */
public function visualizeMatchedData(string $filename, string $title, string $build, string $format): void
{
    if (empty($this->matchedData)) {
        throw new Exception("No matched data to visualize. Run matchKits() or triangulateKits() first.");
    }

    try {
        $this->visualization->plotChromosomes($this->matchedData, $filename, $title, $build, $format);
    } catch (Exception $e) {
        // Keep the original exception reachable through getPrevious().
        throw new Exception("Error visualizing matched data: " . $e->getMessage(), 0, $e);
    }
}
136 | }
137 |
// Interactive command-line driver: asks for kit files, matches (2 kits) or
// triangulates (3+ kits) them, then renders a plot of the matched data.
// NOTE(review): this runs whenever the file is loaded under the CLI SAPI, not
// only when it is executed directly - confirm that including the class from
// other CLI scripts (e.g. a test runner) is not affected.
if (php_sapi_name() === "cli") {
    // Print a prompt and return the trimmed answer; end of input reads as "".
    $ask = static function (string $prompt): string {
        echo $prompt;
        $line = fgets(STDIN);

        return $line === false ? '' : trim($line);
    };

    try {
        $visualization = new Visualization();
        $triangulation = new Triangulation();
        $matchKits = new MatchKits($visualization, $triangulation);

        $numKits = intval(
            $ask("Enter the number of kits to compare (2 for matching, 3 or more for triangulation): ")
        );

        if ($numKits < 2) {
            throw new Exception("At least two kits are required for comparison.");
        }

        $kitPaths = [];
        for ($i = 0; $i < $numKits; $i++) {
            $path = $ask("Enter file path for Kit " . ($i + 1) . ": ");
            if (!file_exists($path)) {
                throw new Exception("File not found: $path");
            }
            $kitPaths[] = $path;
        }

        $kitsData = array_map(static function ($path) {
            return new SNPs($path);
        }, $kitPaths);

        $matchKits->setKitsData($kitsData);

        // Announce the step before starting it, so the message is visible
        // while the (potentially slow) comparison is running.
        if ($numKits === 2) {
            echo "Matching kits...\n";
            $matchKits->matchKits();
        } else {
            echo "Triangulating kits...\n";
            $matchKits->triangulateKits();
        }

        $filename = $ask("Enter filename for the visualization (e.g., matched_data.png): ");
        $title = $ask("Enter title for the plot: ");
        $build = $ask("Enter genome build version (e.g., GRCh37): ");
        $format = strtolower($ask("Enter image format (png, jpeg, or jpg): "));

        if (!in_array($format, ['png', 'jpeg', 'jpg'], true)) {
            throw new Exception("Invalid image format. Please use png, jpeg, or jpg.");
        }

        $matchKits->visualizeMatchedData($filename, $title, $build, $format);

        echo "Matched data visualization has been generated: $filename\n";
    } catch (Exception $e) {
        echo "Error: " . $e->getMessage() . "\n";
        exit(1);
    }
}
199 | ?>
// NOTE(review): this method sits after the closing PHP tag and outside any
// class body, so as written it is not part of the MatchKits class - confirm
// whether it is a leftover that should be removed or merged.
// Collects the SNPs whose keys are present in every loaded kit (values are
// taken from the first kit) and stores them, re-indexed, as the matched data.
public function triangulateKits() {
    $perKitSnps = array_map(
        function ($kit) {
            return $kit->getSnps();
        },
        $this->kitsData
    );

    // Keep only the entries of the first kit whose keys exist in all kits.
    $shared = array_intersect_key(...$perKitSnps);

    // Drop the keys: the matched data is a plain list.
    $this->matchedData = array_values($shared);
}
208 |
--------------------------------------------------------------------------------
/tests/Snps/IO/ReaderTest.php:
--------------------------------------------------------------------------------
1 | init_resource_attributes();
19 |
20 | Utils::gzip_file(
21 | "tests/resources/gsa_rsid_map.txt",
22 | $resourcesDir . "/gsa_rsid_map.txt.gz"
23 | );
24 | Utils::gzip_file(
25 | "tests/resources/gsa_chrpos_map.txt",
26 | $resourcesDir . "/gsa_chrpos_map.txt.gz"
27 | );
28 | Utils::gzip_file(
29 | "tests/resources/dbsnp_151_37_reverse.txt",
30 | $resourcesDir . "/dbsnp_151_37_reverse.txt.gz"
31 | );
32 | }
33 |
// Counterpart of setupGsaTest(): creates a Resources object for the default
// "resources" directory and re-initialises its resource attributes.
public static function teardownGsaTest() {
    (new Resources(resources_dir: "resources"))->init_resource_attributes();
}
40 |
// Parses the stock 23andMe fixture and expects the "23andMe" source.
public function testRead23AndMe()
{
    $fixture = "tests/input/23andme.txt";

    $this->run_parse_tests($fixture, "23andMe");
}
45 |
// Parses the 23andMe "allele" fixture and expects the "23andMe" source.
public function testRead23AndMeAllele()
{
    $fixture = "tests/input/23andme_allele.txt";

    $this->run_parse_tests($fixture, "23andMe");
}
50 |
// Parses the 23andMe "win" fixture and expects the "23andMe" source.
public function testRead23AndMeWin()
{
    $fixture = "tests/input/23andme_win.txt";

    $this->run_parse_tests($fixture, "23andMe");
}
55 |
/**
 * Run a parsing test against a copy of $file that has a build comment inserted.
 *
 * The copy is written to a temporary file: every line of $file is kept, and
 * directly after the line with index $insertion_line an extra line built from
 * sprintf($comment_str, $build_str) is added. The callback is then invoked with
 * (temporary path, $source, $build_int, true). The temporary file is removed
 * afterwards, even when the callback throws.
 *
 * @param callable $run_parsing_tests_func Callback that performs the parsing assertions
 * @param string   $build_str      Text placed into the inserted comment line
 * @param int      $build_int      Build number handed to the callback
 * @param string   $file           Input file to copy
 * @param string   $source         Source name handed to the callback
 * @param string   $comment_str    sprintf() template for the inserted line
 * @param int      $insertion_line Zero-based index of the line after which to insert
 * @throws \RuntimeException If $file cannot be read
 */
protected function run_build_detection_test(
    $run_parsing_tests_func,
    $build_str,
    $build_int,
    $file="tests/input/testvcf.vcf",
    $source="vcf",
    $comment_str="##%s\n",
    $insertion_line=1
) {
    $lines = file($file);
    if ($lines === false) {
        throw new \RuntimeException("Unable to read input file: $file");
    }

    $s = "";
    foreach ($lines as $i => $line) {
        $s .= $line;
        if ($i == $insertion_line) {
            $s .= sprintf($comment_str, $build_str);
        }
    }

    $file_build_comment = tempnam(sys_get_temp_dir(), basename($file));
    file_put_contents($file_build_comment, $s);

    try {
        call_user_func(
            $run_parsing_tests_func,
            $file_build_comment,
            $source,
            $build_int,
            true
        );
    } finally {
        // Do not leave the generated copy behind, whatever the outcome.
        if (is_file($file_build_comment)) {
            unlink($file_build_comment);
        }
    }
}
85 |
// A "# build 36" comment inserted into the 23andMe fixture must be detected as build 36.
public function testRead23AndMeBuild36()
{
    $parser = [$this, 'run_parse_tests'];
    $fixture = "tests/input/23andme.txt";

    $this->run_build_detection_test($parser, "build 36", 36, $fixture, "23andMe", "# %s\n");
}
97 |
98 |
// A "# build 37" comment inserted into the 23andMe fixture must be detected as build 37.
public function testRead23AndMeBuild37()
{
    $parser = [$this, 'run_parse_tests'];
    $fixture = "tests/input/23andme.txt";

    $this->run_build_detection_test($parser, "build 37", 37, $fixture, "23andMe", "# %s\n");
}
110 |
// A "# build 38" comment inserted into the 23andMe fixture must be detected as build 38.
public function testRead23AndMeBuild38()
{
    $parser = [$this, 'run_parse_tests'];
    $fixture = "tests/input/23andme.txt";

    $this->run_build_detection_test($parser, "build 38", 38, $fixture, "23andMe", "# %s\n");
}
122 |
// Parses the AncestryDNA fixture (https://www.ancestry.com) and expects the "AncestryDNA" source.
public function testReadAncestry()
{
    $fixture = "tests/input/ancestry.txt";

    $this->run_parse_tests($fixture, "AncestryDNA");
}
128 |
/**
 * An AncestryDNA file whose first data line contains an extra tab separator
 * must still parse into the full set of SNPs.
 *
 * The input is generated on the fly (CRLF line endings, 100 SNPs rs1..rs100 on
 * chromosome 1 at positions 101..200, genotype AA) and written to a temporary
 * file, which is removed again once the parsing checks have run.
 */
public function testReadAncestryExtraTab()
{
    $total_snps = 100;
    $s = "#AncestryDNA\r\n";
    $s .= "rsid\tchromosome\tposition\tallele1\tallele2\r\n";
    // add extra tab separator in first line
    $s .= "rs1\t1\t101\t\tA\tA\r\n";
    // generate remainder of lines
    for ($i = 1; $i < $total_snps; $i++) {
        $s .= "rs" . (1 + $i) . "\t1\t" . (101 + $i) . "\tA\tA\r\n";
    }

    // Expected result: the same SNPs without the stray separator.
    $offsets = range(0, $total_snps - 1);
    $snps_df = $this->create_snp_df(
        array_map(function ($i) {
            return "rs" . (1 + $i);
        }, $offsets),
        "1",
        array_map(function ($i) {
            return 101 + $i;
        }, $offsets),
        "AA"
    );

    $path = tempnam(sys_get_temp_dir(), "ancestry_extra_tab.txt");
    file_put_contents($path, $s);

    try {
        $this->run_parse_tests($path, "AncestryDNA", snps_df: $snps_df);
    } finally {
        // Do not leave the generated input behind, whatever the outcome.
        if (is_file($path)) {
            unlink($path);
        }
    }
}
160 |
// Parses the AncestryDNA multi-separator fixture (https://www.ancestry.com)
// and expects the "AncestryDNA" source.
public function testReadAncestryMultiSep()
{
    $fixture = "tests/input/ancestry_multi_sep.txt";

    $this->run_parse_tests($fixture, "AncestryDNA");
}
166 |
// Codigo46 (https://codigo46.com.mx): prepares the GSA resources in the system
// temp directory and restores the defaults afterwards.
// NOTE(review): the parsing call is commented out, so this test currently
// performs no assertions - confirm whether it should be re-enabled.
public function testReadCodigo46()
{
    $resourcesDir = sys_get_temp_dir();

    static::setupGsaTest($resourcesDir);
    // $this->run_parse_tests("tests/input/codigo46.txt", "Codigo46");
    static::teardownGsaTest();
}
174 |
175 | // def test_read_tellmeGen(self):
176 | // # https://www.tellmegen.com/
177 | // with tempfile.TemporaryDirectory() as tmpdir:
178 | // self._setup_gsa_test(tmpdir)
179 | // self.run_parsing_tests("tests/input/tellmeGen.txt", "tellmeGen")
180 | // self._teardown_gsa_test()
181 |
182 | // def test_read_DNALand(self):
183 | // # https://dna.land/
184 | // self.run_parsing_tests("tests/input/DNALand.txt", "DNA.Land")
185 |
186 | // def test_read_circledna(self):
187 | // # https://circledna.com/
188 | // df = self.generic_snps()
189 | // df.drop("rs5", inplace=True) # only called genotypes
190 | // self.run_parsing_tests("tests/input/circledna.txt", "CircleDNA", snps_df=df)
191 |
192 | // def test_read_ftdna(self):
193 | // # https://www.familytreedna.com
194 | // self.run_parsing_tests("tests/input/ftdna.csv", "FTDNA")
195 |
196 | // def test_read_ftdna_concat_gzip_extra_data(self):
197 | // # https://www.familytreedna.com
198 |
199 | // total_snps1 = 10
200 | // total_snps2 = 10
201 | // # generate content of first file
202 | // s1 = "RSID,CHROMOSOME,POSITION,RESULT\r\n"
203 | // for i in range(0, total_snps1):
204 | // s1 += f'"rs{1 + i}","1","{101 + i}","AA"\r\n'
205 |
206 | // # generate content of second file
207 | // s2 = "RSID,CHROMOSOME,POSITION,RESULT\r\n"
208 | // for i in range(0, total_snps2):
209 | // s2 += f'"rs{total_snps1 + 1 + i}","1","{ total_snps1 + 101 + i}","AA"\r\n'
210 | // snps_df = self.create_snp_df(
211 | // rsid=[f"rs{1 + i}" for i in range(0, total_snps1 + total_snps2)],
212 | // chrom="1",
213 | // pos=[101 + i for i in range(0, total_snps1 + total_snps2)],
214 | // genotype="AA",
215 | // )
216 |
217 | // with tempfile.TemporaryDirectory() as tmpdir:
218 | // file1 = os.path.join(tmpdir, "ftdna_concat_gzip1.csv")
219 | // file1_gz = f"{file1}.gz"
220 | // file2 = os.path.join(tmpdir, "ftdna_concat_gzip2.csv")
221 | // file2_gz = f"{file2}.gz"
222 | // path = os.path.join(tmpdir, "ftdna_concat_gzip.csv.gz")
223 |
224 | // # write individual files
225 | // with open(file1, "w") as f:
226 | // f.write(s1)
227 | // with open(file2, "w") as f:
228 | // f.write(s2)
229 |
230 | // # compress files
231 | // gzip_file(file1, file1_gz)
232 | // gzip_file(file2, file2_gz)
233 |
234 | // # concatenate gzips
235 | // with open(file1_gz, "rb") as f:
236 | // data = f.read()
237 | // with open(file2_gz, "rb") as f:
238 | // data += f.read()
239 |
240 | // # add extra data
241 | // data += b"extra data"
242 |
243 | // # write file with concatenated gzips and extra data
244 | // with open(path, "wb") as f:
245 | // f.write(data)
246 |
247 | // self.make_parsing_assertions(
248 | // self.parse_file(path), "FTDNA", False, 37, False, snps_df
249 | // )
250 | // self.make_parsing_assertions(
251 | // self.parse_bytes(path), "FTDNA", False, 37, False, snps_df
252 | // )
253 |
254 | // def test_read_ftdna_famfinder(self):
255 | // # https://www.familytreedna.com
256 | // self.run_parsing_tests("tests/input/ftdna_famfinder.csv", "FTDNA")
257 |
258 | // def test_read_ftdna_second_header(self):
259 | // # https://www.familytreedna.com
260 |
261 | // total_snps1 = 100
262 | // total_snps2 = 10
263 | // s = "RSID,CHROMOSOME,POSITION,RESULT\n"
264 | // # generate first chunk of lines
265 | // for i in range(0, total_snps1):
266 | // s += f'"rs{1 + i}","1","{101 + i}","AA"\n'
267 | // # add second header
268 | // s += "RSID,CHROMOSOME,POSITION,RESULT\n"
269 | // # generate second chunk of lines
270 | // for i in range(0, total_snps2):
271 | // s += f'"rs{total_snps1 + 1 + i}","1","{total_snps1 + 101 + i}","AA"\n'
272 |
273 | // snps_df = self.create_snp_df(
274 | // rsid=[f"rs{1 + i}" for i in range(0, total_snps1 + total_snps2)],
275 | // chrom="1",
276 | // pos=[101 + i for i in range(0, total_snps1 + total_snps2)],
277 | // genotype="AA",
278 | // )
279 |
280 | // with tempfile.TemporaryDirectory() as tmpdir:
281 | // path = os.path.join(tmpdir, "ftdna_second_header.txt")
282 | // with open(path, "w") as f:
283 | // f.write(s)
284 |
285 | // self.run_parsing_tests(path, "FTDNA", snps_df=snps_df)
286 |
287 |
288 |
// Parses the generic CSV fixture and expects the "generic" source.
public function testReadGenericCsv()
{
    $fixture = "tests/input/generic.csv";

    $this->run_parse_tests($fixture, "generic");
}
293 |
// Parses the generic TSV fixture and expects the "generic" source.
public function testReadGenericTsv()
{
    $fixture = "tests/input/generic.tsv";

    $this->run_parse_tests($fixture, "generic");
}
298 |
// Parses the generic TSV fixture with an extra column and expects the "generic" source.
public function testReadGenericExtraColumnTsv()
{
    $fixture = "tests/input/generic_extra_column.tsv";

    $this->run_parse_tests($fixture, "generic");
}
303 |
// Parses the generic TSV fixture with a header comment and expects the "generic" source.
public function testReadGenericHeaderComment()
{
    $fixture = "tests/input/generic_header_comment.tsv";

    $this->run_parse_tests($fixture, "generic");
}
308 |
// Parses the generic multi-rsid TSV fixture and expects the "generic" source.
public function testReadGenericMultiRsidTsv()
{
    $fixture = "tests/input/generic_multi_rsid.tsv";

    $this->run_parse_tests($fixture, "generic");
}
313 |
// Parses the generic TSV fixture without a header row and expects the "generic" source.
public function testReadGenericNoHeader()
{
    $fixture = "tests/input/generic_no_header.tsv";

    $this->run_parse_tests($fixture, "generic");
}
318 |
// Parses the generic TSV fixture with non-standard columns and expects the "generic" source.
public function testReadGenericNonStandardColumns()
{
    $fixture = "tests/input/generic_non_standard_columns.tsv";

    $this->run_parse_tests($fixture, "generic");
}
326 |
327 | }
--------------------------------------------------------------------------------
/tests/Snps/IO/WriterTes.php:
--------------------------------------------------------------------------------
1 | _reference_sequences["GRCh37"] = [];
27 |
28 | $output = $tmpdir1 . '/' . $outputFile;
29 | $tmpdir2 = sys_get_temp_dir() . '/' . uniqid();
30 | mkdir($tmpdir2);
31 |
32 | $dest = $tmpdir2 . '/generic.fa.gz';
33 | gzip_file("tests/input/generic.fa", $dest);
34 |
35 | $seq = new ReferenceSequence(
36 | "1",
37 | "",
38 | $dest
39 | );
40 |
41 | $r->_reference_sequences["GRCh37"]["1"] = $seq;
42 |
43 | if (!$filename) {
44 | $result = $s->{$funcStr}($kwargs);
45 | } else {
46 | $result = $s->{$funcStr}($filename, $kwargs);
47 | }
48 |
49 | $this->assertSame($result, $output);
50 |
51 | if ($expectedOutput) {
52 | // Read result
53 | $actual = file_get_contents($output);
54 |
55 | // Read expected result
56 | $expected = file_get_contents($expectedOutput);
57 |
58 | $this->assertStringContainsString($expected, $actual);
59 | }
60 |
61 |
62 | $this->runParsingTestsVcf($output);
63 | } else {
64 | $tmpdir = sys_get_temp_dir() . '/' . uniqid();
65 | mkdir($tmpdir);
66 |
67 | $snps = new SNPs("tests/input/generic.csv", output_dir: $tmpdir);
68 | $dest = $tmpdir . '/' . $outputFile;
69 |
70 | if (!$filename) {
71 | $this->assertSame($snps->{$funcStr}(), $dest);
72 | } else {
73 | $this->assertSame($snps->{$funcStr}($filename), $dest);
74 | }
75 |
76 |
77 | $this->run_parse_tests($dest, "generic");
78 | }
79 | }
80 |
// to_csv without a filename is expected to produce "generic_GRCh37.csv".
public function testToCsv()
{
    $expectedName = "generic_GRCh37.csv";

    $this->runWriterTest("to_csv", outputFile: $expectedName);
}
85 |
// to_csv with an explicit filename is expected to produce exactly that file.
public function testToCsvFilename()
{
    $name = "generic.csv";

    $this->runWriterTest("to_csv", filename: $name, outputFile: $name);
}
90 |
// to_tsv without a filename is expected to produce "generic_GRCh37.txt".
public function testToTsv()
{
    $expectedName = "generic_GRCh37.txt";

    $this->runWriterTest("to_tsv", outputFile: $expectedName);
}
95 |
// to_tsv with an explicit filename is expected to produce exactly that file.
public function testToTsvFilename()
{
    $name = "generic.txt";

    $this->runWriterTest("to_tsv", filename: $name, outputFile: $name);
}
100 |
// to_vcf without a filename is expected to produce "vcf_GRCh37.vcf" whose
// content contains the reference VCF output.
public function testToVcf()
{
    $expectedName = "vcf_GRCh37.vcf";
    $reference = "tests/output/vcf_generic.vcf";

    $this->runWriterTest("to_vcf", outputFile: $expectedName, expectedOutput: $reference);
}
109 |
// to_vcf with an explicit filename is expected to produce exactly that file.
public function testToVcfFilename()
{
    $name = "vcf.vcf";

    $this->runWriterTest("to_vcf", filename: $name, outputFile: $name);
}
114 |
// to_vcf with the "chrom_prefix" option set to "chr" is expected to produce
// output containing the chr-prefixed reference VCF.
public function testToVcfChromPrefix()
{
    $writerOptions = ["chrom_prefix" => "chr"];
    $reference = "tests/output/vcf_chrom_prefix_chr.vcf";

    $this->runWriterTest(
        "to_vcf",
        outputFile: "vcf_GRCh37.vcf",
        expectedOutput: $reference,
        kwargs: $writerOptions
    );
}
124 |
/**
 * A "snps v..." version string in the header of a saved TSV must not be
 * mistaken for build information when the file is read back.
 *
 * The test saves the generic fixture as TSV, replaces the header line that
 * contains "snps v" with a line carrying a long development version string,
 * and runs the parsing checks on the modified file. The temporary directory
 * is cleaned up even when an assertion fails.
 *
 * NOTE(review): other tests in this file pass output_dir as a named argument
 * and call to_tsv / run_parse_tests - confirm which spelling the SNPs class
 * and the test base class actually provide.
 */
public function testSaveSnpsFalsePositiveBuild()
{
    // Create a temporary directory (tempnam reserves a unique name as a file first)
    $tmpdir = tempnam(sys_get_temp_dir(), 'tmp');
    unlink($tmpdir);
    mkdir($tmpdir);

    // Define output file path
    $output = $tmpdir . "/generic_GRCh37.txt";

    try {
        // Instantiate SNPs with input file and output directory
        $snps = new SNPs("tests/input/generic.csv", ["output_dir" => $tmpdir]);

        // Save SNPs to TSV
        $this->assertEquals($output, $snps->toTsv());

        // Modify the output file to add version information
        $s = "";
        foreach (file($output) as $line) {
            $s .= str_contains($line, "snps v")
                ? "# Generated by snps v1.2.3.post85.dev0+gb386302, https://pypi.org/project/snps/\n"
                : $line;
        }

        file_put_contents($output, $s);

        // Run parsing tests on the modified output
        $this->runParsingTests($output, "generic");
    } finally {
        // Clean up, whatever the outcome
        if (is_file($output)) {
            unlink($output);
        }
        if (is_dir($tmpdir)) {
            rmdir($tmpdir);
        }
    }
}
161 |
/**
 * A "snps v..." version string in the ##source header of a saved VCF must not
 * be mistaken for build information when the file is read back.
 *
 * Uses two uniquely named temporary directories (output and reference
 * sequence) so that runs cannot collide, and removes them even when an
 * assertion fails.
 *
 * NOTE(review): gzip_file() and the array-style ReferenceSequence/SNPs
 * arguments differ from the spellings used elsewhere in this file - confirm
 * against the actual class signatures.
 */
public function testSaveSnpsVcfFalsePositiveBuild()
{
    $tmpdir1 = sys_get_temp_dir() . '/' . uniqid('tmpdir1_', true);
    $tmpdir2 = sys_get_temp_dir() . '/' . uniqid('tmpdir2_', true);
    $output = $tmpdir1 . "/vcf_GRCh37.vcf";
    $dest = $tmpdir2 . "/generic.fa.gz";

    mkdir($tmpdir1);
    mkdir($tmpdir2);

    try {
        // Instantiate SNPs with input file and output directory
        $snps = new SNPs("tests/input/testvcf.vcf", ["output_dir" => $tmpdir1]);

        // Setup resource to use test FASTA reference sequence
        $r = new Resources();
        $r->_reference_sequences["GRCh37"] = [];

        gzip_file("tests/input/generic.fa", $dest);

        $seq = new ReferenceSequence(["ID" => "1", "path" => $dest]);

        $r->_reference_sequences["GRCh37"]["1"] = $seq;

        $this->assertEquals($output, $snps->toVcf());

        // Replace the header line carrying the version with a development version string
        $s = "";
        foreach (file($output) as $line) {
            $s .= str_contains($line, "snps v")
                ? '##source="vcf; snps v1.2.3.post85.dev0+gb386302; https://pypi.org/project/snps/"' . "\n"
                : $line;
        }

        file_put_contents($output, $s);

        $this->runParsingTestsVcf($output);
    } finally {
        // Clean up, whatever the outcome
        foreach ([$output, $dest] as $file) {
            if (is_file($file)) {
                unlink($file);
            }
        }
        foreach ([$tmpdir1, $tmpdir2] as $dir) {
            if (is_dir($dir)) {
                rmdir($dir);
            }
        }
    }
}
206 |
207 |
/**
 * Phased VCF input saved with toVcf() must read back as phased data.
 *
 * Uses two uniquely named temporary directories (output and reference
 * sequence) so that runs cannot collide, and removes them even when an
 * assertion fails.
 *
 * NOTE(review): gzip_file() and the array-style ReferenceSequence/SNPs
 * arguments differ from the spellings used elsewhere in this file - confirm
 * against the actual class signatures.
 */
public function testSaveSnpsVcfPhased()
{
    $tmpdir1 = sys_get_temp_dir() . '/' . uniqid('tmpdir1_', true);
    $tmpdir2 = sys_get_temp_dir() . '/' . uniqid('tmpdir2_', true);
    $output = $tmpdir1 . "/vcf_GRCh37.vcf";
    $dest = $tmpdir2 . "/generic.fa.gz";

    mkdir($tmpdir1);
    mkdir($tmpdir2);

    try {
        // Instantiate SNPs with input phased VCF file and output directory
        $s = new SNPs("tests/input/testvcf_phased.vcf", ["output_dir" => $tmpdir1]);

        // Setup resource to use test FASTA reference sequence
        $r = new Resources();
        $r->_reference_sequences["GRCh37"] = [];

        gzip_file("tests/input/generic.fa", $dest);

        $seq = new ReferenceSequence(["ID" => "1", "path" => $dest]);

        $r->_reference_sequences["GRCh37"]["1"] = $seq;

        // Save phased data to VCF
        $this->assertEquals($output, $s->toVcf());

        // Read saved VCF with phased data
        $this->runParsingTestsVcf($output, true);
    } finally {
        // Clean up, whatever the outcome
        foreach ([$output, $dest] as $file) {
            if (is_file($file)) {
                unlink($file);
            }
        }
        foreach ([$tmpdir1, $tmpdir2] as $dir) {
            if (is_dir($dir)) {
                rmdir($dir);
            }
        }
    }
}
243 |
244 |
/**
 * Phased VCF input saved with toTsv() must read back as phased data.
 *
 * Uses a uniquely named temporary directory so that runs cannot collide, and
 * removes it even when an assertion fails.
 */
public function testSaveSnpsPhased()
{
    $tmpdir = sys_get_temp_dir() . '/' . uniqid('tmpdir_', true);
    $dest = $tmpdir . "/vcf_GRCh37.txt";

    mkdir($tmpdir);

    try {
        // Instantiate SNPs with input phased VCF file and output directory
        $s = new SNPs("tests/input/testvcf_phased.vcf", ["output_dir" => $tmpdir]);

        // Save phased data to TSV
        $this->assertEquals($dest, $s->toTsv());

        // Read saved TSV with phased data
        $this->runParsingTestsVcf($dest, true);
    } finally {
        // Clean up, whatever the outcome
        if (is_file($dest)) {
            unlink($dest);
        }
        if (is_dir($tmpdir)) {
            rmdir($tmpdir);
        }
    }
}
265 |
266 |
267 | public function runVcfQcTest($expectedOutput, $vcfQcOnly, $vcfQcFilter, $cluster = "c1")
268 | {
269 | $tmpdir1 = sys_get_temp_dir() . '/tmpdir1';
270 | mkdir($tmpdir1);
271 |
272 | // Instantiate SNPs with input CSV file and output directory
273 | $s = new SNPs("tests/input/generic.csv", ["output_dir" => $tmpdir1]);
274 |
275 | // Setup resource to use test FASTA reference sequence
276 | $r = new Resources();
277 | $r->setReferenceSequences(["GRCh37" => []]);
278 |
279 | $output = $tmpdir1 . "/generic_GRCh37.vcf";
280 |
281 | $tmpdir2 = sys_get_temp_dir() . '/tmpdir2';
282 | mkdir($tmpdir2);
283 | $dest = $tmpdir2 . "/generic.fa.gz";
284 | gzipFile("tests/input/generic.fa", $dest);
285 |
286 | $seq = new ReferenceSequence(ID: "1", path: $dest);
287 | $r->getReferenceSequences("GRCh37")["1"] = $seq;
288 |
289 | // Save data to VCF with quality control settings
290 | $options = ["qc_only" => $vcfQcOnly, "qc_filter" => $vcfQcFilter];
291 | $this->assertEquals($output, $s->toVcf($options));
292 |
293 | // Read result
294 | $actual = file_get_contents($output);
295 |
296 | // Read expected result
297 | $expected = file_get_contents($expectedOutput);
298 |
299 | $this->assertStringContainsString($expected, $actual);
300 |
301 | if (!$vcfQcFilter || !$cluster) {
302 | $this->assertStringNotContainsString("##FILTER=runVcfQcTest(
315 | "tests/output/vcf_qc/qc_only_F_qc_filter_F.vcf",
316 | false,
317 | false
318 | );
319 | }
320 |
// VCF QC scenario: qc_only disabled, qc_filter enabled, default cluster.
public function testSaveVcfQcOnlyFalseQcFilterTrue()
{
    $reference = "tests/output/vcf_qc/qc_only_F_qc_filter_T.vcf";

    $this->runVcfQcTest($reference, vcfQcOnly: false, vcfQcFilter: true);
}
329 |
// VCF QC scenario: qc_only enabled, qc_filter disabled, default cluster.
public function testSaveVcfQcOnlyTrueQcFilterFalse()
{
    $reference = "tests/output/vcf_qc/qc_only_T_qc_filter_F.vcf";

    $this->runVcfQcTest($reference, vcfQcOnly: true, vcfQcFilter: false);
}
338 |
// VCF QC scenario: qc_only enabled, qc_filter enabled, default cluster.
public function testSaveVcfQcOnlyTrueQcFilterTrue()
{
    $reference = "tests/output/vcf_qc/qc_only_T_qc_filter_T.vcf";

    $this->runVcfQcTest($reference, vcfQcOnly: true, vcfQcFilter: true);
}
347 |
// VCF QC scenario without a cluster: qc_only disabled, qc_filter disabled.
public function testSaveVcfNoClusterQcOnlyFalseQcFilterFalse()
{
    $reference = "tests/output/vcf_qc/qc_only_F_qc_filter_F.vcf";

    $this->runVcfQcTest($reference, vcfQcOnly: false, vcfQcFilter: false, cluster: "");
}
357 |
// VCF QC scenario without a cluster: qc_only disabled, qc_filter enabled.
public function testSaveVcfNoClusterQcOnlyFalseQcFilterTrue()
{
    $reference = "tests/output/vcf_qc/qc_only_F_qc_filter_T.vcf";

    $this->runVcfQcTest($reference, vcfQcOnly: false, vcfQcFilter: true, cluster: "");
}
367 |
// VCF QC scenario without a cluster: qc_only enabled, qc_filter disabled.
public function testSaveVcfNoClusterQcOnlyTrueQcFilterFalse()
{
    $reference = "tests/output/vcf_qc/qc_only_T_qc_filter_F.vcf";

    $this->runVcfQcTest($reference, vcfQcOnly: true, vcfQcFilter: false, cluster: "");
}
377 |
// VCF QC scenario without a cluster: qc_only enabled, qc_filter enabled.
public function testSaveVcfNoClusterQcOnlyTrueQcFilterTrue()
{
    $reference = "tests/output/vcf_qc/qc_only_T_qc_filter_T.vcf";

    $this->runVcfQcTest($reference, vcfQcOnly: true, vcfQcFilter: true, cluster: "");
}
387 | }
388 |
--------------------------------------------------------------------------------
/src/Resources.php:
--------------------------------------------------------------------------------
1 |
9 | * @copyright Copyright (c) 2020-2023, Liberu Software Ltd
10 | * @license MIT
11 | *
12 | * @link http://github.com/laravel-liberu/php-dna
13 | */
14 |
15 | namespace Dna;
16 |
17 | use Exception;
18 |
19 | /**
20 | * Class Resources.
21 | */
22 | class Resources extends \Dna\Snps\SNPsResources {
23 |
24 | protected $_genetic_map = '{}';
25 | protected $_genetic_map_name = '';
26 | protected $_cytoBand_hg19 = [];
27 | protected $_knownGene_hg19 = [];
28 | protected $_kgXref_hg19 = [];
29 |
/**
 * Create the resources helper.
 *
 * @param string $resources_dir Directory handed on to the parent constructor
 */
public function __construct($resources_dir = 'resources')
{
    // Pass the value directly; the former "$resources_dir = $resources_dir"
    // was a no-op self-assignment, not a named argument.
    parent::__construct($resources_dir);
}
34 |
35 |
36 | // {
37 | // // Check if the current genetic map is already HapMap2
38 | // if ($this->_genetic_map_name !== "HapMap2") {
39 | // // If not already HapMap2, load the HapMap2 genetic map and set it as the current genetic map
40 | // $this->_genetic_map = $this->_load_genetic_map_HapMapII_GRCh37(
41 | // $this->_get_path_genetic_map_HapMapII_GRCh37()
42 | // );
43 | // $this->_genetic_map_name = "HapMap2";
44 | // }
45 |
46 | // // Return the HapMap2 genetic map in GRCh37 format
47 | // return $this->_genetic_map;
48 | // }
49 |
50 | // /**
51 | // * Returns the genetic map for a given population in the 1000 Genomes Project GRCh37 reference genome.
52 | // *
53 | // * @param string $pop The population code (e.g. "CEU", "YRI", "CHB") for which to retrieve the genetic map.
54 | // * @return array The genetic map for the specified population.
55 | // */
56 | // public function get_genetic_map_1000G_GRCh37(string $pop): array
57 | // {
58 | // // Check if the requested genetic map is already loaded
59 | // if ($this->_genetic_map_name !== $pop) {
60 | // // If not, load the genetic map from file
61 | // $this->_genetic_map = $this->_load_genetic_map_1000G_GRCh37(
62 | // $this->_get_path_genetic_map_1000G_GRCh37(pop: $pop)
63 | // );
64 | // // Update the name of the loaded genetic map
65 | // $this->_genetic_map_name = $pop;
66 | // }
67 |
68 | // // Return the loaded genetic map
69 | // return $this->_genetic_map;
70 | // }
71 |
/**
 * Returns the cytogenetic banding information for the hg19 reference genome.
 *
 * The data is loaded through loadCytoBand() on first use and cached on the
 * instance; an empty cached value triggers a (re)load.
 *
 * @return array The cytogenetic banding information for hg19.
 */
public function getCytoBandHg19(): array
{
    // Cached and non-empty: nothing to load.
    if (!empty($this->_cytoBand_hg19)) {
        return $this->_cytoBand_hg19;
    }

    $cytoBandPath = $this->getPathCytoBandHg19();
    $this->_cytoBand_hg19 = $this->loadCytoBand($cytoBandPath);

    return $this->_cytoBand_hg19;
}
87 |
88 | // /**
89 | // * Returns the knownGene_hg19 data.
90 | // *
91 | // * @return array The knownGene_hg19 data.
92 | // */
93 | // public function get_knownGene_hg19() {
94 | // // Check if the _knownGene_hg19 property is empty.
95 | // if ($this->_knownGene_hg19->empty()) {
96 | // // If it is empty, load the knownGene_hg19 data from the file path.
97 | // $this->_knownGene_hg19 = $this->_load_knownGene(
98 | // $this->_get_path_knownGene_hg19()
99 | // );
100 | // }
101 | // // Return the knownGene_hg19 data.
102 | // return $this->_knownGene_hg19;
103 | // }
104 |
/**
 * Returns the kgXref data for the hg19 reference genome.
 *
 * The data is loaded through loadKgXref() on first use and cached on the
 * instance; an empty cached value triggers a (re)load.
 *
 * @return array The kgXref data for hg19.
 */
public function getKgXrefHg19(): array
{
    // Cached and non-empty: nothing to load.
    if (!empty($this->_kgXref_hg19)) {
        return $this->_kgXref_hg19;
    }

    $kgXrefPath = $this->getPathKgXrefHg19();
    $this->_kgXref_hg19 = $this->loadKgXref($kgXrefPath);

    return $this->_kgXref_hg19;
}
122 |
123 | // public function _load_genetic_map_HapMapII_GRCh37($filename)
124 | // {
125 | // $genetic_map = array( );
126 | // $archive = new PharData($filename);
127 | // foreach($archive as $file) {
128 | // if (strpos("genetic_map",$file["name"])===true){
129 | // $df = array( );
130 | // if (($handle = fopen($file, "r")) !== FALSE) {
131 | // while (($data = fgetcsv($handle,"\t")) !== FALSE) {
132 | // $df["Position(bp)"]=$data["pos"];
133 | // $df["Rate(cM/Mb)"]=$data["rate"];
134 | // $df["Map(cM)"]=$data["map"];
135 | // }
136 | // fclose($handle);
137 | // }
138 | // $start_pos = strpos($file["name"],"chr") + 3;
139 | // $end_pos = strpos($file["name"],".");
140 | // $genetic_map[substr($file["name"],$start_pos,$end_pos)] = $df;
141 | // }
142 | // }
143 | // $genetic_map["X"] = array_merge(
144 | // $genetic_map["X_par1"], $genetic_map["X"], $genetic_map["X_par2"]
145 | // );
146 | // $genetic_map["X_par1"]=array( );
147 | // $genetic_map["X_par2"]=array( );
148 | // return $genetic_map;
149 | // }
150 |
/**
 * Loads a genetic map from an archive in the 1000 Genomes Project format (GRCh37).
 *
 * Every entry of the archive is handed to processGeneticMapFile(), which adds
 * one chromosome's map to the result.
 *
 * @param string $filename The path to the archive to load.
 * @return array An associative array of genetic maps, keyed by chromosome.
 */
function loadGeneticMap1000GGRCh37($filename)
{
    $geneticMap = [];

    // Fully qualified on purpose: this file is in namespace Dna and does not
    // import PharData, so the bare name would resolve to Dna\PharData.
    $archive = new \PharData($filename);

    foreach ($archive as $member) {
        $geneticMap = $this->processGeneticMapFile($member->getPathname(), $geneticMap);
    }

    return $geneticMap;
}
170 |
/**
 * Processes a single genetic map file and adds the data to the $geneticMap array.
 *
 * The file is read as tab-separated text through the zlib stream wrapper. The
 * first row is the header; data rows whose column count differs from the
 * header are skipped. Each kept row contributes its "Position(bp)",
 * "Rate(cM/Mb)" and "Map(cM)" columns as 'pos', 'rate' and 'map'. The result
 * is stored under the second "-"-separated part of the file's base name.
 *
 * @param string $filepath The path to the genetic map file.
 * @param array $geneticMap The array to add the genetic map data to.
 * @return array The updated $geneticMap array.
 * @throws Exception If the file cannot be opened.
 */
function processGeneticMapFile($filepath, $geneticMap)
{
    $file = gzopen($filepath, 'r');
    if ($file === false) {
        throw new Exception("Unable to open genetic map file: $filepath");
    }

    $df = [];
    try {
        $header = fgetcsv($file, 0, "\t");

        // An empty file has no header and therefore no rows to read.
        if ($header !== false) {
            $columnCount = count($header);

            while (($data = fgetcsv($file, 0, "\t")) !== false) {
                // Skip blank or malformed rows.
                if (count($data) != $columnCount) {
                    continue;
                }

                $row = array_combine($header, $data);
                $df[] = [
                    "pos" => $row["Position(bp)"],
                    "rate" => $row["Rate(cM/Mb)"],
                    "map" => $row["Map(cM)"],
                ];
            }
        }
    } finally {
        // Always release the stream handle.
        gzclose($file);
    }

    // The chromosome label is the second "-"-separated part of the file name.
    $chrom = explode("-", basename($filepath))[1];
    $geneticMap[$chrom] = $df;

    return $geneticMap;
}
204 |
205 | // // public function downloadFile($url, $filename, $compress=False, $timeout=30)
206 | // // {
207 | // // if(strpos($url, "ftp://") !== false) {
208 | // // $url=str_replace($url,"ftp://", "http://");
209 | // // }
210 | // // if ($compress && substr($filename,strlen($filename)-3,strlen($filename)) != ".gz"){
211 | // // $filename = $filename+".gz";
212 | // // }
213 | // // $destination = join($this->resources_dir, $filename);
214 |
215 | // // if (!mkdir($destination)){
216 | // // return "";
217 | // // }
218 | // // if (file_exists($destination)) {
219 | // // $file_url = $destination;
220 | // // header('Content-Type: application/octet-stream');
221 | // // header('Content-Description: File Transfer');
222 | // // header('Content-Disposition: attachment; filename=' . $filename);
223 | // // header('Expires: 0');
224 | // // header('Cache-Control: must-revalidate');
225 | // // header('Pragma: public');
226 | // // header('Content-Length: ' . filesize($file_url));
227 | // // readfile($file_url);
228 |
229 | // // // if $compress
230 | // // // $this->_write_data_to_gzip(f, data)
231 | // // // else
232 | // // // f.write(data)
233 | // // }
234 | // // return $destination;
235 | // // }
236 |
237 | // /**
238 | // * Load UCSC knownGene table.
239 | // *
240 | // * @param string $filename Path to knownGene file
241 | // *
242 | // * @return array KnownGene table (associative array)
243 | // */
244 | // public static function loadKnownGene(string $filename): array
245 | // {
246 | // $file = fopen($filename, 'r');
247 | // $headers = [
248 | // 'name',
249 | // 'chrom',
250 | // 'strand',
251 | // 'txStart',
252 | // 'txEnd',
253 | // 'cdsStart',
254 | // 'cdsEnd',
255 | // 'exonCount',
256 | // 'exonStarts',
257 | // 'exonEnds',
258 | // 'proteinID',
259 | // 'alignID',
260 | // ];
261 | // $knownGene = [];
262 |
263 | // while (($row = fgetcsv($file, 0, "\t")) !== false) {
264 | // $rowData = array_combine($headers, $row);
265 | // $rowData['chrom'] = substr($rowData['chrom'], 3);
266 | // $knownGene[$rowData['name']] = $rowData;
267 | // }
268 |
269 | // fclose($file);
270 |
271 | // return $knownGene;
272 | // }
273 |
274 | // /**
275 | // * Load UCSC kgXref table.
276 | // *
277 | // * @param string $filename Path to kgXref file
278 | // *
279 | // * @return array kgXref table (associative array)
280 | // */
281 | // public static function loadKgXref(string $filename): array
282 | // {
283 | // $file = fopen($filename, 'r');
284 | // $headers = [
285 | // 'kgID',
286 | // 'mRNA',
287 | // 'spID',
288 | // 'spDisplayID',
289 | // 'geneSymbol',
290 | // 'refseq',
291 | // 'protAcc',
292 | // 'description',
293 | // 'rfamAcc',
294 | // 'tRnaName',
295 | // ];
296 | // $kgXref = [];
297 |
298 | // while (($row = fgetcsv($file, 0, "\t")) !== false) {
299 | // $rowData = array_combine($headers, $row);
300 | // $kgXref[$rowData['kgID']] = $rowData;
301 | // }
302 |
303 | // fclose($file);
304 |
305 | // return $kgXref;
306 | // }
307 |
/**
 * Resolve the local path of the UCSC cytoBand table for hg19 / GRCh37.
 *
 * Delegates to downloadFile() with the UCSC source URL and the local file name.
 *
 * @return string Path to cytoBand_hg19.txt.gz, as returned by downloadFile()
 */
public function getPathCytoBandHg19(): string
{
    $sourceUrl = 'ftp://hgdownload.cse.ucsc.edu/goldenPath/hg19/database/cytoBand.txt.gz';
    $localName = 'cytoBand_hg19.txt.gz';

    return $this->downloadFile($sourceUrl, $localName);
}
320 |
321 | // /**
322 | // * Download file from a given URL if not exists and return its local path.
323 | // *
324 | // * @param string $url URL of the file to download
325 | // * @param string $filename Local name for the downloaded file
326 | // *
327 | // * @return string Local path to the downloaded file
328 | // */
329 | // protected function downloadFile(string $url, string $filename): string
330 | // {
331 | // $path = __DIR__ . '/' . $filename;
332 | // if (!file_exists($path)) {
333 | // $parsedUrl = parse_url($url);
334 | // $host = $parsedUrl['host'];
335 | // $remotePath = $parsedUrl['path'];
336 |
337 | // $conn = ftp_connect($host);
338 | // if ($conn) {
339 | // $loggedIn = ftp_login($conn, 'anonymous', '');
340 | // if ($loggedIn) {
341 | // ftp_pasv($conn, true);
342 | // $downloaded = ftp_get($conn, $path, $remotePath, FTP_BINARY);
343 | // if (!$downloaded) {
344 | // throw new Exception("Failed to download the file '{$url}'.");
345 | // }
346 | // ftp_close($conn);
347 | // } else {
348 | // throw new Exception("Failed to log in to the FTP server '{$host}'.");
349 | // }
350 | // } else {
351 | // throw new Exception("Failed to connect to the FTP server '{$host}'.");
352 | // }
353 | // }
354 |
355 | // return $path;
356 | // }
357 |
358 | // /**
359 | // * Get local path to HapMap Phase II genetic map for hg19 / GRCh37 (HapMapII), downloading if necessary
360 | // *
361 | // * @return string Path to genetic_map_HapMapII_GRCh37.tar.gz
362 | // */
363 | // public function getPathGeneticMapHapMapIIGRCh37(): string
364 | // {
365 | // return $this->downloadFile(
366 | // 'ftp://ftp.ncbi.nlm.nih.gov/hapmap/recombination/2011-01_phaseII_B37/genetic_map_HapMapII_GRCh37.tar.gz',
367 | // 'genetic_map_HapMapII_GRCh37.tar.gz'
368 | // );
369 | // }
370 |
371 | // /**
372 | // * Get local path to population-specific 1000 Genomes Project genetic map,
373 | // * downloading if necessary.
374 | // *
375 | // * @param string $pop
376 | // * @return string path to {pop}_omni_recombination_20130507.tar
377 | // */
378 | // public function getGeneticMap1000G_GRCh37($pop)
379 | // {
380 | // $filename = "{$pop}_omni_recombination_20130507.tar";
381 | // return $this->downloadFile(
382 | // "ftp://ftp.1000genomes.ebi.ac.uk/vol1/ftp/technical/working/20130507_omni_recombination_rates/{$filename}",
383 | // $filename
384 | // );
385 | // }
386 |
387 | // /**
388 | // * Downloads the knownGene.txt.gz file for the hg19 genome assembly from the UCSC Genome Browser FTP server.
389 | // *
390 | // * @return string The path to the downloaded file.
391 | // */
392 | // public function get_path_knownGene_hg19(): string {
393 | // // Download the file from the UCSC Genome Browser FTP server.
394 | // // The file is located at ftp://hgdownload.cse.ucsc.edu/goldenPath/hg19/database/knownGene.txt.gz
395 | // // and will be saved as knownGene_hg19.txt.gz in the current directory.
396 | // return $this->download_file(
397 | // "ftp://hgdownload.cse.ucsc.edu/goldenPath/hg19/database/knownGene.txt.gz",
398 | // "knownGene_hg19.txt.gz"
399 | // );
400 | // }
401 |
402 | // /**
403 | // * Downloads the kgXref.txt.gz file for the hg19 genome assembly from the UCSC Genome Browser FTP server.
404 | // *
405 | // * @return string The path to the downloaded file.
406 | // */
407 | // public function get_path_kgXref_hg19(): string {
408 | // // Download the file from the UCSC Genome Browser FTP server.
409 | // // The file is located at ftp://hgdownload.cse.ucsc.edu/goldenPath/hg19/database/kgXref.txt.gz
410 | // // and will be saved as kgXref_hg19.txt.gz in the current directory.
411 | // return $this->download_file(
412 | // "ftp://hgdownload.cse.ucsc.edu/goldenPath/hg19/database/kgXref.txt.gz",
413 | // "kgXref_hg19.txt.gz"
414 | // );
415 | // }
416 |
417 |
418 |
419 | // public function download_example_datasets() : array
420 | // {
421 | // return [
422 | // $this->downloadFile(
423 | // "https://opensnp.org/data/662.23andme.340",
424 | // "662.23andme.340.txt.gz",
425 | // $compress=True
426 | // ),
427 | // $this->downloadFile(
428 | // "https://opensnp.org/data/662.ftdna-illumina.341",
429 | // "662.ftdna-illumina.341.csv.gz",
430 | // $compress=True
431 | // ),
432 | // $this->downloadFile(
433 | // "https://opensnp.org/data/663.23andme.305",
434 | // "663.23andme.305.txt.gz",
435 | // $compress=True
436 | // ),
437 | // $this->downloadFile(
438 | // "https://opensnp.org/data/4583.ftdna-illumina.3482",
439 | // "4583.ftdna-illumina.3482.csv.gz"
440 | // ),
441 | // $this->downloadFile(
442 | // "https://opensnp.org/data/4584.ftdna-illumina.3483",
443 | // "4584.ftdna-illumina.3483.csv.gz"
444 | // ),
445 | // ];
446 | // }
447 | }
448 |
449 | ?>
--------------------------------------------------------------------------------
/tests/Snps/SnpsMergeTest.php:
--------------------------------------------------------------------------------
1 | assertEquals(count($results), count($expectedResults));
17 |
18 | foreach ($results as $i => $result) {
19 | $expectedResult = $expectedResults[$i];
20 |
21 | $this->assertEquals(
22 | [
23 | "common_rsids",
24 | "discrepant_genotype_rsids",
25 | "discrepant_position_rsids",
26 | "merged",
27 | ],
28 | sort(array_keys($result))
29 | );
30 |
31 | if (array_key_exists("merged", $expectedResult)) {
32 | if ($expectedResult["merged"]) {
33 | $this->assertTrue($result["merged"]);
34 | } else {
35 | $this->assertFalse($result["merged"]);
36 | }
37 | } else {
38 | $this->assertFalse($result["merged"]);
39 | }
40 |
41 | foreach (["common_rsids", "discrepant_position_rsids", "discrepant_genotype_rsids"] as $key) {
42 | if (array_key_exists($key, $expectedResult)) {
43 | $this->assertEquals(
44 | $result[$key],
45 | $expectedResult[$key],
46 | true,
47 | true
48 | );
49 | } else {
50 | $this->assertTrue($result[$key]->isEmpty());
51 | $this->assertEquals($result[$key]->getName(), "rsid");
52 | }
53 | }
54 | }
55 | }
56 |
/**
 * Merging a 23andMe file into a generic file combines the sources, and the
 * merged SNPs survive a round trip through the written TSV file.
 */
public function testSourceSNPs()
{
    $tmpdir = sys_get_temp_dir();

    $initial = new SNPs("tests/input/GRCh37.csv", output_dir: $tmpdir);
    $this->assertEquals("generic", $initial->getSource());

    // Keep the merge results: assertResults() inspects the per-merge result
    // entries, not an SNPs object.
    $results = $initial->merge([new SNPs("tests/input/23andme.txt")]);

    $this->assertEquals("generic, 23andMe", $initial->getSource());
    $this->assertEquals(["generic", "23andMe"], $initial->getAllSources());

    $mergedFile = $tmpdir . "/generic__23andMe_GRCh37.txt";
    $this->assertEquals($mergedFile, $initial->toTsv());

    // Reload the written file and make sure it matches the in-memory merge.
    $fromFile = new SNPs($mergedFile);
    $this->assertEquals($initial->getSnps(), $fromFile->getSnps());

    $this->assertResults($results, [["merged" => true]]);
}
77 |
/**
 * Merging the same GRCh37 file twice into an empty SNPs object yields the
 * GRCh37 SNPs, with every rsid reported as common on the second merge.
 */
public function testMergeList()
{
    $s = new SNPs();
    $results = $s->merge([new SNPs("tests/input/GRCh37.csv"), new SNPs("tests/input/GRCh37.csv")]);

    // PHPUnit expects the expected value first, then the actual value.
    $this->assertEquals(self::snps_GRCh37(), $s->getSnps());
    $this->assertEquals("generic, generic", $s->getSource());
    $this->assertEquals(["generic", "generic"], $s->getAllSources());

    $expectedResults = [
        ["merged" => true],
        [
            "merged" => true,
            "common_rsids" => [
                "rs3094315",
                "rs2500347",
                "rsIndelTest",
                "rs11928389",
            ],
        ],
    ];
    $this->assertResults($results, $expectedResults);
}
100 |
/**
 * Merging a GRCh37 file into an NCBI36 object with the default merge options
 * produces no discrepancies and leaves the NCBI36 SNPs unchanged.
 */
public function testMergeRemapping()
{
    $s = new SNPs("tests/input/NCBI36.csv");

    $results = $s->merge([new SNPs("tests/input/GRCh37.csv")]);

    // Check that there are no discrepancies in merge positions and genotypes
    $this->assertCount(0, $s->getDiscrepantMergePositions());
    $this->assertCount(0, $s->getDiscrepantMergeGenotypes());

    // Expected value first, actual value second (PHPUnit convention).
    $this->assertEquals(self::snps_NCBI36(), $s->getSnps());

    // Check the results of the merge operation
    $expectedResults = [
        [
            "merged" => true,
            "common_rsids" => [
                "rs3094315",
                "rs2500347",
                "rsIndelTest",
                "rs11928389",
            ],
        ],
    ];
    $this->assertResults($results, $expectedResults);
}
128 |
/**
 * Merging a GRCh37 file into an NCBI36 object with `false` as the second
 * merge() argument reports four discrepant positions and one discrepant
 * genotype, and nulls the discrepant genotype.
 */
public function testMergeRemapFalse()
{
    $s = new SNPs("tests/input/NCBI36.csv");

    $results = $s->merge([new SNPs("tests/input/GRCh37.csv")], false);

    // Check the count of discrepancies in merge positions
    $this->assertCount(4, $s->getDiscrepantMergePositions());
    // Compare the discrepancies in merge positions with the reported results
    $this->assertSame(
        $s->getDiscrepantMergePositions(),
        $results[0]["discrepant_position_rsids"]
    );

    // Check the count of discrepancies in merge genotypes
    $this->assertCount(1, $s->getDiscrepantMergeGenotypes());
    // Compare the discrepancies in merge genotypes with the reported results
    $this->assertSame(
        $s->getDiscrepantMergeGenotypes(),
        $results[0]["discrepant_genotype_rsids"]
    );

    // Check the count of discrepancies in both positions and genotypes
    $this->assertCount(4, $s->getDiscrepantMergePositionsGenotypes());
    // Compare the combined discrepancies with the reported position discrepancies
    $this->assertSame(
        $s->getDiscrepantMergePositionsGenotypes(),
        $results[0]["discrepant_position_rsids"]
    );

    // Expected SNPs: NCBI36 data with the discrepant genotype set to null
    $expected = self::snps_NCBI36();
    $expected["rs11928389"]["genotype"] = null;

    // Expected value first, actual value second (PHPUnit convention).
    $this->assertEquals($expected, $s->getSnps());

    // Check the results of the merge operation
    $expectedResults = [
        [
            "merged" => true,
            "common_rsids" => [
                "rs3094315",
                "rs2500347",
                "rsIndelTest",
                "rs11928389",
            ],
            "discrepant_position_rsids" => [
                "rs3094315",
                "rs2500347",
                "rsIndelTest",
                "rs11928389",
            ],
            "discrepant_genotype_rsids" => ["rs11928389"],
        ],
    ];
    $this->assertResults($results, $expectedResults);
}
187 |
188 |
/**
 * Merging two phased SNPs objects keeps the result flagged as phased.
 */
public function testMergePhased()
{
    $s1 = new SNPs("tests/input/generic.csv");
    $s2 = new SNPs("tests/input/generic.csv");
    $s1->setPhased(true);
    $s2->setPhased(true);

    $results = $s1->merge([$s2]);

    // Check if 's1' is marked as phased
    $this->assertTrue($s1->isPhased());

    // Expected value first, actual value second (PHPUnit convention).
    $this->assertEquals(self::genericSnps(), $s1->getSnps());

    // Check the results of the merge operation
    $expectedResults = [
        [
            "merged" => true,
            "common_rsids" => [
                "rs1", "rs2", "rs3", "rs4",
                "rs5", "rs6", "rs7", "rs8"
            ],
        ],
    ];
    $this->assertResults($results, $expectedResults);
}
216 |
/**
 * Merging an unphased SNPs object into a phased one clears the phased flag.
 */
public function testMergeUnphased()
{
    $s1 = new SNPs("tests/input/generic.csv");
    $s2 = new SNPs("tests/input/generic.csv");
    $s1->setPhased(true);

    $results = $s1->merge([$s2]);

    // Check if 's1' is marked as unphased (not phased)
    $this->assertFalse($s1->isPhased());

    // Expected value first, actual value second (PHPUnit convention).
    $this->assertEquals(self::genericSnps(), $s1->getSnps());

    // Check the results of the merge operation
    $expectedResults = [
        [
            "merged" => true,
            "common_rsids" => [
                "rs1", "rs2", "rs3", "rs4",
                "rs5", "rs6", "rs7", "rs8"
            ],
        ],
    ];
    $this->assertResults($results, $expectedResults);
}
243 |
/**
 * A non-existent input file yields an empty merge result, while the valid
 * file that follows it is still merged.
 */
public function testMergeNonExistentFile()
{
    $s = new SNPs();
    $results = $s->merge([
        new SNPs("tests/input/non_existent_file.csv"),
        new SNPs("tests/input/GRCh37.csv")
    ]);

    // Expected value first, actual value second (PHPUnit convention).
    $this->assertEquals(self::snps_GRCh37(), $s->getSnps());

    // Check the results of the merge operation
    $expectedResults = [
        [], // No merge for the non-existent file
        ["merged" => true],
    ];
    $this->assertResults($results, $expectedResults);
}
262 |
/**
 * An invalid (empty) input file yields an empty merge result, while the valid
 * file before it is merged.
 */
public function testMergeInvalidFile()
{
    $s = new SNPs();
    $results = $s->merge([
        new SNPs("tests/input/GRCh37.csv"),
        new SNPs("tests/input/empty.txt")
    ]);

    // Expected value first, actual value second (PHPUnit convention).
    $this->assertEquals(self::snps_GRCh37(), $s->getSnps());

    // Check the results of the merge operation
    $expectedResults = [
        ["merged" => true], // Merge with the valid file
        [], // No merge for the invalid file
    ];
    $this->assertResults($results, $expectedResults);
}
281 |
/**
 * A merge whose discrepant positions exceed the threshold is not applied:
 * no discrepancies are recorded and the target SNPs stay unchanged.
 */
public function testMergeExceedDiscrepantPositionsThreshold()
{
    $s1 = new SNPs("tests/input/generic.csv");
    $s2 = new SNPs("tests/input/generic.csv");

    // Assigning through getSnps()[...] only changes a temporary array (unless
    // getSnps() returns by reference), so use the setter to really introduce
    // the discrepant position.
    $s2->setSnpsValue("rs1", "pos", 100);

    // NOTE(review): other tests pass `false` / "Y" as the second merge() argument;
    // confirm that an options array is accepted in this position.
    $results = $s1->merge([$s2], ["discrepant_positions_threshold" => 0]);
    $this->assertCount(0, $s1->getDiscrepantMergePositions());
    $this->assertCount(0, $s1->getDiscrepantMergeGenotypes());
    $this->assertCount(0, $s1->getDiscrepantMergePositionsGenotypes());

    // Expected value first, actual value second (PHPUnit convention).
    $this->assertEquals(self::genericSnps(), $s1->getSnps());

    // Check the results of the merge operation
    $expectedResults = [[]];
    $this->assertResults($results, $expectedResults);
}
300 |
/**
 * A merge whose discrepant genotypes exceed the threshold is not applied:
 * no discrepancies are recorded and the target SNPs stay unchanged.
 */
public function testMergeExceedDiscrepantGenotypesThreshold()
{
    $s1 = new SNPs("tests/input/generic.csv");
    $s2 = new SNPs("tests/input/generic.csv");

    // Assigning through getSnps()[...] only changes a temporary array (unless
    // getSnps() returns by reference), so use the setter to really introduce
    // the discrepant genotype.
    $s2->setSnpsValue("rs1", "genotype", "CC");

    // NOTE(review): other tests pass `false` / "Y" as the second merge() argument;
    // confirm that an options array is accepted in this position.
    $results = $s1->merge([$s2], ["discrepant_genotypes_threshold" => 0]);
    $this->assertCount(0, $s1->getDiscrepantMergePositions());
    $this->assertCount(0, $s1->getDiscrepantMergeGenotypes());
    $this->assertCount(0, $s1->getDiscrepantMergePositionsGenotypes());

    // Expected value first, actual value second (PHPUnit convention).
    $this->assertEquals(self::genericSnps(), $s1->getSnps());

    // Check the results of the merge operation
    $expectedResults = [[]];
    $this->assertResults($results, $expectedResults);
}
319 |
/**
 * Split the discrepant_snps.csv fixture into two SNP files, merge them, and
 * compare the recorded discrepancies and merged values with the fixture's
 * expected columns.
 */
public function testMergingFilesDiscrepantSnps()
{
    $tmpDir = sys_get_temp_dir();
    $dest1 = $tmpDir . "/discrepant_snps1.csv";
    $dest2 = $tmpDir . "/discrepant_snps2.csv";

    // Read the CSV file; the header row is taken from offset 1 (the second line).
    $csv = Reader::createFromPath("tests/input/discrepant_snps.csv", "r");
    $csv->setHeaderOffset(1);
    // NOTE(review): $records is iterated twice below — confirm the returned
    // iterator can be rewound.
    $records = $csv->getRecords();

    // Create arrays for the first and second CSV files
    // NOTE(review): no rsid column is copied into either file — confirm the
    // reader can identify SNPs without it.
    $file1Data = [];
    $file2Data = [];
    foreach ($records as $record) {
        $file1Data[] = [
            "chromosome" => $record["chrom"],
            "position" => $record["pos_file1"],
            "genotype" => $record["genotype_file1"],
        ];
        $file2Data[] = [
            "chromosome" => $record["chrom"],
            "position" => $record["pos_file2"],
            "genotype" => $record["genotype_file2"],
        ];
    }

    // Write arrays to CSV files, each starting with a header row
    $file1Writer = Writer::createFromPath($dest1, "w");
    $file1Writer->insertOne(["chromosome", "position", "genotype"]);
    $file1Writer->insertAll($file1Data);

    $file2Writer = Writer::createFromPath($dest2, "w");
    $file2Writer->insertOne(["chromosome", "position", "genotype"]);
    $file2Writer->insertAll($file2Data);

    // Merge both generated files into an empty SNPs object
    $s = new SNPs();
    $s->merge([new SNPs($dest1), new SNPs($dest2)]);

    // Expected data, taken from the fixture's expectation columns
    $expected = [];
    foreach ($records as $record) {
        $expected[] = [
            "chromosome" => $record["chrom"],
            "discrepant_position" => $record["discrepant_position"],
            "discrepant_genotype" => $record["discrepant_genotype"],
            "pos" => $record["expected_position"],
            "genotype" => $record["expected_genotype"],
        ];
    }

    // Create an SNPs object from the expected data so it is sorted the same way
    $expectedSNPs = new SNPs();
    $expectedSNPs->setSnps($expected);
    $expectedSNPs->sort();
    $expected = $expectedSNPs->getSnps();

    // Assert results
    $this->assertCount(count($expected), $s->getDiscrepantMergePositions());
    $this->assertCount(count($expected), $s->getDiscrepantMergeGenotypes());
    // NOTE(review): other tests index getSnps() by rsid, so "pos"/"genotype"
    // as top-level keys looks suspicious — confirm the intended structure.
    $this->assertArrayHasKey("pos", $s->getSnps());
    $this->assertArrayHasKey("genotype", $s->getSnps());

    // Perform comparisons entry by entry
    foreach ($expected as $key => $value) {
        $this->assertEquals($value["discrepant_position"], $s->getDiscrepantMergePositions()[$key]);
        $this->assertEquals($value["discrepant_genotype"], $s->getDiscrepantMergeGenotypes()[$key]);
        $this->assertEquals($value["pos"], $s->getSnps()[$key]["pos"]);
        $this->assertEquals($value["genotype"], $s->getSnps()[$key]["genotype"]);
    }
}
391 |
/**
 * Merging an SNPs object with itself appends its duplicate and discrepant-XY
 * entries to themselves, while the untouched collections remain empty.
 */
public function testAppendingDfs()
{
    $row = ["rsid" => "rs1", "chrom" => "1", "pos" => 1, "genotype" => "AA"];

    $s = new SNPs();
    $s->setSnps([$row]);
    $s->setDuplicate([$row]);
    $s->setDiscrepantXY([$row]);

    $s->merge([$s]);

    // After the self-merge every appended collection holds the row twice.
    $df = [$row, $row];

    $this->assertEquals($df, $s->getDuplicate());
    $this->assertEquals($df, $s->getDiscrepantXY());
    $this->assertEquals([], $s->getHeterozygousMT());
    $this->assertEquals([], $s->getDiscrepantVcfPosition());
}
417 |
/**
 * Merging restricted to chromosome "Y" only considers the Y SNPs: differences
 * on other chromosomes are ignored, a discrepant Y genotype becomes null and a
 * null Y genotype is filled in from the merged object.
 */
public function testMergeChrom()
{
    $snpData = [
        ["rsid" => "rs100", "chrom" => "Y", "pos" => 100, "genotype" => "A"],
        ["rsid" => "rs101", "chrom" => "Y", "pos" => 101, "genotype" => null],
        ["rsid" => "rs102", "chrom" => "Y", "pos" => 102, "genotype" => "A"],
        ["rsid" => "rs103", "chrom" => "Y", "pos" => 103, "genotype" => "A"],
    ];

    $s1 = new SNPs("tests/input/generic.csv");
    $combined = array_merge($s1->getSnps(), $snpData);
    $s1->setSnps($combined);

    // Both objects start from the same data so that the rs3-rs5 changes below
    // act on SNPs that actually exist in $s2.
    $s2 = new SNPs();
    $s2->setBuild(37);
    $s2->setSnps($combined);

    // Set values for chrom that will be ignored
    $s2->setSnpsValue("rs3", "pos", 1003); // Discrepant position
    $s2->setSnpsValue("rs4", "genotype", "AA"); // Discrepant genotype
    $s2->setSnpsValue("rs5", "genotype", "AA");

    // Set values for chrom to be merged
    $s2->setSnpsValue("rs100", "genotype", "T"); // Discrepant genotype
    $s2->setSnpsValue("rs101", "genotype", "A");
    $s2->setSnpsValue("rs102", "pos", 1002); // Discrepant position

    // Build the expected merge result on a separate object: mutating $s1 before
    // the merge would hide the rs100 discrepancy and make the comparison below
    // a tautology.
    $expected = new SNPs();
    $expected->setSnps($combined);
    $expected->setSnpsValue("rs100", "genotype", null); // Discrepant genotype sets to null
    $expected->setSnpsValue("rs101", "genotype", "A"); // Updates null

    $results = $s1->merge([$s2], "Y");

    $this->assertEquals($expected->getSnps(), $s1->getSnps());

    $expectedResults = [
        [
            "merged" => true,
            "common_rsids" => ["rs100", "rs101", "rs102", "rs103"],
            "discrepant_position_rsids" => ["rs102"],
            "discrepant_genotype_rsids" => ["rs100"],
        ]
    ];

    $this->assertEquals($expectedResults, $results);

    $this->assertCount(1, $s1->getDiscrepantMergePositions());
    $this->assertCount(1, $s1->getDiscrepantMergeGenotypes());
}
466 | }
467 |
--------------------------------------------------------------------------------
/tests/Snps/SnpsTest.php:
--------------------------------------------------------------------------------
1 | assertEquals(count($s), 8);
30 | }
31 |
/**
 * Every empty SNPs fixture reports a count of zero.
 */
public function test_len_empty()
{
    foreach (self::empty_snps() as $s) {
        // Expected value first, actual value second (PHPUnit convention).
        $this->assertEquals(0, count($s));
    }
}
38 |
/**
 * The string form of a file-backed SNPs object contains the file's base name.
 */
public function test__toString()
{
    $snps = new SNPs("tests/input/GRCh37.csv");
    $rendered = $snps->__toString();

    $this->assertEquals("SNPs('GRCh37.csv')", $rendered);
}
44 |
/**
 * An SNPs object built from raw file contents renders without a file name.
 */
public function test__toString_bytes()
{
    $rawContents = file_get_contents("tests/input/GRCh37.csv");
    $snps = new SNPs($rawContents);
    $rendered = $snps->__toString();

    $this->assertEquals("SNPs()", $rendered);
}
51 |
/**
 * The GRCh38 fixture is reported with assembly "GRCh38".
 */
public function testAssembly()
{
    $s = new SNPs("tests/input/GRCh38.csv");
    // Expected value first, actual value second (PHPUnit convention).
    $this->assertEquals("GRCh38", $s->getAssembly());
}
57 |
/**
 * Empty SNPs fixtures report an empty assembly.
 */
public function testAssemblyNoSnps()
{
    foreach ($this->empty_snps() as $emptyCase) {
        $assembly = $emptyCase->getAssembly();
        $this->assertEmpty($assembly);
    }
}
66 |
/**
 * The NCBI36 fixture is reported as build 36 with assembly "NCBI36".
 */
public function testBuild()
{
    $s = new SNPs("tests/input/NCBI36.csv");
    // Expected value first, actual value second (PHPUnit convention).
    $this->assertEquals(36, $s->getBuild());
    $this->assertEquals("NCBI36", $s->getAssembly());
}
73 |
/**
 * Empty SNPs fixtures never report a detected build.
 */
public function testBuildDetectedNoSnps()
{
    foreach ($this->empty_snps() as $emptyCase) {
        $detected = $emptyCase->isBuildDetected();
        $this->assertFalse($detected);
    }
}
82 |
/**
 * Empty SNPs fixtures report an empty build.
 */
public function testBuildNoSnps()
{
    foreach ($this->empty_snps() as $emptyCase) {
        $build = $emptyCase->getBuild();
        $this->assertEmpty($build);
    }
}
91 |
/**
 * The GRCh37 PAR fixture is detected as build 37 and loads the expected SNPs.
 */
public function testBuildDetectedPARSnps()
{
    $parSnps = $this->loadAssignPARSnps('tests/input/GRCh37_PAR.csv');

    $this->assertEquals(37, $parSnps->getBuild());
    $this->assertTrue($parSnps->isBuildDetected());

    $wanted = $this->snps_GRCh37_PAR();
    $loaded = $parSnps->getSnps();
    $this->assertEquals($wanted, $loaded);
}
101 |
/**
 * notnull() returns the generic SNPs without rs5.
 */
public function test_notnull()
{
    $s = new SNPs("tests/input/generic.csv");
    $snps = $this->generic_snps();
    unset($snps["rs5"]);

    // Expected value first, actual value second (PHPUnit convention).
    $this->assertEquals($snps, $s->notnull(), "Frames are not equal!");
}
110 |
/**
 * heterozygous() returns rs6, rs7 and rs8 from the generic fixture.
 */
public function test_heterozygous()
{
    $wanted = $this->create_snp_df(
        rsid: ["rs6", "rs7", "rs8"],
        chrom: ["1", "1", "1"],
        pos: [106, 107, 108],
        genotype: ["GC", "TC", "AT"]
    );

    $generic = new SNPs("tests/input/generic.csv");
    $found = $generic->heterozygous();

    $this->assertEquals($wanted, $found, "Frames are not equal!");
}
124 |
/**
 * homozygous() returns rs1 through rs4 from the generic fixture.
 */
public function test_homozygous()
{
    $wanted = $this->create_snp_df(
        rsid: ["rs1", "rs2", "rs3", "rs4"],
        chrom: ["1", "1", "1", "1"],
        pos: [101, 102, 103, 104],
        genotype: ["AA", "CC", "GG", "TT"],
    );

    $generic = new SNPs("tests/input/generic.csv");
    $found = $generic->homozygous();

    $this->assertEquals($wanted, $found, "Frames are not equal!");
}
138 |
/**
 * Empty SNPs fixtures are reported as invalid.
 */
public function test_valid_False()
{
    foreach ($this->empty_snps() as $emptyCase) {
        $valid = $emptyCase->isValid();
        $this->assertFalse($valid);
    }
}
145 |
/**
 * The generic fixture is reported as valid.
 */
public function test_valid_True()
{
    $generic = new SNPs("tests/input/generic.csv");
    $valid = $generic->isValid();

    $this->assertTrue($valid);
}
151 |
/**
 * With `true` as the second constructor argument the source is detected but
 * no SNPs are loaded.
 */
public function test_only_detect_source()
{
    $s = new SNPs("tests/input/generic.csv", true);
    // Expected value first, actual value second (PHPUnit convention).
    $this->assertEquals("generic", $s->getSource());
    $this->assertEquals(0, count($s));
}
158 |
/**
 * getSummary() for the GRCh38 fixture reports source, assembly, build,
 * count, chromosomes and sex.
 */
public function test_summary()
{
    $s = new SNPs("tests/input/GRCh38.csv");

    $expected = [
        "source" => "generic",
        "assembly" => "GRCh38",
        "build" => 38,
        "build_detected" => true,
        "count" => 4,
        "chromosomes" => "1, 3",
        "sex" => "",
    ];

    // Expected value first, actual value second (PHPUnit convention).
    $this->assertEquals($expected, $s->getSummary());
}
175 |
/**
 * Empty SNPs fixtures produce an empty summary.
 */
public function test_summary_no_snps()
{
    foreach ($this->empty_snps() as $snps) {
        // Expected value first, actual value second (PHPUnit convention).
        $this->assertEquals([], $snps->getSummary());
    }
}
182 |
/**
 * getChromosomes() lists the chromosomes found in the chromosomes fixture.
 */
public function test_chromosomes()
{
    $wanted = ["1", "2", "3", "5", "PAR", "MT"];

    $snps = new SNPs("tests/input/chromosomes.csv");
    $found = $snps->getChromosomes();

    $this->assertEquals($wanted, $found);
}
188 |
/**
 * Empty SNPs fixtures report no chromosomes.
 */
public function test_chromosomes_no_snps()
{
    foreach ($this->empty_snps() as $emptyCase) {
        $chromosomes = $emptyCase->getChromosomes();
        $this->assertEmpty($chromosomes);
    }
}
195 |
/**
 * Simulated X-chromosome SNPs with the "AC" genotype are reported as "Female".
 */
public function test_sex_Female_X_chrom()
{
    $simulated = $this->simulate_snps(
        chrom: "X",
        pos_start: 1,
        pos_max: 155270560,
        pos_step: 10000,
        genotype: "AC"
    );
    $sex = $simulated->getSex();

    $this->assertEquals("Female", $sex);
}
207 |
/**
 * Simulated Y-chromosome SNPs generated with null_snp_step 1 are reported
 * as "Female".
 */
public function test_sex_Female_Y_chrom()
{
    $simulated = $this->simulate_snps(
        chrom: "Y",
        pos_start: 1,
        pos_max: 59373566,
        pos_step: 10000,
        null_snp_step: 1
    );
    $sex = $simulated->getSex();

    $this->assertEquals("Female", $sex);
}
219 |
/**
 * Simulated X-chromosome SNPs with the "AA" genotype keep their count through
 * XY deduplication, produce no discrepant XY SNPs, and are reported as "Male".
 */
public function test_sex_Male_X_chrom()
{
    $simulated = $this->simulate_snps(
        chrom: "X",
        pos_start: 1,
        pos_max: 155270560,
        pos_step: 10000,
        genotype: "AA"
    );

    $countBefore = $simulated->count();
    $this->assertEquals(15528, $countBefore);

    $simulated->deduplicateXYChrom();

    $countAfter = $simulated->count();
    $this->assertEquals(15528, $countAfter);

    $discrepantTotal = count($simulated->getDiscrepantXY());
    $this->assertEquals(0, $discrepantTotal);

    $this->assertEquals("Male", $simulated->getSex());
}
235 |
/**
 * One "AC" SNP among "AA" X-chromosome SNPs is moved to the discrepant XY set
 * by deduplication, and the sex is still reported as "Male".
 */
public function test_sex_Male_X_chrom_discrepant_XY()
{
    $simulated = $this->simulate_snps(
        chrom: "X",
        pos_start: 1,
        pos_max: 155270560,
        pos_step: 10000,
        genotype: "AA"
    );

    $countBefore = $simulated->count();
    $this->assertEquals(15528, $countBefore);

    // Introduce a single differing genotype before deduplicating.
    $simulated->setValue("rs8001", "genotype", "AC");
    $simulated->deduplicateXYChrom();

    $countAfter = $simulated->count();
    $this->assertEquals(15527, $countAfter);

    $wantedDiscrepant = $this->create_snp_df(
        rsid: ["rs8001"],
        chrom: ["X"],
        pos: [80000001],
        genotype: ["AC"]
    );
    $this->assertEquals($wantedDiscrepant, $simulated->getDiscrepantXY());

    $this->assertEquals("Male", $simulated->getSex());
}
258 |
/**
 * Simulated Y-chromosome SNPs with the default genotype are reported as "Male".
 */
public function test_sex_male_Y_chrom()
{
    $simulated = $this->simulate_snps(
        chrom: "Y",
        pos_start: 1,
        pos_max: 59373566,
        pos_step: 10000
    );
    $sex = $simulated->getSex();

    $this->assertEquals("Male", $sex);
}
270 |
/**
 * Simulated SNPs on chromosome 1 only give an empty sex string.
 */
public function test_sex_not_determined()
{
    $simulated = $this->simulate_snps(
        chrom: "1",
        pos_start: 1,
        pos_max: 249250621,
        pos_step: 10000
    );
    $sex = $simulated->getSex();

    $this->assertEquals("", $sex);
}
282 |
/**
 * Empty SNPs fixtures report an empty sex.
 */
public function test_sex_no_snps()
{
    foreach ($this->empty_snps() as $emptyCase) {
        $sex = $emptyCase->getSex();
        $this->assertEmpty($sex);
    }
}
289 |
/**
 * The generic fixture reports "generic" as its source and as its only source.
 */
public function test_source()
{
    $generic = new SNPs("tests/input/generic.csv");

    $primarySource = $generic->getSource();
    $this->assertEquals("generic", $primarySource);

    $everySource = $generic->getAllSources();
    $this->assertEquals(["generic"], $everySource);
}
296 |
/**
 * Empty SNPs fixtures report an empty source.
 */
public function test_source_no_snps()
{
    foreach ($this->empty_snps() as $emptyCase) {
        $source = $emptyCase->getSource();
        $this->assertEmpty($source);
    }
}
303 |
/**
 * The NCBI36 fixture contains four SNPs.
 */
public function test_count()
{
    $snps = new SNPs("tests/input/NCBI36.csv");
    $total = $snps->count();

    $this->assertEquals(4, $total);
}
309 |
/**
 * Empty SNPs fixtures have a count of zero and an empty SNP collection.
 */
public function test_count_no_snps()
{
    foreach ($this->empty_snps() as $emptyCase) {
        $total = $emptyCase->count();
        $this->assertEquals(0, $total);

        $rows = $emptyCase->getSnps();
        $this->assertEmpty($rows);
    }
}
317 |
/**
 * With deduplication disabled, all three rs1 rows of the fixture are kept.
 */
public function testDeduplicateFalse()
{
    $wanted = $this->create_snp_df(["rs1", "rs1", "rs1"], ["1", "1", "1"], [101, 102, 103], ["AA", "CC", "GG"]);

    $loaded = new SNPs("tests/input/duplicate_rsids.csv", deduplicate: false);

    $this->assertEquals($wanted, $loaded->getSnps());
}
324 |
/**
 * Loading the MT fixture with default options keeps rs1 and rs2 (rs1 with
 * genotype "A") and reports rs3 as a heterozygous MT SNP.
 */
public function testDeduplicateMTChrom()
{
    $loaded = new SNPs("tests/input/ancestry_mt.txt");

    $wantedSnps = $this->create_snp_df(["rs1", "rs2"], ["MT", "MT"], [101, 102], ["A", null]);
    $this->assertEquals($wantedSnps, $loaded->getSnps());

    $wantedHeterozygous = $this->create_snp_df(["rs3"], ["MT"], [103], ["GC"]);
    $this->assertEquals($wantedHeterozygous, $loaded->getHeterozygousMT());
}
334 |
/**
 * With deduplication disabled, the MT fixture keeps all three SNPs with their
 * original genotypes.
 */
public function testDeduplicateMTChromFalse()
{
    $wanted = $this->create_snp_df(["rs1", "rs2", "rs3"], ["MT", "MT", "MT"], [101, 102, 103], ["AA", null, "GC"]);

    $loaded = new SNPs("tests/input/ancestry_mt.txt", deduplicate: false);

    $this->assertEquals($wanted, $loaded->getSnps());
}
341 |
/**
 * With default options the first rs1 row is kept and the other two rs1 rows
 * are reported as duplicates.
 */
public function testDuplicateRsids()
{
    $loaded = new SNPs("tests/input/duplicate_rsids.csv");

    $wantedSnps = $this->create_snp_df(["rs1"], ["1"], [101], ["AA"]);
    $wantedDuplicates = $this->create_snp_df(["rs1", "rs1"], ["1", "1"], [102, 103], ["CC", "GG"]);

    $this->assertEquals($wantedSnps, $loaded->getSnps());
    $this->assertEquals($wantedDuplicates, $loaded->getDuplicate());
}
350 |
/**
 * Remapping the NCBI36 fixture to build 37 remaps both chromosomes and
 * yields the GRCh37 SNPs.
 */
public function testRemap36to37()
{
    $scenario = function () {
        $snps = new SNPs("tests/input/NCBI36.csv");
        [$remapped, $notRemapped] = $snps->remap(37);

        $this->assertEquals(37, $snps->getBuild());
        $this->assertEquals("GRCh37", $snps->getAssembly());
        $this->assertCount(2, $remapped);
        $this->assertCount(0, $notRemapped);
        $this->assertEquals($this->snps_GRCh37(), $snps->getSnps());
    };

    $this->_run_remap_test($scenario, $this->NCBI36_GRCh37());
}
363 |
/**
 * Remapping the GRCh37 fixture to build 36 remaps both chromosomes and
 * yields the NCBI36 SNPs.
 */
public function testRemap37to36()
{
    $scenario = function () {
        $snps = new SNPs("tests/input/GRCh37.csv");
        [$remapped, $notRemapped] = $snps->remap(36);

        $this->assertEquals(36, $snps->getBuild());
        $this->assertEquals("NCBI36", $snps->getAssembly());
        $this->assertCount(2, $remapped);
        $this->assertCount(0, $notRemapped);
        $this->assertEquals($this->snps_NCBI36(), $snps->getSnps());
    };

    $this->_run_remap_test($scenario, $this->GRCh37_NCBI36());
}
376 |
/**
 * Remaps the GRCh37 fixture to build 38 and verifies the reported build,
 * assembly name, per-chromosome outcome counts and resulting SNPs.
 */
public function testRemap37to38()
{
    // presumably the second argument supplies the mapping data used by the helper
    $mappings = $this->GRCh37_GRCh38();

    $this->_run_remap_test(function () {
        $snps = new SNPs("tests/input/GRCh37.csv");
        [$remapped, $notRemapped] = $snps->remap(38);

        $this->assertEquals(38, $snps->getBuild());
        $this->assertEquals("GRCh38", $snps->getAssembly());
        $this->assertCount(2, $remapped);
        $this->assertCount(0, $notRemapped);
        $this->assertEquals($this->snps_GRCh38(), $snps->getSnps());
    }, $mappings);
}
389 |
/**
 * Remapping a build-37 file to build 37 is expected to leave the data
 * untouched and report both chromosomes as not remapped.
 */
public function testRemap37to37()
{
    $snps = new SNPs("tests/input/GRCh37.csv");

    [$remapped, $notRemapped] = $snps->remap(37);

    $this->assertEquals(37, $snps->getBuild());
    $this->assertEquals("GRCh37", $snps->getAssembly());
    $this->assertCount(0, $remapped);
    $this->assertCount(2, $notRemapped);
    $this->assertEquals($this->snps_GRCh37(), $snps->getSnps());
}
400 |
/**
 * Requesting an invalid target build (-1) is expected to leave the build
 * and assembly unchanged and report both chromosomes as not remapped.
 */
public function testRemapInvalidAssembly()
{
    $snps = new SNPs("tests/input/GRCh37.csv");

    [$remapped, $notRemapped] = $snps->remap(-1);

    $this->assertEquals(37, $snps->getBuild());
    $this->assertEquals("GRCh37", $snps->getAssembly());
    $this->assertCount(0, $remapped);
    $this->assertCount(2, $notRemapped);
}
410 |
/**
 * Remapping an object constructed without a file is expected to report no
 * chromosomes in either result list and a build of false.
 */
public function testRemapNoSnps()
{
    $emptySnps = new SNPs();

    [$remapped, $notRemapped] = $emptySnps->remap(38);

    // assertFalse is strict: getBuild() must return boolean false here
    $this->assertFalse($emptySnps->getBuild());
    $this->assertCount(0, $remapped);
    $this->assertCount(0, $notRemapped);
}
419 |
/**
 * Writes the generic fixture to a temporary TSV file and checks that the
 * output starts with the "# Generated by snps" header.
 *
 * The temporary file is removed in a finally block so it does not leak when
 * toTsv() throws or an assertion fails.
 */
public function testSaveToTsv()
{
    $s = new SNPs("tests/input/generic.csv");

    $tempFile = tempnam(sys_get_temp_dir(), 'snps_test');
    // tempnam() returns false when no temporary file could be created
    $this->assertNotFalse($tempFile);

    try {
        $s->toTsv($tempFile);

        $content = file_get_contents($tempFile);
        // fail with a clear message instead of passing false to the string assertion
        $this->assertNotFalse($content);
        $this->assertStringStartsWith("# Generated by snps", $content);
    } finally {
        if (is_file($tempFile)) {
            unlink($tempFile);
        }
    }
}
429 |
/**
 * toTsv() on an object constructed without a file is expected to return false.
 */
public function testSaveNoSNPs()
{
    $result = (new SNPs())->toTsv();

    $this->assertFalse($result);
}
435 |
/**
 * toVcf() on an object constructed without a file is expected to return false.
 */
public function testSaveNoSNPsVCF()
{
    $result = (new SNPs())->toVcf();

    $this->assertFalse($result);
}
441 |
/**
 * Saves the GRCh38 fixture to the temp directory, reloads the written file
 * and checks that build, build detection, source metadata and SNPs survive
 * the round trip.
 *
 * Assertions use PHPUnit's (expected, actual) order so failure messages are
 * labelled correctly, and the written file is removed afterwards.
 */
public function testSaveSource()
{
    $tmpdir = sys_get_temp_dir();
    $s = new SNPs("tests/input/GRCh38.csv", outputDir: $tmpdir);
    $dest = $tmpdir . DIRECTORY_SEPARATOR . "generic_GRCh38.txt";

    try {
        $this->assertEquals($dest, $s->toTsv());

        $snps = new SNPs($dest);
        $this->assertEquals(38, $snps->getBuild());
        $this->assertTrue($snps->isBuildDetected());
        $this->assertEquals("generic", $snps->getSource());
        $this->assertEquals(["generic"], $snps->getAllSources());
        $this->assertEquals($this->snps_GRCh38(), $snps->getSnps());
    } finally {
        // do not leave the generated file behind in the shared temp directory
        if (is_file($dest)) {
            unlink($dest);
        }
    }
}
455 |
/**
 * The 23andMe fixture, loaded with mocked resources, is expected to be
 * assigned to cluster "c1".
 */
public function testCluster()
{
    // NOTE(review): sibling tests pass $this->_getChipClusters() here — confirm which helper name exists
    $this->runClusterTest(function ($mock) {
        $s = new SNPs("tests/input/23andme.txt", resources: $mock);
        // expected value first, as in assertEquals($expected, $actual)
        $this->assertEquals("c1", $s->getCluster());
    }, $this->getChipClusters());
}
463 |
/**
 * The 23andMe fixture, loaded with mocked resources, is expected to report
 * the chip "HTS iSelect HD".
 */
public function testChip()
{
    // NOTE(review): sibling tests pass $this->getChipClusters() here — confirm which helper name exists
    $this->runClusterTest(function ($mock) {
        $s = new SNPs("tests/input/23andme.txt", resources: $mock);
        // expected value first, as in assertEquals($expected, $actual)
        $this->assertEquals("HTS iSelect HD", $s->getChip());
    }, $this->_getChipClusters());
}
471 |
/**
 * The 23andMe fixture, loaded with mocked resources, is expected to report
 * chip version "v4".
 */
public function testChipVersion()
{
    // NOTE(review): sibling tests pass $this->_getChipClusters() here — confirm which helper name exists
    $this->runClusterTest(function ($mock) {
        $s = new SNPs("tests/input/23andme.txt", resources: $mock);
        // expected value first, as in assertEquals($expected, $actual)
        $this->assertEquals("v4", $s->getChipVersion());
    }, $this->getChipClusters());
}
479 |
/**
 * Runs computeClusterOverlap() on the 23andMe fixture and checks the
 * detected cluster, chip and chip version, plus the presence of "c1" in
 * the returned result.
 */
public function testComputeClusterOverlap()
{
    // NOTE(review): sibling tests pass $this->getChipClusters() here — confirm which helper name exists
    $this->runClusterTest(function ($mock) {
        $s = new SNPs("tests/input/23andme.txt", resources: $mock);
        $result = $s->computeClusterOverlap();

        // expected value first, as in assertEquals($expected, $actual)
        $this->assertEquals("c1", $s->getCluster());
        $this->assertEquals("HTS iSelect HD", $s->getChip());
        $this->assertEquals("v4", $s->getChipVersion());
        $this->assertArrayHasKey("c1", $result);
    }, $this->_getChipClusters());
}
491 |
/**
 * The quality-controlled SNP set of the generic fixture is expected to be
 * the generic SNPs without rs4 and rs6.
 */
public function testSnpsQc()
{
    $expected = $this->genericSnps();
    unset($expected['rs4'], $expected['rs6']);

    $loaded = new SNPs("tests/input/generic.csv");

    $this->assertEquals($expected, $loaded->getSnpsQc());
}
501 |
/**
 * The low-quality SNPs of the generic fixture are expected to be exactly
 * rs4 and rs6, i.e. the entries that testSnpsQc() removes from the
 * quality-controlled set.
 *
 * Previously this test expected the complete generic SNP set, which cannot
 * hold at the same time as testSnpsQc().
 */
public function testLowQuality()
{
    $s = new SNPs("tests/input/generic.csv");
    $lowQualitySnps = $s->getLowQualitySnps();

    // assumes genericSnps() is indexable by rsid, as testSnpsQc() relies on — TODO confirm
    $allSnps = $this->genericSnps();
    $expectedLowQualitySnps = [
        'rs4' => $allSnps['rs4'],
        'rs6' => $allSnps['rs6'],
    ];

    $this->assertEquals($expectedLowQualitySnps, $lowQualitySnps);
}
509 |
510 | // Add more tests for SNPData and SNPAnalyzer classes
/**
 * Wraps the generic SNPs in an SNPData object and checks its count and
 * its list of chromosomes.
 */
public function testSNPData()
{
    $data = new SNPData($this->genericSnps());

    $total = $data->count();
    $this->assertEquals(8, $total);

    $chromosomes = $data->getChromosomes();
    $this->assertEquals(["1"], $chromosomes);
}
517 |
/**
 * Builds an SNPAnalyzer from mocked collaborators and checks that build
 * detection and cluster overlap return the stubbed values, and that sex
 * determination yields 'Female' for the generic SNPs.
 */
public function testSNPAnalyzer()
{
    // stub: build detection always answers 37
    $detectorMock = $this->createMock(BuildDetector::class);
    $detectorMock->method('detectBuild')->willReturn(37);

    // stub: overlap computation always answers cluster c1
    $stubbedOverlap = ['cluster' => 'c1'];
    $overlapMock = $this->createMock(ClusterOverlapCalculator::class);
    $overlapMock->method('computeClusterOverlap')->willReturn($stubbedOverlap);

    $analyzer = new SNPAnalyzer($detectorMock, $overlapMock);
    $data = new SNPData($this->genericSnps());

    $this->assertEquals(37, $analyzer->detectBuild($data));
    $this->assertEquals(['cluster' => 'c1'], $analyzer->computeClusterOverlap($data));
    $this->assertEquals('Female', $analyzer->determineSex($data));
}
533 | }
534 |
--------------------------------------------------------------------------------
/src/Snps/VariedicInherit.php:
--------------------------------------------------------------------------------
1 | config = $config;
42 | $required = [
43 | self::KEY_CALLBACK,
44 | self::KEY_REMOVED,
45 | self::KEY_MAGIC,
46 | self::KEY_RESOURCE,
47 | ];
48 |
49 | foreach ($required as $key) {
50 | if (!isset($this->config[$key])) {
51 | $message = sprintf(self::ERR_MISSING_KEY, $key);
52 | throw new InvalidArgumentException($message);
53 | }
54 | }
55 | }
56 |
/**
 * Loads a file into $this->contents as a single line of text.
 *
 * Carriage returns are removed and line feeds are replaced by spaces.
 * Accumulated messages are cleared before the file is read.
 *
 * @param string $filePath Path to the file to scan
 * @return string The flattened file contents
 * @throws InvalidArgumentException If the file does not exist (contents are reset to '')
 */
public function getFileContents(string $filePath): string
{
    if (file_exists($filePath)) {
        $this->clearMessages();

        $this->contents = file_get_contents($filePath);
        // drop "\r", turn "\n" into a space
        $this->contents = str_replace(["\r", "\n"], ['', ' '], $this->contents);

        return $this->contents;
    }

    $this->contents = '';
    $notFound = sprintf(self::ERR_FILE_NOT_FOUND, $filePath);

    throw new InvalidArgumentException($notFound);
}
79 |
/**
 * Extracts the text that follows a keyword, up to (not including) a delimiter.
 *
 * The delimiter search starts one character after the end of the keyword,
 * so a delimiter placed directly after the keyword (e.g. the space in
 * "class Foo" when the delimiter is ' ') is skipped rather than matched.
 *
 * @param string $contents  Text to search (usually $this->contents)
 * @param string $key       Starting keyword or set of characters
 * @param string $delimiter Ending delimiter
 * @return string The trimmed text between keyword and delimiter, or an empty
 *                string if the keyword or the delimiter is not found
 */
public static function getKeyValue(
    string $contents,
    string $key,
    string $delimiter
): string {
    $position = strpos($contents, $key);

    if ($position === false) {
        return '';
    }

    $start = $position + strlen($key);

    // Nothing follows the keyword; also avoids the ValueError that strpos()
    // raises on PHP 8 when the offset lies beyond the end of the string.
    if ($start >= strlen($contents)) {
        return '';
    }

    $end = strpos($contents, $delimiter, $start + 1);

    // Without this guard a missing delimiter produced a negative substr()
    // length and returned an arbitrary slice of the contents.
    if ($end === false) {
        return '';
    }

    return trim(substr($contents, $start, $end - $start));
}
116 |
/**
 * Resets the accumulated messages and the magic list to empty arrays.
 *
 * @return void
 */
public function clearMessages() : void
{
    $this->messages = $this->magic = [];
}
127 |
/**
 * Returns the messages accumulated so far.
 *
 * @param bool $clear : If TRUE, the stored messages are cleared after being copied
 * @return array $messages : messages as they were before any clearing
 */
public function getMessages(bool $clear = FALSE) : array
{
    // copy first so clearing does not affect what is returned
    $snapshot = $this->messages;

    if ($clear === TRUE) {
        $this->clearMessages();
    }

    return $snapshot;
}
140 |
/**
 * Records an "OK" message for the given scan and reports zero findings.
 *
 * @param string $function Name inserted into the OK message
 * @return int Always 0
 */
public function passedOK(string $function) : int
{
    $note = sprintf(self::OK_PASSED, $function);
    $this->messages[] = $note;

    return 0;
}
152 |
/**
 * Runs all scans in order: removed functions, is_resource usage,
 * magic method signatures, then the configured callbacks.
 *
 * Results are collected in $this->messages (see getMessages()); this method
 * no longer dumps the message list to standard output part-way through.
 *
 * @return int $found : number of potential BC breaks found
 */
public function runAllScans() : int
{
    $found = 0;
    $found += $this->scanRemovedFunctions();
    $found += $this->scanIsResource();
    $found += $this->scanMagicSignatures();
    $found += $this->scanFromCallbacks();
    return $found;
}
/**
 * Looks for calls to functions listed under the "removed" config key.
 *
 * A function counts as used when the contents contain its name preceded
 * by a space and followed by "(" or " (". One message is recorded per hit.
 *
 * @return int $found : number of BC breaks detected (0 also records an OK message)
 * @throws Exception If the "removed" config section is missing or empty
 */
public function scanRemovedFunctions() : int
{
    $removed = $this->config[self::KEY_REMOVED] ?? NULL;
    // guard kept because this method may be called on its own
    if (empty($removed)) {
        throw new Exception(sprintf(self::ERR_MISSING_KEY, self::KEY_REMOVED));
    }

    $hits = 0;
    foreach ($removed as $name => $replacement) {
        $calledTight  = strpos($this->contents, ' ' . $name . '(') !== FALSE;
        $calledSpaced = $calledTight
            || strpos($this->contents, ' ' . $name . ' (') !== FALSE;
        if (!$calledSpaced) {
            continue;
        }
        $this->messages[] = sprintf(self::ERR_REMOVED, $name, $replacement);
        $hits++;
    }

    if ($hits > 0) {
        return $hits;
    }
    return $this->passedOK(__FUNCTION__);
}
/**
 * Checks for is_resource usage.
 *
 * When "is_resource" appears in the contents, every function named under
 * the "resource" config key (functions that no longer produce resources
 * in PHP 8) that also appears in the contents is reported.
 *
 * @return int $found : number of BC breaks detected (0 also records an OK message)
 * @throws Exception If "is_resource" is present and the config section is missing or empty
 */
public function scanIsResource() : int
{
    // without any is_resource call there is nothing to check
    if (strpos($this->contents, 'is_resource') === FALSE) {
        return $this->passedOK(__FUNCTION__);
    }

    $resourceFuncs = $this->config[self::KEY_RESOURCE] ?? NULL;
    // guard kept because this method may be called on its own
    if (empty($resourceFuncs)) {
        throw new Exception(sprintf(self::ERR_MISSING_KEY, self::KEY_RESOURCE));
    }

    $hits = 0;
    foreach ($resourceFuncs as $name) {
        if (strpos($this->contents, $name) === FALSE) {
            continue;
        }
        $this->messages[] = sprintf(self::ERR_IS_RESOURCE, $name);
        $hits++;
    }

    if ($hits > 0) {
        return $hits;
    }
    return $this->passedOK(__FUNCTION__);
}
/**
 * Scan for magic method signatures
 * NOTE: doesn't check inside parentheses.
 *       only checks for return data type + displays found and correct signatures for manual comparison
 *
 * Every "function __xxx" declaration found in the contents is looked up in
 * the "magic" config section. For known magic methods the official and the
 * actual signature are recorded as messages; if the actual signature
 * declares a return type that does not match the configured one, an error
 * message is added and the method is counted.
 *
 * @return int $found : number of invalid return data types (0 also records an OK message)
 * @throws Exception If magic methods are present and the config section is missing or empty
 */
public function scanMagicSignatures() : int
{
    $found   = 0;
    $matches = [];

    // Locate all magic method declarations; $matches[1] receives the names
    // without the leading "__". This call was missing, which left $matches
    // empty and turned the whole scan into dead code.
    preg_match_all('/function\s+__(\w+)/', $this->contents, $matches);

    if (!empty($matches[1])) {
        $this->messages[] = self::MAGIC_METHODS;
        $config = $this->config[self::KEY_MAGIC] ?? NULL;
        // we add this extra safety check in case this method is called separately
        if (empty($config)) {
            $message = sprintf(self::ERR_MISSING_KEY, self::KEY_MAGIC);
            throw new Exception($message);
        }
        foreach ($matches[1] as $name) {
            $key = '__' . $name;
            // skip if key not found.  must not be a defined magic method
            if (!isset($config[$key])) continue;
            // record official signature
            $this->messages[] = 'Signature: ' . ($config[$key]['signature'] ?? 'Signature not found');
            // text between the method name and the opening brace of its body
            $sub = $this->getKeyValue($this->contents, $key, '{');
            if ($sub) {
                $sub = $key . $sub;
                // record found signature
                $this->messages[] = 'Actual   : ' . $sub;
                // look for return type ($sub starts with "__", so ":" is never at offset 0)
                if (strpos($sub, ':') !== FALSE) {
                    // the configured return type is used as part of the pattern, unquoted
                    $ptn = '/.*?\(.*?\)\s*:\s*' . $config[$key]['return'] . '/';
                    // test for a match
                    if (!preg_match($ptn, $sub)) {
                        $this->messages[] = sprintf(self::ERR_MAGIC_SIGNATURE, $key);
                        $found++;
                    }
                }
            }
        }
    }
    return ($found === 0) ? $this->passedOK(__FUNCTION__) : $found;
}
/**
 * Runs every scan defined under the "callback" key of $this->config
 * (bc_break_scanner.config.php).
 *
 * Each entry's callback receives the current contents; a truthy result
 * records the entry's message and counts as one finding.
 *
 * @return int $found : number of potential BC breaks found
 * @throws InvalidArgumentException If an entry has no callable under "callback"
 */
public function scanFromCallbacks()
{
    $hits = 0;
    foreach (array_keys($this->config[self::KEY_CALLBACK]) as $name) {
        $entry = $this->config[self::KEY_CALLBACK][$name] ?? NULL;

        $usable = !empty($entry['callback']) && is_callable($entry['callback']);
        if (!$usable) {
            $path = self::KEY_CALLBACK . ' => ' . $name . ' => callback';
            throw new InvalidArgumentException(sprintf(self::ERR_INVALID_KEY, $path));
        }

        if (!$entry['callback']($this->contents)) {
            continue;
        }
        $this->messages[] = $entry['msg'];
        $hits++;
    }
    return $hits;
}
290 |
/**
 * Deprecated alias for homozygous().
 *
 * Emits an E_USER_DEPRECATED notice and forwards the call.
 *
 * @param string $chromosome The chromosome to get homozygous SNPs for
 * @return mixed Whatever homozygous() returns
 * @deprecated Use the homozygous() method instead
 */
public function homozygous_snps(string $chromosome = '')
{
    $notice = 'This method has been renamed to `homozygous`.';
    trigger_error($notice, E_USER_DEPRECATED);

    return $this->homozygous($chromosome);
}
307 |
/**
 * Deprecated accessor for the "valid" property.
 *
 * Emits an E_USER_DEPRECATED notice and returns the property value.
 *
 * @return bool The value of the "valid" property
 * @deprecated Use the "valid" property instead
 */
public function is_valid(): bool
{
    $notice = 'This method has been renamed to `valid` and is now a property.';
    trigger_error($notice, E_USER_DEPRECATED);

    return $this->valid;
}
323 |
/**
 * Predict ancestry; forwards all arguments unchanged to getPredictions().
 *
 * @param string|null $outputDirectory The output directory for predictions
 * @param bool $writePredictions Whether to write the predictions to files
 * @param string|null $modelsDirectory The directory containing the models
 * @param string|null $aisnpsDirectory The directory containing the AIsnps
 * @param int|null $nComponents The number of components for the model
 * @param int|null $k The number of nearest neighbors to use
 * @param string|null $thousandGenomesDirectory The directory containing the 1000 Genomes data
 * @param string|null $samplesDirectory The directory containing the samples
 * @param string|null $algorithm The algorithm to use for prediction
 * @param string|null $aisnpsSet The set of AIsnps to use
 * @return array The predicted ancestry values
 * @throws Exception If the ezancestry package is not installed
 */
public function predict_ancestry(
    ?string $outputDirectory = null,
    bool $writePredictions = false,
    ?string $modelsDirectory = null,
    ?string $aisnpsDirectory = null,
    ?int $nComponents = null,
    ?int $k = null,
    ?string $thousandGenomesDirectory = null,
    ?string $samplesDirectory = null,
    ?string $algorithm = null,
    ?string $aisnpsSet = null
): array {
    // positional order must match the parameter order of getPredictions()
    $forwarded = [
        $outputDirectory,
        $writePredictions,
        $modelsDirectory,
        $aisnpsDirectory,
        $nComponents,
        $k,
        $thousandGenomesDirectory,
        $samplesDirectory,
        $algorithm,
        $aisnpsSet,
    ];

    return $this->getPredictions(...$forwarded);
}
365 |
/**
 * Get ancestry predictions using the ezancestry package.
 *
 * Returns an empty array when the object is not valid. Otherwise the
 * first prediction row is summarised via maxPop() and the full prediction
 * result is attached under the 'ezancestry_df' key.
 *
 * @param string|null $outputDirectory The output directory for predictions
 * @param bool $writePredictions Whether to write the predictions to files
 * @param string|null $modelsDirectory The directory containing the models
 * @param string|null $aisnpsDirectory The directory containing the AIsnps
 * @param int|null $nComponents The number of components for the model
 * @param int|null $k The number of nearest neighbors to use
 * @param string|null $thousandGenomesDirectory The directory containing the 1000 Genomes data
 * @param string|null $samplesDirectory The directory containing the samples
 * @param string|null $algorithm The algorithm to use for prediction
 * @param string|null $aisnpsSet The set of AIsnps to use
 * @return array The predicted ancestry values, or [] if the object is not valid
 * @throws Exception If the ezancestry package is not installed
 */
public function getPredictions(
    ?string $outputDirectory = null,
    bool $writePredictions = false,
    ?string $modelsDirectory = null,
    ?string $aisnpsDirectory = null,
    ?int $nComponents = null,
    ?int $k = null,
    ?string $thousandGenomesDirectory = null,
    ?string $samplesDirectory = null,
    ?string $algorithm = null,
    ?string $aisnpsSet = null
): array {
    if (!$this->valid) {
        return [];
    }

    if (!class_exists('ezancestry\commands\Predict')) {
        throw new Exception(
            'Ancestry prediction requires the ezancestry package; please install it'
        );
    }

    // Fully qualified so the instantiated class is the same global class that
    // class_exists() just checked, even if this file declares a namespace.
    $predict = new \ezancestry\commands\Predict();

    $predictions = $predict->predict(
        $this->snps,
        $outputDirectory,
        $writePredictions,
        $modelsDirectory,
        $aisnpsDirectory,
        $nComponents,
        $k,
        $thousandGenomesDirectory,
        $samplesDirectory,
        $algorithm,
        $aisnpsSet
    );

    // assumes predict() returns a non-empty list of rows — TODO confirm
    $maxPopValues = $this->maxPop($predictions[0]);
    $maxPopValues['ezancestry_df'] = $predictions;

    return $maxPopValues;
}
425 |
/**
 * Summarise the predicted population and superpopulation of a prediction row.
 *
 * The percentages are read from the row using the predicted codes as keys.
 * The result uses 'population_percent' for the population value and
 * 'superpopulation_percent' for the superpopulation value; previously the
 * former was stored under '_percent' and the latter under 'population_percent'.
 *
 * @param array $row The prediction row
 * @return array Code, description and percent for population and superpopulation
 */
private function maxPop(array $row): array
{
    $populationCode = $row['predicted_population_population'];
    $populationDescription = $row['population_description'];
    // the row holds one value per population code
    $populationPercent = $row[$populationCode];
    $superpopulationCode = $row['predicted_population_superpopulation'];
    $superpopulationDescription = $row['superpopulation_name'];
    $superpopulationPercent = $row[$superpopulationCode];

    return [
        'population_code' => $populationCode,
        'population_description' => $populationDescription,
        'population_percent' => $populationPercent,
        'superpopulation_code' => $superpopulationCode,
        'superpopulation_description' => $superpopulationDescription,
        'superpopulation_percent' => $superpopulationPercent,
    ];
}
450 |
/**
 * Compute the overlap between this object's SNPs and each known chip cluster.
 *
 * For every cluster the number of cluster SNPs and the number of positions
 * shared with this object's SNPs (compared on build 37) are counted. Once
 * all counts are known, the overlap ratios are derived and, if the best
 * cluster exceeds the threshold in both directions, the cluster, chip and
 * (for 23andMe / AncestryDNA sources) chip version are stored on the object.
 *
 * @param float $clusterOverlapThreshold The threshold for cluster overlap
 * @return DataFrame The computed cluster overlap DataFrame
 */
public function computeClusterOverlap(float $clusterOverlapThreshold = 0.95): DataFrame
{
    $data = [
        'cluster_id' => ['c1', 'c3', 'c4', 'c5', 'v5'],
        'company_composition' => [
            '23andMe-v4',
            'AncestryDNA-v1, FTDNA, MyHeritage',
            '23andMe-v3',
            'AncestryDNA-v2',
            '23andMe-v5, LivingDNA',
        ],
        'chip_base_deduced' => [
            'HTS iSelect HD',
            'OmniExpress',
            'OmniExpress plus',
            'OmniExpress plus',
            'Illumina GSAs',
        ],
        'snps_in_cluster' => array_fill(0, 5, 0),
        'snps_in_common' => array_fill(0, 5, 0),
    ];

    $df = new DataFrame($data);
    $df->setIndex('cluster_id');

    // positions are compared on build 37; remap a copy so $this is not modified
    if ($this->build !== 37) {
        $toRemap = clone $this;
        $toRemap->remap(37);
        $selfSnps = $toRemap->snps()->select(['chrom', 'pos'])->dropDuplicates();
    } else {
        $selfSnps = $this->snps()->select(['chrom', 'pos'])->dropDuplicates();
    }

    $chipClusters = $this->resources->getChipClusters();

    // first pass: fill in the per-cluster counts only
    foreach ($df->indexValues() as $cluster) {
        $clusterSnps = $chipClusters->filter(
            function ($row) use ($cluster) {
                return strpos($row['clusters'], $cluster) !== false;
            }
        )->select(['chrom', 'pos']);

        $df->loc[$cluster]['snps_in_cluster'] = count($clusterSnps);
        $df->loc[$cluster]['snps_in_common'] = count($selfSnps->merge($clusterSnps, 'inner'));
    }

    // The ratios and the best-cluster decision need the counts of ALL clusters,
    // so they are computed once after the loop instead of on every iteration
    // (where clusters not yet visited still had a count of 0).
    $df['overlap_with_cluster'] = $df['snps_in_common'] / $df['snps_in_cluster'];
    $df['overlap_with_self'] = $df['snps_in_common'] / count($selfSnps);

    $maxOverlap = array_keys($df['overlap_with_cluster'], max($df['overlap_with_cluster']))[0];

    if (
        $df['overlap_with_cluster'][$maxOverlap] > $clusterOverlapThreshold
        && $df['overlap_with_self'][$maxOverlap] > $clusterOverlapThreshold
    ) {
        $this->cluster = $maxOverlap;
        $this->chip = $df['chip_base_deduced'][$maxOverlap];

        $companyComposition = $df['company_composition'][$maxOverlap];

        if (strpos($companyComposition, $this->source) !== false) {
            if ($this->source === '23andMe' || $this->source === 'AncestryDNA') {
                $i = strpos($companyComposition, 'v');
                // substr() takes a length, not an end offset: the version is
                // the two characters starting at the first "v" (e.g. "v4")
                $this->chip_version = substr($companyComposition, $i, 2);
            }
        } else {
            // Log a warning about the SNPs data source not found
        }
    }

    return $df;
}
531 | }
--------------------------------------------------------------------------------