├── pgp ├── .gitignore ├── data-stories │ ├── schema-comparisons │ │ ├── .gitignore │ │ ├── figure │ │ │ ├── call_cnt.png │ │ │ ├── variant_cnt.png │ │ │ ├── sample_call_cnt.png │ │ │ └── sample_variant_cnt.png │ │ └── schema-comparison-observations.csv │ ├── comparing-pgp-to-1000genomes │ │ ├── .gitignore │ │ └── figure │ │ │ ├── variant counts-1.png │ │ │ ├── genotype heatmap-1.png │ │ │ ├── variant type counts-1.png │ │ │ └── pgp variant type counts-1.png │ ├── issues-with-the-variant-centric-approach │ │ └── .gitignore │ └── README.md ├── figure │ └── gender-1.png ├── sql │ ├── schema-comparisons │ │ ├── record-sample-counts.sql │ │ ├── missingness-klotho.sql │ │ ├── call-counts.sql │ │ ├── klotho-gvcf-expanded.sql │ │ ├── klotho-gvcf.sql │ │ ├── sample-call-counts.sql │ │ ├── missingness-brca1.sql │ │ └── missingness-udf.sql │ ├── comparing-pgp-to-1000genomes │ │ ├── genotype-counts.sql │ │ ├── sample-counts-minmax-by-chromosome.sql │ │ ├── variant-counts-by-chromosome.sql │ │ ├── parsed-genotype-counts.sql │ │ ├── taking-a-closer-look-at-variant-types.sql │ │ └── variant-counts-by-type-and-chromosome.sql │ ├── cgi_variants │ │ ├── klotho.sql │ │ ├── allelic-frequency-comparison.sql │ │ ├── allele-count.sql │ │ ├── ti-tv-ratio.sql │ │ ├── allelic-frequency.py │ │ ├── allelic-frequency-chr1.sql │ │ └── allelic-frequency-brca1.sql │ ├── gender-count.sql │ ├── issues-with-the-variant-centric-approach │ │ ├── klotho-summary.sql │ │ ├── factor-v-leiden.sql │ │ └── factor-v-leiden-summary.sql │ ├── gvcf_variants_expanded │ │ ├── klotho.sql │ │ ├── ti-tv-ratio.sql │ │ └── allelic-frequency.sql │ └── gvcf_variants │ │ ├── klotho.sql │ │ ├── allelic-frequency-comparison.sql │ │ ├── allelic-frequency.py │ │ ├── allele-count.sql │ │ ├── ti-tv-ratio.sql │ │ ├── allelic-frequency-brca1-no-udf.sql │ │ ├── allelic-frequency-chr1.sql │ │ └── allelic-frequency-brca1.sql ├── provenance │ ├── gvcf-expand-mapper.py │ ├── gvcf-expand-reducer.py │ ├── cgi-header-mapper.py │ ├── 
cgi-mapper.py │ └── cgi-ref-blocks-mapper.py ├── README.md └── README.Rmd ├── 1000genomes ├── .gitignore ├── data-stories │ ├── exploring-the-variant-data │ │ ├── .gitignore │ │ └── figure │ │ │ ├── unnamed-chunk-11-1.png │ │ │ ├── unnamed-chunk-13-1.png │ │ │ ├── unnamed-chunk-15-1.png │ │ │ ├── unnamed-chunk-16-1.png │ │ │ ├── unnamed-chunk-3-1.png │ │ │ ├── unnamed-chunk-5-1.png │ │ │ ├── unnamed-chunk-7-1.png │ │ │ └── unnamed-chunk-9-1.png │ ├── exploring-the-phenotypic-data │ │ ├── .gitignore │ │ └── figure │ │ │ ├── families.png │ │ │ ├── gender.png │ │ │ ├── samples.png │ │ │ ├── superpop.png │ │ │ ├── ethnicity.png │ │ │ └── ethnicity and gender.png │ ├── reproducing-hardy-weinberg-equilibrium │ │ ├── .gitignore │ │ └── README.Rmd │ ├── reproducing-vcfstats │ │ └── vcfstats-output │ │ │ ├── stats.qual-tstv │ │ │ ├── stats.tstv │ │ │ ├── stats.legend │ │ │ ├── stats.shared │ │ │ └── stats.private │ ├── reproducing-allelic-frequencies │ │ ├── figure │ │ │ ├── maf.png │ │ │ ├── all variants.png │ │ │ ├── viz maf no X.png │ │ │ └── common variants by gender.png │ │ └── README.Rmd │ └── README.md ├── figure │ ├── dbSNP Variants-1.png │ ├── shared Variants-1.png │ ├── shared variants by pop-1.png │ ├── shared common variants by pop-1.png │ ├── shared rare variants by pop-1.png │ ├── shared rare variants by percent pop-1.png │ └── shared common variants by percent pop-1.png ├── sql │ ├── variant-counts-by-type.sql │ ├── phenotype_sql │ │ ├── num-samples.sql │ │ ├── family-sizes.sql │ │ ├── gender-ratio.sql │ │ ├── ethnicity-by-gender-ratio.sql │ │ ├── ethnicity-by-superpop-ratio.sql │ │ └── ethnicity-ratio.sql │ ├── reproducing-vcfstats │ │ ├── variant-count-brca1.sql │ │ ├── variant-counts-by-type-brca1.sql │ │ ├── snp-variant-counts-brca1.sql │ │ ├── sample-snp-counts-brca1.sql │ │ ├── sample-indel-counts-brca1.sql │ │ ├── shared-variant-counts-brca1.sql │ │ ├── indel-length-counts-brca1.sql │ │ ├── private-variant-counts-brca1.sql │ │ ├── 
variant-sample-counts-brca1.sql │ │ └── ti-tv-ratio-brca1.sql │ ├── variant-counts-by-type-and-chromosome.sql │ ├── snp-variant-counts.sql │ ├── understanding-alternate-alleles │ │ ├── chrom-pos-ref-dups.sql │ │ ├── minimal-unique-key.sql │ │ ├── three-chrom-pos-ref-dups.sql │ │ ├── count-chrom-pos-ref.sql │ │ ├── unique-key.sql │ │ ├── not-quite-unique-key.sql │ │ ├── sample-likelihood.sql │ │ ├── count-by-var-type-chrom-pos-ref-dups.sql │ │ ├── count-by-var-type-chrom-pos-ref-singles.sql │ │ └── sample-chrom-pos-ref-dups.sql │ ├── ratio-of-variants-by-type.sql │ ├── variant-level-data-for-brca1.sql │ ├── ratio-of-dbsnp-variants-by-chromosome.sql │ ├── indel-length-counts.sql │ ├── private-variant-counts.sql │ ├── shared-variant-counts.sql │ ├── sample-variant-counts-by-type-and-chromosome.sql │ ├── sample-level-data-for-brca1.sql │ ├── ti-tv-ratio.sql │ ├── heterozygous-homozygous-ratio.sql │ ├── minimum-allelic-frequency-by-ethnicity.sql │ ├── variant-hotspots.sql │ ├── sample-variant-hotspots.sql │ ├── minimum-allelic-frequency-by-ethnicity-no-sex-chromosomes.sql │ ├── allelic-frequency.sql │ ├── reproducing-allelic-frequencies │ │ ├── reproducing-allelic-frequency.sql │ │ └── reproducing-allelic-frequency-by-ethnicity.sql │ ├── allelic-frequency-by-gender.sql │ ├── allelic-frequency-by-ethnicity.sql │ ├── gender-het-hom-ratio.sql │ ├── shared-variant-counts-by-ethnicity.sql │ ├── gwas-pattern-two-proportion-z-test.sql │ ├── gwas-pattern-chi-squared-test.sql │ └── hardy-weinberg-equilibrium.sql └── provenance │ └── README.md ├── .gitignore ├── platinumGenomes ├── figure │ └── function-1.png ├── sql │ ├── sample-snps-by-exonic-function.sql │ ├── cohort-rare-pathenogenic-snps.sql │ └── sample-rare-pathenogenic-snps.sql └── README.Rmd ├── 1000genomes_phase3 ├── figure │ ├── titv_metrics-1.png │ ├── titv_metrics-2.png │ ├── hethom_metrics-1.png │ ├── hethom_metrics-2.png │ ├── indel_metrics-1.png │ └── indel_metrics-2.png ├── README.Rmd └── sql │ └── qc-metrics.sql 
├── annotations └── README.md ├── README.md ├── CONTRIBUTING.rst └── sgdp └── provenance └── wrangle-simons-sample-attributes.R /pgp/.gitignore: -------------------------------------------------------------------------------- 1 | .httr-oauth 2 | -------------------------------------------------------------------------------- /1000genomes/.gitignore: -------------------------------------------------------------------------------- 1 | .httr-oauth 2 | -------------------------------------------------------------------------------- /pgp/data-stories/schema-comparisons/.gitignore: -------------------------------------------------------------------------------- 1 | .httr-oauth 2 | -------------------------------------------------------------------------------- /pgp/data-stories/comparing-pgp-to-1000genomes/.gitignore: -------------------------------------------------------------------------------- 1 | .httr-oauth 2 | -------------------------------------------------------------------------------- /1000genomes/data-stories/exploring-the-variant-data/.gitignore: -------------------------------------------------------------------------------- 1 | .httr-oauth 2 | -------------------------------------------------------------------------------- /1000genomes/data-stories/exploring-the-phenotypic-data/.gitignore: -------------------------------------------------------------------------------- 1 | .httr-oauth 2 | -------------------------------------------------------------------------------- /pgp/data-stories/issues-with-the-variant-centric-approach/.gitignore: -------------------------------------------------------------------------------- 1 | .httr-oauth 2 | -------------------------------------------------------------------------------- /1000genomes/data-stories/reproducing-hardy-weinberg-equilibrium/.gitignore: -------------------------------------------------------------------------------- 1 | .httr-oauth 2 | 
-------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | cache 2 | *.Rproj 3 | .Rproj.user 4 | .Rhistory 5 | .RData 6 | *html 7 | .httr-oauth 8 | -------------------------------------------------------------------------------- /pgp/figure/gender-1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/googlegenomics/bigquery-examples/master/pgp/figure/gender-1.png -------------------------------------------------------------------------------- /1000genomes/data-stories/reproducing-vcfstats/vcfstats-output/stats.qual-tstv: -------------------------------------------------------------------------------- 1 | #Quality Marginal count Marginal Ts/Tv 2 | -------------------------------------------------------------------------------- /1000genomes/data-stories/reproducing-vcfstats/vcfstats-output/stats.tstv: -------------------------------------------------------------------------------- 1 | #Transitions Transversions ts/tv Sample 2 | 615 228 2.70 all 3 | -------------------------------------------------------------------------------- /1000genomes/figure/dbSNP Variants-1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/googlegenomics/bigquery-examples/master/1000genomes/figure/dbSNP Variants-1.png -------------------------------------------------------------------------------- /platinumGenomes/figure/function-1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/googlegenomics/bigquery-examples/master/platinumGenomes/figure/function-1.png -------------------------------------------------------------------------------- /1000genomes/figure/shared Variants-1.png: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/googlegenomics/bigquery-examples/master/1000genomes/figure/shared Variants-1.png -------------------------------------------------------------------------------- /1000genomes_phase3/figure/titv_metrics-1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/googlegenomics/bigquery-examples/master/1000genomes_phase3/figure/titv_metrics-1.png -------------------------------------------------------------------------------- /1000genomes_phase3/figure/titv_metrics-2.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/googlegenomics/bigquery-examples/master/1000genomes_phase3/figure/titv_metrics-2.png -------------------------------------------------------------------------------- /1000genomes_phase3/figure/hethom_metrics-1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/googlegenomics/bigquery-examples/master/1000genomes_phase3/figure/hethom_metrics-1.png -------------------------------------------------------------------------------- /1000genomes_phase3/figure/hethom_metrics-2.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/googlegenomics/bigquery-examples/master/1000genomes_phase3/figure/hethom_metrics-2.png -------------------------------------------------------------------------------- /1000genomes_phase3/figure/indel_metrics-1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/googlegenomics/bigquery-examples/master/1000genomes_phase3/figure/indel_metrics-1.png -------------------------------------------------------------------------------- /1000genomes_phase3/figure/indel_metrics-2.png: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/googlegenomics/bigquery-examples/master/1000genomes_phase3/figure/indel_metrics-2.png -------------------------------------------------------------------------------- /1000genomes/figure/shared variants by pop-1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/googlegenomics/bigquery-examples/master/1000genomes/figure/shared variants by pop-1.png -------------------------------------------------------------------------------- /1000genomes/figure/shared common variants by pop-1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/googlegenomics/bigquery-examples/master/1000genomes/figure/shared common variants by pop-1.png -------------------------------------------------------------------------------- /1000genomes/figure/shared rare variants by pop-1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/googlegenomics/bigquery-examples/master/1000genomes/figure/shared rare variants by pop-1.png -------------------------------------------------------------------------------- /pgp/data-stories/schema-comparisons/figure/call_cnt.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/googlegenomics/bigquery-examples/master/pgp/data-stories/schema-comparisons/figure/call_cnt.png -------------------------------------------------------------------------------- /pgp/data-stories/schema-comparisons/figure/variant_cnt.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/googlegenomics/bigquery-examples/master/pgp/data-stories/schema-comparisons/figure/variant_cnt.png -------------------------------------------------------------------------------- /1000genomes/figure/shared rare variants by percent pop-1.png: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/googlegenomics/bigquery-examples/master/1000genomes/figure/shared rare variants by percent pop-1.png -------------------------------------------------------------------------------- /1000genomes/figure/shared common variants by percent pop-1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/googlegenomics/bigquery-examples/master/1000genomes/figure/shared common variants by percent pop-1.png -------------------------------------------------------------------------------- /pgp/data-stories/schema-comparisons/figure/sample_call_cnt.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/googlegenomics/bigquery-examples/master/pgp/data-stories/schema-comparisons/figure/sample_call_cnt.png -------------------------------------------------------------------------------- /pgp/data-stories/schema-comparisons/figure/sample_variant_cnt.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/googlegenomics/bigquery-examples/master/pgp/data-stories/schema-comparisons/figure/sample_variant_cnt.png -------------------------------------------------------------------------------- /1000genomes/data-stories/reproducing-allelic-frequencies/figure/maf.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/googlegenomics/bigquery-examples/master/1000genomes/data-stories/reproducing-allelic-frequencies/figure/maf.png -------------------------------------------------------------------------------- /1000genomes/data-stories/exploring-the-phenotypic-data/figure/families.png: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/googlegenomics/bigquery-examples/master/1000genomes/data-stories/exploring-the-phenotypic-data/figure/families.png -------------------------------------------------------------------------------- /1000genomes/data-stories/exploring-the-phenotypic-data/figure/gender.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/googlegenomics/bigquery-examples/master/1000genomes/data-stories/exploring-the-phenotypic-data/figure/gender.png -------------------------------------------------------------------------------- /1000genomes/data-stories/exploring-the-phenotypic-data/figure/samples.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/googlegenomics/bigquery-examples/master/1000genomes/data-stories/exploring-the-phenotypic-data/figure/samples.png -------------------------------------------------------------------------------- /1000genomes/data-stories/exploring-the-phenotypic-data/figure/superpop.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/googlegenomics/bigquery-examples/master/1000genomes/data-stories/exploring-the-phenotypic-data/figure/superpop.png -------------------------------------------------------------------------------- /pgp/data-stories/comparing-pgp-to-1000genomes/figure/variant counts-1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/googlegenomics/bigquery-examples/master/pgp/data-stories/comparing-pgp-to-1000genomes/figure/variant counts-1.png -------------------------------------------------------------------------------- /1000genomes/data-stories/exploring-the-phenotypic-data/figure/ethnicity.png: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/googlegenomics/bigquery-examples/master/1000genomes/data-stories/exploring-the-phenotypic-data/figure/ethnicity.png -------------------------------------------------------------------------------- /pgp/data-stories/comparing-pgp-to-1000genomes/figure/genotype heatmap-1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/googlegenomics/bigquery-examples/master/pgp/data-stories/comparing-pgp-to-1000genomes/figure/genotype heatmap-1.png -------------------------------------------------------------------------------- /pgp/data-stories/comparing-pgp-to-1000genomes/figure/variant type counts-1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/googlegenomics/bigquery-examples/master/pgp/data-stories/comparing-pgp-to-1000genomes/figure/variant type counts-1.png -------------------------------------------------------------------------------- /1000genomes/data-stories/exploring-the-variant-data/figure/unnamed-chunk-11-1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/googlegenomics/bigquery-examples/master/1000genomes/data-stories/exploring-the-variant-data/figure/unnamed-chunk-11-1.png -------------------------------------------------------------------------------- /1000genomes/data-stories/exploring-the-variant-data/figure/unnamed-chunk-13-1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/googlegenomics/bigquery-examples/master/1000genomes/data-stories/exploring-the-variant-data/figure/unnamed-chunk-13-1.png -------------------------------------------------------------------------------- /1000genomes/data-stories/exploring-the-variant-data/figure/unnamed-chunk-15-1.png: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/googlegenomics/bigquery-examples/master/1000genomes/data-stories/exploring-the-variant-data/figure/unnamed-chunk-15-1.png -------------------------------------------------------------------------------- /1000genomes/data-stories/exploring-the-variant-data/figure/unnamed-chunk-16-1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/googlegenomics/bigquery-examples/master/1000genomes/data-stories/exploring-the-variant-data/figure/unnamed-chunk-16-1.png -------------------------------------------------------------------------------- /1000genomes/data-stories/exploring-the-variant-data/figure/unnamed-chunk-3-1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/googlegenomics/bigquery-examples/master/1000genomes/data-stories/exploring-the-variant-data/figure/unnamed-chunk-3-1.png -------------------------------------------------------------------------------- /1000genomes/data-stories/exploring-the-variant-data/figure/unnamed-chunk-5-1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/googlegenomics/bigquery-examples/master/1000genomes/data-stories/exploring-the-variant-data/figure/unnamed-chunk-5-1.png -------------------------------------------------------------------------------- /1000genomes/data-stories/exploring-the-variant-data/figure/unnamed-chunk-7-1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/googlegenomics/bigquery-examples/master/1000genomes/data-stories/exploring-the-variant-data/figure/unnamed-chunk-7-1.png -------------------------------------------------------------------------------- /1000genomes/data-stories/exploring-the-variant-data/figure/unnamed-chunk-9-1.png: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/googlegenomics/bigquery-examples/master/1000genomes/data-stories/exploring-the-variant-data/figure/unnamed-chunk-9-1.png -------------------------------------------------------------------------------- /1000genomes/data-stories/reproducing-allelic-frequencies/figure/all variants.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/googlegenomics/bigquery-examples/master/1000genomes/data-stories/reproducing-allelic-frequencies/figure/all variants.png -------------------------------------------------------------------------------- /1000genomes/data-stories/reproducing-allelic-frequencies/figure/viz maf no X.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/googlegenomics/bigquery-examples/master/1000genomes/data-stories/reproducing-allelic-frequencies/figure/viz maf no X.png -------------------------------------------------------------------------------- /pgp/data-stories/comparing-pgp-to-1000genomes/figure/pgp variant type counts-1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/googlegenomics/bigquery-examples/master/pgp/data-stories/comparing-pgp-to-1000genomes/figure/pgp variant type counts-1.png -------------------------------------------------------------------------------- /1000genomes/data-stories/exploring-the-phenotypic-data/figure/ethnicity and gender.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/googlegenomics/bigquery-examples/master/1000genomes/data-stories/exploring-the-phenotypic-data/figure/ethnicity and gender.png -------------------------------------------------------------------------------- 
/1000genomes/data-stories/reproducing-allelic-frequencies/figure/common variants by gender.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/googlegenomics/bigquery-examples/master/1000genomes/data-stories/reproducing-allelic-frequencies/figure/common variants by gender.png -------------------------------------------------------------------------------- /1000genomes/sql/variant-counts-by-type.sql: -------------------------------------------------------------------------------- 1 | # Count the number of variants across the entirety of 1,000 Genomes by variant type. 2 | SELECT 3 | vt, 4 | COUNT(vt) as cnt, 5 | FROM 6 | [genomics-public-data:1000_genomes.variants] 7 | GROUP BY 8 | vt 9 | ORDER BY 10 | vt 11 | -------------------------------------------------------------------------------- /1000genomes/sql/phenotype_sql/num-samples.sql: -------------------------------------------------------------------------------- 1 | # Count the number of samples in the phenotypic data 2 | SELECT 3 | COUNT(sample) AS all_samples, 4 | SUM(IF(In_Phase1_Integrated_Variant_Set = TRUE, 1, 0)) AS samples_in_variants_table 5 | FROM 6 | [genomics-public-data:1000_genomes.sample_info] 7 | -------------------------------------------------------------------------------- /1000genomes/sql/reproducing-vcfstats/variant-count-brca1.sql: -------------------------------------------------------------------------------- 1 | # Count the number of variants in BRCA1 2 | SELECT 3 | count(reference_name) as num_variants, 4 | FROM 5 | [genomics-public-data:1000_genomes.variants] 6 | WHERE 7 | reference_name = '17' 8 | AND start BETWEEN 41196311 9 | AND 41277499 -------------------------------------------------------------------------------- /1000genomes/sql/reproducing-vcfstats/variant-counts-by-type-brca1.sql: -------------------------------------------------------------------------------- 1 | # Count the number of variants by type in 
BRCA1. 2 | SELECT 3 | vt AS variant_type, 4 | COUNT(vt) AS num_variants_of_type, 5 | FROM 6 | [genomics-public-data:1000_genomes.variants] 7 | WHERE 8 | reference_name = '17' 9 | AND start BETWEEN 41196311 10 | AND 41277499 11 | GROUP BY 12 | variant_type -------------------------------------------------------------------------------- /pgp/sql/schema-comparisons/record-sample-counts.sql: -------------------------------------------------------------------------------- 1 | # Confirm that we are correctly expanding reference-matching blocks into our variants. 2 | SELECT 3 | MAX(num_sample_ids) as max_samples_per_record, 4 | FROM ( 5 | SELECT 6 | COUNT(call.callset_name) WITHIN RECORD AS num_sample_ids, 7 | FROM 8 | [google.com:biggene:test.pgp_gvcf_variants_expanded2] 9 | ) 10 | -------------------------------------------------------------------------------- /1000genomes/sql/variant-counts-by-type-and-chromosome.sql: -------------------------------------------------------------------------------- 1 | # Count the number of variants across the entirety of 1,000 Genomes by variant type and 2 | # chromosome. 
3 | SELECT 4 | reference_name, 5 | vt, 6 | COUNT(vt) AS cnt, 7 | FROM 8 | [genomics-public-data:1000_genomes.variants] 9 | GROUP BY 10 | reference_name, 11 | vt 12 | ORDER BY 13 | reference_name, 14 | vt 15 | -------------------------------------------------------------------------------- /1000genomes/sql/snp-variant-counts.sql: -------------------------------------------------------------------------------- 1 | # Count SNPs by base pair transition across the dataset 2 | SELECT 3 | reference_bases, 4 | alternate_bases AS allele, 5 | COUNT(alternate_bases) AS num_snps 6 | FROM 7 | [genomics-public-data:1000_genomes.variants] 8 | WHERE 9 | vt ='SNP' 10 | GROUP BY 11 | reference_bases, 12 | allele 13 | ORDER BY 14 | reference_bases, 15 | allele 16 | -------------------------------------------------------------------------------- /pgp/sql/comparing-pgp-to-1000genomes/genotype-counts.sql: -------------------------------------------------------------------------------- 1 | # Count the number of genotypes for all individuals in the dataset. 2 | SELECT 3 | genotype, 4 | COUNT(genotype) AS cnt, 5 | FROM ( 6 | SELECT 7 | GROUP_CONCAT(STRING(call.genotype)) WITHIN call AS genotype, 8 | FROM 9 | [google.com:biggene:pgp_20150205.genome_calls]) 10 | GROUP BY 11 | genotype 12 | ORDER BY 13 | cnt DESC 14 | -------------------------------------------------------------------------------- /1000genomes/data-stories/reproducing-vcfstats/vcfstats-output/stats.legend: -------------------------------------------------------------------------------- 1 | 2 | count 3 | Number of positions with known genotype 4 | 5 | nalt_X 6 | Number of monoallelic (X=0), biallelic (X=1), etc. 
sites 7 | 8 | ref, ref_count 9 | Number of sites containing reference allele 10 | 11 | shared 12 | Number of sites having a non-reference allele in 0,1,2,etc samples 13 | 14 | snp_count 15 | Number of positions with SNPs 16 | -------------------------------------------------------------------------------- /1000genomes/sql/phenotype_sql/family-sizes.sql: -------------------------------------------------------------------------------- 1 | # Compute the distribution of family sizes 2 | SELECT 3 | num_family_members AS family_size, 4 | COUNT(num_family_members) AS num_families_of_size 5 | FROM ( 6 | SELECT 7 | family_id, 8 | COUNT(family_id) AS num_family_members, 9 | FROM 10 | [genomics-public-data:1000_genomes.sample_info] 11 | WHERE 12 | In_Phase1_Integrated_Variant_Set = TRUE 13 | GROUP BY 14 | family_id) 15 | GROUP BY 16 | family_size -------------------------------------------------------------------------------- /1000genomes/sql/phenotype_sql/gender-ratio.sql: -------------------------------------------------------------------------------- 1 | # Compute sample count and ratio by gender 2 | SELECT 3 | gender, 4 | gender_count, 5 | RATIO_TO_REPORT(gender_count) 6 | OVER 7 | ( 8 | ORDER BY 9 | gender_count) AS gender_ratio 10 | FROM ( 11 | SELECT 12 | gender, 13 | COUNT(gender) AS gender_count, 14 | FROM 15 | [genomics-public-data:1000_genomes.sample_info] 16 | WHERE 17 | In_Phase1_Integrated_Variant_Set = TRUE 18 | GROUP BY 19 | gender) -------------------------------------------------------------------------------- /pgp/sql/cgi_variants/klotho.sql: -------------------------------------------------------------------------------- 1 | # Sample level data for Klotho variant rs9536314 for use in the "amazing 2 | # intelligence of PGP participants" data story. 
3 | SELECT 4 | sample_id, 5 | chromosome, 6 | locusBegin, 7 | locusEnd, 8 | reference, 9 | allele1Seq, 10 | allele2Seq, 11 | FROM 12 | [google.com:biggene:pgp.cgi_variants] 13 | WHERE 14 | chromosome = "chr13" 15 | AND locusBegin <= 33628137 16 | AND locusEnd >= 33628138 17 | ORDER BY 18 | sample_id 19 | -------------------------------------------------------------------------------- /pgp/sql/gender-count.sql: -------------------------------------------------------------------------------- 1 | # Compute sample count by gender 2 | SELECT 3 | Sex_Gender, 4 | COUNT(1) AS cnt 5 | FROM 6 | ( 7 | SELECT 8 | call.callset_name, 9 | Sex_Gender 10 | FROM 11 | FLATTEN([google.com:biggene:pgp.variants], 12 | call) AS var 13 | JOIN 14 | [google.com:biggene:pgp.phenotypes] AS pheno 15 | ON 16 | pheno.Participant = var.call.callset_name 17 | GROUP BY 18 | call.callset_name, 19 | Sex_Gender) 20 | GROUP BY 21 | Sex_Gender -------------------------------------------------------------------------------- /1000genomes/sql/reproducing-vcfstats/snp-variant-counts-brca1.sql: -------------------------------------------------------------------------------- 1 | # Count SNPs by base pair transition across BRCA1. 
2 | SELECT 3 | reference_bases, 4 | alternate_bases AS allele, 5 | COUNT(alternate_bases) AS num_snps 6 | FROM 7 | [genomics-public-data:1000_genomes.variants] 8 | WHERE 9 | reference_name = '17' 10 | AND start BETWEEN 41196311 11 | AND 41277499 12 | AND vt ='SNP' 13 | GROUP BY 14 | reference_bases, 15 | allele 16 | ORDER BY 17 | reference_bases, 18 | allele -------------------------------------------------------------------------------- /1000genomes/sql/understanding-alternate-alleles/chrom-pos-ref-dups.sql: -------------------------------------------------------------------------------- 1 | # Find variants on chromosome 17 that share the same start position and the same reference base 2 | SELECT 3 | reference_name, 4 | start, 5 | reference_bases, 6 | COUNT(start) AS num_alternates 7 | FROM 8 | [genomics-public-data:1000_genomes.variants] 9 | WHERE 10 | reference_name = '17' 11 | GROUP BY 12 | reference_name, 13 | start, 14 | reference_bases 15 | HAVING 16 | num_alternates > 1 17 | ORDER BY 18 | reference_name, 19 | start, 20 | reference_bases -------------------------------------------------------------------------------- /1000genomes/sql/reproducing-vcfstats/sample-snp-counts-brca1.sql: -------------------------------------------------------------------------------- 1 | # Sample SNP counts for BRCA1. 
2 | SELECT 3 | COUNT(sample_id) AS variant_count, 4 | sample_id 5 | FROM ( 6 | SELECT 7 | reference_name, 8 | start, 9 | reference_bases, 10 | call.call_set_name AS sample_id 11 | FROM 12 | [genomics-public-data:1000_genomes.variants] 13 | WHERE 14 | reference_name = '17' 15 | AND start BETWEEN 41196311 16 | AND 41277499 17 | AND vt ='SNP' 18 | AND (0 < call.genotype) 19 | ) 20 | GROUP BY 21 | sample_id 22 | ORDER BY 23 | sample_id 24 | -------------------------------------------------------------------------------- /1000genomes/sql/understanding-alternate-alleles/minimal-unique-key.sql: -------------------------------------------------------------------------------- 1 | # This query demonstrates the minimal set of fields needed to 2 | # comprise a unique key for the rows in the table. 3 | SELECT 4 | reference_name, 5 | start, 6 | alt, 7 | end, 8 | COUNT(1) AS cnt 9 | FROM ( 10 | SELECT 11 | reference_name, 12 | start, 13 | GROUP_CONCAT(alternate_bases) WITHIN RECORD AS alt, 14 | end, 15 | FROM 16 | [genomics-public-data:1000_genomes.variants]) 17 | GROUP EACH BY 18 | reference_name, 19 | start, 20 | alt, 21 | end 22 | HAVING 23 | cnt > 1 -------------------------------------------------------------------------------- /1000genomes/sql/understanding-alternate-alleles/three-chrom-pos-ref-dups.sql: -------------------------------------------------------------------------------- 1 | # Get three particular start positions on chromosome 17 that have alternate variants. 
2 | SELECT 3 | reference_name, 4 | start, 5 | reference_bases, 6 | GROUP_CONCAT(alternate_bases) WITHIN RECORD AS alt, 7 | GROUP_CONCAT(names) WITHIN RECORD AS names, 8 | vt, 9 | FROM 10 | [genomics-public-data:1000_genomes.variants] 11 | WHERE 12 | reference_name = '17' 13 | AND (start = 48515942 14 | OR start = 48570613 15 | OR start = 48659342) 16 | ORDER BY 17 | start, 18 | reference_bases, 19 | alt 20 | -------------------------------------------------------------------------------- /pgp/sql/issues-with-the-variant-centric-approach/klotho-summary.sql: -------------------------------------------------------------------------------- 1 | # Sample counts for Klotho variant rs9536314 for use in the "amazing 2 | # intelligence of PGP participants" data story. 3 | SELECT 4 | COUNT(sample_id) AS sample_counts, 5 | chromosome, 6 | reference, 7 | allele1Seq, 8 | allele2Seq, 9 | FROM 10 | [google.com:biggene:pgp.cgi_variants] 11 | WHERE 12 | chromosome = "chr13" 13 | AND locusBegin <= 33628137 14 | AND locusEnd >= 33628138 15 | GROUP BY 16 | chromosome, 17 | reference, 18 | allele1Seq, 19 | allele2Seq 20 | ORDER BY 21 | sample_counts DESC 22 | -------------------------------------------------------------------------------- /1000genomes/sql/understanding-alternate-alleles/count-chrom-pos-ref.sql: -------------------------------------------------------------------------------- 1 | # Count number of alternate variants on chromosome 17 for the same start and 2 | # reference base 3 | SELECT 4 | num_alternates, 5 | COUNT(num_alternates) AS num_records 6 | FROM ( 7 | SELECT 8 | reference_name, 9 | start, 10 | reference_bases, 11 | COUNT(start) AS num_alternates, 12 | FROM 13 | [genomics-public-data:1000_genomes.variants] 14 | WHERE 15 | reference_name = '17' 16 | GROUP BY 17 | reference_name, 18 | start, 19 | reference_bases) 20 | GROUP BY 21 | num_alternates 22 | -------------------------------------------------------------------------------- 
/1000genomes/sql/ratio-of-variants-by-type.sql: -------------------------------------------------------------------------------- 1 | # Compute the ratios of variants by type for each chromosome. 2 | SELECT 3 | reference_name, 4 | vt AS variant_type, 5 | RATIO_TO_REPORT(variant_count) 6 | OVER 7 | ( 8 | PARTITION BY 9 | reference_name 10 | ORDER BY 11 | variant_count DESC) ratio_of_variants_of_type_for_reference_name, 12 | FROM ( 13 | SELECT 14 | reference_name, 15 | vt, 16 | COUNT(vt) AS variant_count 17 | FROM 18 | [genomics-public-data:1000_genomes.variants] 19 | GROUP BY 20 | reference_name, 21 | vt 22 | ORDER BY 23 | reference_name, 24 | vt) 25 | -------------------------------------------------------------------------------- /1000genomes/sql/phenotype_sql/ethnicity-by-gender-ratio.sql: -------------------------------------------------------------------------------- 1 | # Count of sample_info rows per (population, gender) pair and the ratio of that count within its population 2 | SELECT 3 | population, 4 | gender, 5 | population_count, 6 | RATIO_TO_REPORT(population_count) OVER( 7 | PARTITION BY 8 | population 9 | ORDER BY 10 | gender) 11 | AS population_ratio 12 | from( 13 | SELECT 14 | gender, 15 | population, 16 | COUNT(population) AS population_count, 17 | FROM 18 | [genomics-public-data:1000_genomes.sample_info] 19 | WHERE 20 | In_Phase1_Integrated_Variant_Set = TRUE 21 | GROUP BY 22 | gender, 23 | population) 24 | ORDER BY 25 | population, 26 | gender 27 | -------------------------------------------------------------------------------- /1000genomes/sql/variant-level-data-for-brca1.sql: -------------------------------------------------------------------------------- 1 | #standardSQL 2 | -- 3 | -- Retrieve variant-level information for BRCA1 variants. 
4 | -- 5 | SELECT 6 | reference_name, 7 | start, 8 | `end`, 9 | reference_bases, 10 | ARRAY_TO_STRING(v.alternate_bases, ',') AS alts, 11 | quality, 12 | ARRAY_TO_STRING(v.filter, ',') AS filter, 13 | ARRAY_TO_STRING(v.names, ',') AS names, 14 | vt, 15 | ARRAY_LENGTH(v.call) AS num_samples 16 | FROM 17 | `genomics-public-data.1000_genomes.variants` v 18 | WHERE 19 | reference_name IN ('17', 'chr17') 20 | AND start BETWEEN 41196311 AND 41277499 # per GRCh37 21 | ORDER BY 22 | start, 23 | alts 24 | -------------------------------------------------------------------------------- /pgp/sql/comparing-pgp-to-1000genomes/sample-counts-minmax-by-chromosome.sql: -------------------------------------------------------------------------------- 1 | # Summarize the minimum and maximum number of samples per variant by chromosome. 2 | SELECT 3 | reference_name, 4 | MIN(sample_count) AS minimum_sample_count, 5 | MAX(sample_count) AS maximum_sample_count, 6 | FROM ( 7 | SELECT 8 | reference_name, 9 | COUNT(call.call_set_name) WITHIN RECORD AS sample_count 10 | FROM 11 | [google.com:biggene:pgp_20150205.genome_calls] 12 | # The source data was Complete Genomics which includes non-variant segments. 
13 | OMIT RECORD IF EVERY(alternate_bases IS NULL)) 14 | GROUP BY 15 | reference_name 16 | ORDER BY 17 | reference_name 18 | -------------------------------------------------------------------------------- /1000genomes/sql/phenotype_sql/ethnicity-by-superpop-ratio.sql: -------------------------------------------------------------------------------- 1 | # Count of sample_info rows per super population and the ratio of that count to the total over all super populations 2 | SELECT 3 | super_population, 4 | super_population_description, 5 | super_population_count, 6 | RATIO_TO_REPORT(super_population_count) 7 | OVER 8 | ( 9 | ORDER BY 10 | super_population_count) AS super_population_ratio 11 | from( 12 | SELECT 13 | super_population, 14 | super_population_description, 15 | COUNT(population) AS super_population_count, 16 | FROM 17 | [genomics-public-data:1000_genomes.sample_info] 18 | WHERE 19 | In_Phase1_Integrated_Variant_Set = TRUE 20 | GROUP BY 21 | super_population, 22 | super_population_description) -------------------------------------------------------------------------------- /1000genomes/sql/reproducing-vcfstats/sample-indel-counts-brca1.sql: -------------------------------------------------------------------------------- 1 | # Sample INDEL counts for BRCA1. 
2 | SELECT 3 | COUNT(sample_id) AS variant_count, 4 | sample_id, 5 | FROM ( 6 | SELECT 7 | call.call_set_name AS sample_id, 8 | NTH(1, 9 | call.genotype) WITHIN call AS first_allele, 10 | NTH(2, 11 | call.genotype) WITHIN call AS second_allele, 12 | FROM 13 | [genomics-public-data:1000_genomes.variants] 14 | WHERE 15 | reference_name = '17' 16 | AND start BETWEEN 41196311 17 | AND 41277499 18 | AND vt ='INDEL' 19 | HAVING 20 | 0 < first_allele 21 | OR 0 < second_allele) 22 | GROUP BY 23 | sample_id 24 | ORDER BY 25 | sample_id 26 | -------------------------------------------------------------------------------- /1000genomes/sql/understanding-alternate-alleles/unique-key.sql: -------------------------------------------------------------------------------- 1 | # This query demonstrates that an additional field, 'end', is needed to 2 | # comprise a unique key for the rows in the table. 3 | SELECT 4 | reference_name, 5 | start, 6 | reference_bases, 7 | alt, 8 | vt, 9 | end, 10 | COUNT(1) AS cnt 11 | FROM ( 12 | SELECT 13 | reference_name, 14 | start, 15 | reference_bases, 16 | GROUP_CONCAT(alternate_bases) WITHIN RECORD AS alt, 17 | vt, 18 | end, 19 | FROM 20 | [genomics-public-data:1000_genomes.variants]) 21 | GROUP EACH BY 22 | reference_name, 23 | start, 24 | reference_bases, 25 | alt, 26 | vt, 27 | end 28 | HAVING 29 | cnt > 1 -------------------------------------------------------------------------------- /1000genomes/sql/understanding-alternate-alleles/not-quite-unique-key.sql: -------------------------------------------------------------------------------- 1 | # This query demonstrates that some additional field is needed to 2 | # comprise a unique key for the rows in the table. 
3 | SELECT 4 | reference_name, 5 | start, 6 | reference_bases, 7 | alt, 8 | vt, 9 | COUNT(1) AS cnt 10 | FROM ( 11 | SELECT 12 | reference_name, 13 | start, 14 | reference_bases, 15 | GROUP_CONCAT(alternate_bases) WITHIN RECORD AS alt, 16 | vt, 17 | FROM 18 | [genomics-public-data:1000_genomes.variants]) 19 | GROUP EACH BY 20 | reference_name, 21 | start, 22 | reference_bases, 23 | alt, 24 | vt 25 | HAVING 26 | cnt > 1 27 | ORDER BY 28 | reference_name 29 | -------------------------------------------------------------------------------- /1000genomes/sql/ratio-of-dbsnp-variants-by-chromosome.sql: -------------------------------------------------------------------------------- 1 | #standardSQL 2 | -- 3 | -- Get the proportion of variants (per chromosome) in the dataset 4 | -- that have been reported in the dbSNP database (version 132). 5 | -- 6 | WITH 7 | counts AS ( 8 | SELECT 9 | reference_name, 10 | COUNT(1) AS num_variants, 11 | COUNTIF(ARRAY_LENGTH(names) > 0) AS num_dbsnp_variants 12 | FROM 13 | `genomics-public-data.1000_genomes.variants` 14 | GROUP BY 15 | reference_name ) 16 | -- 17 | -- Compute the ratio. 
18 | SELECT 19 | reference_name, 20 | num_dbsnp_variants, 21 | num_variants, 22 | num_dbsnp_variants / num_variants AS frequency 23 | FROM 24 | counts 25 | ORDER BY 26 | num_variants DESC 27 | -------------------------------------------------------------------------------- /1000genomes/sql/indel-length-counts.sql: -------------------------------------------------------------------------------- 1 | # Count the number of INDELs differing from the reference allele by particular lengths 2 | SELECT 3 | length_difference, 4 | COUNT(length_difference) AS count_of_indels_with_length_difference, 5 | FROM ( 6 | SELECT 7 | reference_name, 8 | start, 9 | reference_bases, 10 | LENGTH(reference_bases) AS ref_length, 11 | alternate_bases AS allele, 12 | LENGTH(alternate_bases) AS allele_length, 13 | (LENGTH(alternate_bases) - LENGTH(reference_bases)) AS length_difference, 14 | FROM 15 | [genomics-public-data:1000_genomes.variants] 16 | WHERE 17 | vt ='INDEL' 18 | ) 19 | GROUP BY 20 | length_difference 21 | ORDER BY 22 | length_difference 23 | -------------------------------------------------------------------------------- /1000genomes/sql/private-variant-counts.sql: -------------------------------------------------------------------------------- 1 | # Compute the number of variants for a particular sample that are shared by 2 | # no other samples. 
3 | SELECT 4 | COUNT(sample_id) AS private_variants_count, 5 | sample_id 6 | FROM 7 | ( 8 | SELECT 9 | reference_name, 10 | start, 11 | reference_bases, 12 | IF(0 < call.genotype, 13 | call.call_set_name, 14 | NULL) AS sample_id, 15 | SUM(IF(0 < call.genotype, 16 | 1, 17 | 0)) WITHIN RECORD AS num_samples_with_variant 18 | FROM 19 | [genomics-public-data:1000_genomes.variants] 20 | HAVING 21 | num_samples_with_variant = 1 22 | AND sample_id IS NOT NULL) 23 | GROUP EACH BY 24 | sample_id 25 | ORDER BY 26 | sample_id 27 | -------------------------------------------------------------------------------- /annotations/README.md: -------------------------------------------------------------------------------- 1 | Annotations 2 | ============ 3 | 4 | Tute Genomics has provided a table of annotations for hg19 SNPs. 5 | 6 | * For example queries, see the [Platinum Genomes Annotation JOINs](../platinumGenomes) 7 | data story for a few examples of how these tables can be used with variant data. 8 | * Please see [Google Genomics Public Data](http://googlegenomics.readthedocs.io/en/latest/use_cases/discover_public_data/tute_genomics_public_data.html) 9 | for more detail. 10 | 11 | A handful of other annotation databases have been loaded to BigQuery for 12 | prototyping purposes. See [provenance](./provenance) for details on the 13 | source of this data and how it may have been transformed prior to loading 14 | to BigQuery. 15 | -------------------------------------------------------------------------------- /pgp/sql/issues-with-the-variant-centric-approach/factor-v-leiden.sql: -------------------------------------------------------------------------------- 1 | # Sample level data for rs6025 and hereditary thrombophilia trait 2 | # for use in the Factor V Leiden data story. 
3 | SELECT 4 | sample_id, 5 | chromosome, 6 | locusBegin, 7 | locusEnd, 8 | reference, 9 | allele1Seq, 10 | allele2Seq, 11 | zygosity, 12 | has_Hereditary_thrombophilia_includes_Factor_V_Leiden_and_Prothrombin_G20210A AS has_Hereditary_thrombophilia 13 | FROM 14 | [google.com:biggene:pgp.cgi_variants] AS var 15 | LEFT OUTER JOIN 16 | [google.com:biggene:pgp.phenotypes] AS pheno 17 | ON 18 | pheno.Participant = var.sample_id 19 | WHERE 20 | chromosome = 'chr1' 21 | AND locusBegin <= 169519048 22 | AND locusEnd >= 169519049 23 | ORDER BY 24 | sample_id 25 | -------------------------------------------------------------------------------- /1000genomes/data-stories/README.md: -------------------------------------------------------------------------------- 1 | Data Stories 2 | ========================== 3 | 4 | The following sections demonstrate some interactive exploration within the 1,000 Genomes data set. 5 | 6 | * Getting Familiar with the Data 7 | * [Exploring the sample information data](./exploring-the-phenotypic-data) 8 | * [Exploring the variant data](./exploring-the-variant-data) 9 | * [Understanding Alternate Alleles in 1,000 Genomes](./understanding-alternate-alleles) 10 | * Comparisons to Common Tools and Research Results 11 | * [Reproducing the output of vcfstats](./reproducing-vcfstats) 12 | * [Reproducing Allelic Frequencies](./reproducing-allelic-frequencies) 13 | * [Reproducing the Hardy-Weinberg Equilibrium test](./reproducing-hardy-weinberg-equilibrium) 14 | 15 | -------------------------------------------------------------------------------- /1000genomes/sql/reproducing-vcfstats/shared-variant-counts-brca1.sql: -------------------------------------------------------------------------------- 1 | # Count the number of variants shared by none, shared by one sample, shared by 2 | # two samples, etc... 
in BRCA1 3 | SELECT 4 | num_samples_with_variant, 5 | COUNT(1) AS num_variants_shared_by_this_many_samples 6 | FROM ( 7 | SELECT 8 | reference_name, 9 | start, 10 | END, 11 | reference_bases, 12 | GROUP_CONCAT(alternate_bases) WITHIN RECORD AS alt, 13 | SUM(NOT EVERY(call.genotype <= 0)) WITHIN call AS num_samples_with_variant 14 | FROM 15 | [genomics-public-data:1000_genomes.variants] 16 | WHERE 17 | reference_name = '17' 18 | AND start BETWEEN 41196311 19 | AND 41277499 20 | ) 21 | GROUP BY 22 | num_samples_with_variant 23 | ORDER BY 24 | num_samples_with_variant -------------------------------------------------------------------------------- /1000genomes/sql/phenotype_sql/ethnicity-ratio.sql: -------------------------------------------------------------------------------- 1 | # Compute sample count and ratio by ethnicity 2 | SELECT 3 | population, 4 | population_description, 5 | population_count, 6 | RATIO_TO_REPORT(population_count) 7 | OVER 8 | ( 9 | ORDER BY 10 | population_count) AS population_ratio, 11 | super_population, 12 | super_population_description, 13 | from( 14 | SELECT 15 | population, 16 | population_description, 17 | super_population, 18 | super_population_description, 19 | COUNT(population) AS population_count, 20 | FROM 21 | [genomics-public-data:1000_genomes.sample_info] 22 | WHERE 23 | In_Phase1_Integrated_Variant_Set = TRUE 24 | GROUP BY 25 | population, 26 | population_description, 27 | super_population, 28 | super_population_description) 29 | -------------------------------------------------------------------------------- /1000genomes/sql/shared-variant-counts.sql: -------------------------------------------------------------------------------- 1 | #standardSQL 2 | -- 3 | -- Count the number of variants shared by none, shared by one sample, two samples, etc... 
4 | -- 5 | SELECT 6 | num_samples_with_variant, 7 | COUNT(1) AS num_variants_shared_by_this_many_samples 8 | FROM ( 9 | SELECT 10 | reference_name, 11 | start, 12 | `end`, 13 | reference_bases, 14 | alternate_bases[ORDINAL(1)] AS alt, -- 1000 Genomes is biallelic. 15 | (SELECT COUNTIF(EXISTS(SELECT gt 16 | FROM UNNEST(call.genotype) gt 17 | WHERE gt >= 1)) FROM v.call) AS num_samples_with_variant 18 | FROM 19 | `genomics-public-data.1000_genomes.variants` v 20 | WHERE 21 | reference_name NOT IN ("X", "Y", "MT")) 22 | GROUP BY 23 | num_samples_with_variant 24 | ORDER BY 25 | num_samples_with_variant 26 | -------------------------------------------------------------------------------- /1000genomes/sql/understanding-alternate-alleles/sample-likelihood.sql: -------------------------------------------------------------------------------- 1 | # Get data sufficient to make a judgment upon this particular sample's call (sample HG00100 at chromosome 17, start 48515942). 2 | SELECT 3 | reference_name, 4 | start, 5 | reference_bases AS ref, 6 | GROUP_CONCAT(alternate_bases) WITHIN RECORD AS alt, 7 | GROUP_CONCAT(filter) WITHIN RECORD AS filters, 8 | avgpost, 9 | rsq, 10 | vt, 11 | call.call_set_name AS sample_id, 12 | call.phaseset AS phaseset, 13 | NTH(1, call.genotype) WITHIN call AS first_allele, 14 | NTH(2, call.genotype) WITHIN call AS second_allele, 15 | call.ds AS ds, 16 | GROUP_CONCAT(STRING(call.genotype_likelihood)) WITHIN call AS likelihoods, 17 | FROM 18 | [genomics-public-data:1000_genomes.variants] 19 | WHERE 20 | reference_name = '17' 21 | AND start = 48515942 22 | HAVING 23 | sample_id = 'HG00100' 24 | ORDER BY 25 | alt 26 | -------------------------------------------------------------------------------- /pgp/data-stories/README.md: -------------------------------------------------------------------------------- 1 | Data Stories 2 | ========================== 3 | 4 | The following sections demonstrate some interactive exploration within the PGP dataset. 
5 | 6 | * Getting Familiar with the Data 7 | * [Comparing PGP to 1000 Genomes](./comparing-pgp-to-1000genomes) 8 | * [Issues with the Variant-Centric Approach](./issues-with-the-variant-centric-approach) 9 | * [A Comparison of Schemas and Data Encodings](./schema-comparisons) 10 | 11 | 12 | Have other data stories you would like to see here? Have any data stories you would like to *share*? Have *corrections to the biology* covered in this material? Have query *simplifications* or *speed improvements*? Let us know by [filing an issue](https://github.com/googlegenomics/bigquery-examples/issues) or [contacting us directly](mailto:google-genomics-contact@googlegroups.com). 13 | -------------------------------------------------------------------------------- /1000genomes/sql/reproducing-vcfstats/indel-length-counts-brca1.sql: -------------------------------------------------------------------------------- 1 | # Count the number of INDELs differing from the reference allele by particular 2 | # lengths for BRCA1. 
3 | SELECT 4 | length_difference, 5 | COUNT(length_difference) AS count_of_indels_with_length_difference, 6 | FROM ( 7 | SELECT 8 | reference_name, 9 | start, 10 | reference_bases, 11 | LENGTH(reference_bases) AS ref_length, 12 | alternate_bases AS allele, 13 | LENGTH(alternate_bases) AS allele_length, 14 | (LENGTH(alternate_bases) - LENGTH(reference_bases)) AS length_difference, 15 | FROM 16 | [genomics-public-data:1000_genomes.variants] 17 | WHERE 18 | reference_name = '17' 19 | AND start BETWEEN 41196311 20 | AND 41277499 21 | AND vt ='INDEL' 22 | ) 23 | GROUP BY 24 | length_difference 25 | ORDER BY 26 | length_difference -------------------------------------------------------------------------------- /1000genomes/sql/reproducing-vcfstats/private-variant-counts-brca1.sql: -------------------------------------------------------------------------------- 1 | # Compute the number of variants within BRCA1 for a particular sample that are shared by 2 | # no other samples. 3 | SELECT 4 | COUNT(sample_id) AS private_variants_count, 5 | sample_id 6 | FROM 7 | ( 8 | SELECT 9 | reference_name, 10 | start, 11 | reference_bases, 12 | IF(0 < call.genotype, 13 | call.call_set_name, 14 | NULL) AS sample_id, 15 | SUM(IF(0 < call.genotype, 16 | 1, 17 | 0)) WITHIN RECORD AS num_samples_with_variant 18 | FROM 19 | [genomics-public-data:1000_genomes.variants] 20 | WHERE 21 | reference_name = '17' 22 | AND start BETWEEN 41196311 23 | AND 41277499 24 | HAVING 25 | num_samples_with_variant = 1 26 | AND sample_id IS NOT NULL) 27 | GROUP EACH BY 28 | sample_id 29 | ORDER BY 30 | sample_id 31 | -------------------------------------------------------------------------------- /1000genomes/sql/sample-variant-counts-by-type-and-chromosome.sql: -------------------------------------------------------------------------------- 1 | # Count the number of variants for each sample across the entirety of the 1,000 2 | # Genomes dataset by variant type and chromosome. 
3 | SELECT 4 | reference_name, 5 | vt, 6 | sample_id, 7 | COUNT(sample_id) AS variant_count, 8 | FROM 9 | ( 10 | SELECT 11 | reference_name, 12 | vt, 13 | call.call_set_name AS sample_id, 14 | NTH(1, 15 | call.genotype) WITHIN call AS first_allele, 16 | NTH(2, 17 | call.genotype) WITHIN call AS second_allele, 18 | FROM 19 | [genomics-public-data:1000_genomes.variants] 20 | HAVING 21 | first_allele > 0 22 | OR (second_allele IS NOT NULL 23 | AND second_allele > 0)) 24 | GROUP BY 25 | sample_id, 26 | reference_name, 27 | vt 28 | ORDER BY 29 | reference_name, 30 | vt, 31 | variant_count, 32 | sample_id 33 | -------------------------------------------------------------------------------- /pgp/sql/issues-with-the-variant-centric-approach/factor-v-leiden-summary.sql: -------------------------------------------------------------------------------- 1 | # Summary data for rs6025 and hereditary thrombophilia trait 2 | # for use in the Factor V Leiden data story. 3 | SELECT 4 | COUNT(sample_id) AS sample_counts, 5 | chromosome, 6 | reference, 7 | allele1Seq, 8 | allele2Seq, 9 | has_Hereditary_thrombophilia_includes_Factor_V_Leiden_and_Prothrombin_G20210A AS has_Hereditary_thrombophilia 10 | FROM 11 | [google.com:biggene:pgp.cgi_variants] AS var 12 | LEFT OUTER JOIN 13 | [google.com:biggene:pgp.phenotypes] AS pheno 14 | ON 15 | pheno.Participant = var.sample_id 16 | WHERE 17 | chromosome = 'chr1' 18 | AND locusBegin <= 169519048 19 | AND locusEnd >= 169519049 20 | GROUP BY 21 | chromosome, 22 | reference, 23 | allele1Seq, 24 | allele2Seq, 25 | has_Hereditary_thrombophilia 26 | ORDER BY 27 | sample_counts DESC 28 | -------------------------------------------------------------------------------- /pgp/sql/gvcf_variants_expanded/klotho.sql: -------------------------------------------------------------------------------- 1 | # Sample level data for Klotho variant rs9536314 for use in the "amazing 2 | # intelligence of PGP participants" data story, specifically joining two 3 | # 
tables to compare the different encodings. 4 | SELECT 5 | contig_name, 6 | start_pos, 7 | end_pos, 8 | END, 9 | ref, 10 | alt, 11 | sample_id, 12 | genotype 13 | FROM 14 | FLATTEN( 15 | SELECT 16 | contig_name, 17 | start_pos, 18 | end_pos, 19 | END, 20 | reference_bases AS ref, 21 | GROUP_CONCAT(alternate_bases) WITHIN RECORD AS alt, 22 | call.callset_name AS sample_id, 23 | GROUP_CONCAT(STRING(call.genotype), 24 | '/') WITHIN call AS genotype, 25 | FROM 26 | [google.com:biggene:test.pgp_gvcf_variants_expanded2] 27 | WHERE 28 | contig_name = '13' 29 | AND start_pos == 33628138 30 | , call) 31 | ORDER BY 32 | sample_id 33 | -------------------------------------------------------------------------------- /1000genomes/sql/sample-level-data-for-brca1.sql: -------------------------------------------------------------------------------- 1 | #standardSQL 2 | -- 3 | -- Retrieve sample-level information for BRCA1 variants. 4 | -- 5 | SELECT 6 | reference_name, 7 | start, 8 | `end`, 9 | reference_bases, 10 | ARRAY_TO_STRING(v.alternate_bases, ',') AS alts, 11 | quality, 12 | ARRAY_TO_STRING(v.filter, ',') AS filters, 13 | vt, 14 | ARRAY_TO_STRING(v.names, ',') AS names, 15 | call.call_set_name, 16 | call.phaseset, 17 | (SELECT STRING_AGG(CAST(gt AS STRING)) from UNNEST(call.genotype) gt) AS genotype, 18 | call.ds, 19 | (SELECT STRING_AGG(CAST(lh AS STRING)) from UNNEST(call.genotype_likelihood) lh) AS likelihoods 20 | FROM 21 | `genomics-public-data.1000_genomes.variants` v, v.call call 22 | WHERE 23 | reference_name IN ('17', 'chr17') 24 | AND start BETWEEN 41196311 AND 41277499 # per GRCh37 25 | AND call_set_name = 'HG00100' 26 | ORDER BY 27 | start, 28 | alts 29 | -------------------------------------------------------------------------------- /pgp/sql/schema-comparisons/missingness-klotho.sql: -------------------------------------------------------------------------------- 1 | # Missingness rate for Klotho variant rs9536314 in the "amazing 2 | # intelligence of PGP 
participants" data story. 3 | SELECT 4 | COUNT(sample_id) AS num_samples_called_for_position, 5 | SUM(called_count) AS num_alleles_called_for_position, 6 | 1 - (SUM(called_count)/(172*2)) AS missingness_rate 7 | FROM ( 8 | SELECT 9 | contig_name, 10 | start_pos, 11 | end_pos, 12 | END, 13 | reference_bases, 14 | GROUP_CONCAT(alternate_bases) WITHIN RECORD AS alt, 15 | call.callset_name AS sample_id, 16 | GROUP_CONCAT(STRING(call.genotype), 17 | '/') WITHIN call AS genotype, 18 | SUM(call.genotype >= 0) WITHIN RECORD as called_count, 19 | FROM 20 | [google.com:biggene:pgp.gvcf_variants] 21 | WHERE 22 | contig_name = '13' 23 | AND start_pos <= 33628138 24 | AND (end_pos = 33628139 25 | OR END >= 33628139) 26 | ) 27 | -------------------------------------------------------------------------------- /pgp/sql/gvcf_variants/klotho.sql: -------------------------------------------------------------------------------- 1 | # Sample level data for Klotho variant rs9536314 for use in the "amazing 2 | # intelligence of PGP participants" data story, specifically joining two 3 | # tables to compare the different encodings. 
4 | SELECT 5 | contig_name, 6 | start_pos, 7 | end_pos, 8 | END, 9 | ref, 10 | alt, 11 | sample_id, 12 | genotype 13 | FROM 14 | FLATTEN( 15 | SELECT 16 | contig_name, 17 | start_pos, 18 | end_pos, 19 | END, 20 | reference_bases AS ref, 21 | GROUP_CONCAT(alternate_bases) WITHIN RECORD AS alt, 22 | call.callset_name AS sample_id, 23 | GROUP_CONCAT(STRING(call.genotype), 24 | '/') WITHIN call AS genotype, 25 | FROM 26 | [google.com:biggene:pgp.gvcf_variants] 27 | WHERE 28 | contig_name = '13' 29 | AND start_pos <= 33628138 30 | AND (end_pos >= 33628139 31 | OR END >= 33628139) 32 | , 33 | call) 34 | ORDER BY 35 | sample_id 36 | -------------------------------------------------------------------------------- /1000genomes/sql/reproducing-vcfstats/variant-sample-counts-brca1.sql: -------------------------------------------------------------------------------- 1 | # Count the number of samples that have the BRCA1 variant. 2 | SELECT 3 | reference_name, 4 | start, 5 | reference_bases, 6 | SUM(first_allele > 0 7 | OR second_allele > 0) AS num_samples_with_variant 8 | FROM( 9 | SELECT 10 | reference_name, 11 | start, 12 | reference_bases, 13 | GROUP_CONCAT(alternate_bases) WITHIN RECORD AS alt, 14 | NTH(1, 15 | call.genotype) WITHIN call AS first_allele, 16 | NTH(2, 17 | call.genotype) WITHIN call AS second_allele, 18 | FROM 19 | [genomics-public-data:1000_genomes.variants] 20 | WHERE 21 | reference_name = '17' 22 | AND start BETWEEN 41196311 23 | AND 41277499 24 | AND vt ='SNP' 25 | ) 26 | GROUP BY 27 | reference_name, 28 | start, 29 | reference_bases, 30 | alt 31 | ORDER BY 32 | num_samples_with_variant, 33 | start 34 | -------------------------------------------------------------------------------- /pgp/sql/comparing-pgp-to-1000genomes/variant-counts-by-chromosome.sql: -------------------------------------------------------------------------------- 1 | # Count the number of variants per chromosome. 
2 | SELECT 3 | reference_name, 4 | cnt, 5 | dataset 6 | FROM ( 7 | SELECT 8 | reference_name, 9 | COUNT(reference_name) AS cnt, 10 | '1000Genomes' AS dataset 11 | FROM 12 | [genomics-public-data:1000_genomes.variants] 13 | GROUP BY 14 | reference_name 15 | ), 16 | ( 17 | SELECT 18 | # Normalize the reference_name to match that found in 1,000 Genomes. 19 | IF(reference_name = 'chrM', 'MT', SUBSTR(reference_name, 4)) AS reference_name, 20 | COUNT(reference_name) AS cnt, 21 | 'PGP' AS dataset 22 | FROM 23 | [google.com:biggene:pgp_20150205.genome_calls] 24 | # The source data was Complete Genomics which includes non-variant segments. 25 | OMIT RECORD IF EVERY(alternate_bases IS NULL) 26 | GROUP BY 27 | reference_name) 28 | ORDER BY 29 | reference_name, 30 | dataset 31 | -------------------------------------------------------------------------------- /1000genomes/sql/understanding-alternate-alleles/count-by-var-type-chrom-pos-ref-dups.sql: -------------------------------------------------------------------------------- 1 | # Count by variant type the number of alternate variants on chromosome 17 for the same 2 | # start and reference base 3 | SELECT 4 | vt, 5 | COUNT(vt) AS num_variant_type 6 | FROM 7 | [genomics-public-data:1000_genomes.variants] AS variants 8 | JOIN ( 9 | SELECT 10 | reference_name, 11 | start, 12 | reference_bases, 13 | COUNT(start) AS num_alternates, 14 | FROM 15 | [genomics-public-data:1000_genomes.variants] 16 | WHERE 17 | reference_name = '17' 18 | GROUP EACH BY 19 | reference_name, 20 | start, 21 | reference_bases 22 | HAVING 23 | num_alternates > 1) AS dups 24 | ON 25 | variants.reference_name = dups.reference_name 26 | AND variants.start = dups.start 27 | AND variants.reference_bases = dups.reference_bases 28 | WHERE 29 | variants.reference_name = '17' 30 | GROUP EACH BY 31 | vt 32 | ORDER BY 33 | vt 34 | -------------------------------------------------------------------------------- 
/pgp/sql/comparing-pgp-to-1000genomes/parsed-genotype-counts.sql: -------------------------------------------------------------------------------- 1 | # Count the number of sample genotypes, parsed into components. 2 | SELECT 3 | first_allele, 4 | second_allele, 5 | dataset, 6 | # Convert integer to float to avoid numeric overflow in R for integers. 7 | FLOAT(COUNT(1)) AS cnt 8 | FROM ( 9 | SELECT 10 | NTH(1, call.genotype) WITHIN call AS first_allele, 11 | NTH(2, call.genotype) WITHIN call AS second_allele, 12 | '1000Genomes' AS dataset 13 | FROM 14 | [genomics-public-data:1000_genomes.variants] 15 | OMIT RECORD IF reference_name IN ('X', 'Y', 'MT')), 16 | ( 17 | SELECT 18 | NTH(1, call.genotype) WITHIN call AS first_allele, 19 | NTH(2, call.genotype) WITHIN call AS second_allele, 20 | 'PGP' AS dataset 21 | FROM 22 | [google.com:biggene:pgp_20150205.genome_calls] 23 | OMIT RECORD IF reference_name IN ('chrX', 'chrY', 'chrM')) 24 | GROUP BY 25 | first_allele, 26 | second_allele, 27 | dataset 28 | -------------------------------------------------------------------------------- /1000genomes/sql/understanding-alternate-alleles/count-by-var-type-chrom-pos-ref-singles.sql: -------------------------------------------------------------------------------- 1 | # Count by variant type the number of variants on chromosome 17 unique for a 2 | # start and reference base 3 | SELECT 4 | vt, 5 | COUNT(vt) AS num_variant_type 6 | FROM 7 | [genomics-public-data:1000_genomes.variants] AS variants 8 | JOIN EACH ( 9 | SELECT 10 | reference_name, 11 | start, 12 | reference_bases, 13 | COUNT(start) AS num_alternates 14 | FROM 15 | [genomics-public-data:1000_genomes.variants] 16 | WHERE 17 | reference_name = '17' 18 | GROUP EACH BY 19 | reference_name, 20 | start, 21 | reference_bases 22 | HAVING 23 | num_alternates = 1) AS singles 24 | ON 25 | variants.reference_name = singles.reference_name 26 | AND variants.start = singles.start 27 | AND variants.reference_bases = 
singles.reference_bases 28 | WHERE 29 | variants.reference_name = '17' 30 | GROUP EACH BY 31 | vt 32 | ORDER BY 33 | vt 34 | -------------------------------------------------------------------------------- /1000genomes/sql/ti-tv-ratio.sql: -------------------------------------------------------------------------------- 1 | # Compute the Ti/Tv ratio of the 1,000 Genomes dataset. 2 | SELECT 3 | transitions, 4 | transversions, 5 | transitions/transversions AS titv 6 | FROM ( 7 | SELECT 8 | SUM(IF(mutation IN ('A->G', 9 | 'G->A', 10 | 'C->T', 11 | 'T->C'), 12 | INTEGER(num_snps), 13 | INTEGER(0))) AS transitions, 14 | SUM(IF(mutation IN ('A->C', 15 | 'C->A', 16 | 'G->T', 17 | 'T->G', 18 | 'A->T', 19 | 'T->A', 20 | 'C->G', 21 | 'G->C'), 22 | INTEGER(num_snps), 23 | INTEGER(0))) AS transversions, 24 | FROM ( 25 | SELECT 26 | CONCAT(reference_bases, 27 | CONCAT(STRING('->'), 28 | alternate_bases)) AS mutation, 29 | COUNT(alternate_bases) AS num_snps, 30 | FROM 31 | [genomics-public-data:1000_genomes.variants] 32 | WHERE 33 | vt = 'SNP' 34 | GROUP BY 35 | mutation 36 | ORDER BY 37 | mutation)) 38 | -------------------------------------------------------------------------------- /pgp/sql/comparing-pgp-to-1000genomes/taking-a-closer-look-at-variant-types.sql: -------------------------------------------------------------------------------- 1 | # Inner SELECT filters just the records in which we are interested. 2 | # Outer SELECT performs our analysis, in this case just a count of the genotypes 3 | # at a particular position in chromosome 3. 
4 | SELECT 5 | reference_name, 6 | start, 7 | reference_bases, 8 | alternate_bases, 9 | genotype, 10 | COUNT(genotype) AS number_of_individuals, 11 | FROM ( 12 | SELECT 13 | reference_name, 14 | start, 15 | reference_bases, 16 | GROUP_CONCAT(alternate_bases) WITHIN RECORD AS alternate_bases, 17 | call.callset_name, 18 | GROUP_CONCAT(STRING(call.genotype)) WITHIN call AS genotype, 19 | FROM 20 | [google.com:biggene:pgp_20150205.genome_calls] 21 | WHERE 22 | reference_name = 'chr3' 23 | AND start = 65440409) 24 | GROUP BY 25 | reference_name, 26 | start, 27 | reference_bases, 28 | alternate_bases, 29 | genotype 30 | ORDER BY 31 | alternate_bases, 32 | number_of_individuals DESC 33 | -------------------------------------------------------------------------------- /1000genomes/sql/heterozygous-homozygous-ratio.sql: -------------------------------------------------------------------------------- 1 | # Count the homozygous and heterozygous variants for each sample across the 2 | # entirety of the 1,000 Genomes dataset. 
3 | SELECT 4 | sample_id, 5 | # Both alleles match the reference (genotype 0). 6 | SUM(IF(0 = first_allele 7 | AND 0 = second_allele, 8 | 1, 9 | 0)) AS hom_RR_count, 10 | # Both alleles are the same alternate (genotype > 0). 11 | SUM(IF(first_allele = second_allele 12 | AND first_allele > 0, 13 | 1, 14 | 0)) AS hom_AA_count, 15 | # Alleles differ, or only one allele was called, and at least one is an alternate. 16 | SUM(IF((first_allele != second_allele 17 | OR second_allele IS NULL) 18 | AND (first_allele > 0 19 | OR second_allele > 0), 20 | 1, 21 | 0)) AS het_RA_count 22 | FROM ( 23 | SELECT 24 | reference_name, 25 | call.call_set_name AS sample_id, 26 | NTH(1, 27 | call.genotype) WITHIN call AS first_allele, 28 | NTH(2, 29 | call.genotype) WITHIN call AS second_allele, 30 | FROM 31 | [genomics-public-data:1000_genomes.variants] 32 | WHERE 33 | # Skip Y and mitochondrial records; 1,000 Genomes names the mitochondrial contig 'MT', not 'M'. 34 | reference_name != 'Y' AND reference_name != 'MT' 35 | ) 36 | GROUP BY 37 | sample_id 38 | ORDER BY 39 | sample_id 40 | -------------------------------------------------------------------------------- /1000genomes/sql/minimum-allelic-frequency-by-ethnicity.sql: -------------------------------------------------------------------------------- 1 | # Count the variation for each sample including phenotypic traits 2 | SELECT 3 | samples.call.call_set_name AS sample_id, 4 | gender, 5 | population, 6 | super_population, 7 | COUNT(samples.call.call_set_name) AS num_variants_for_sample, 8 | SUM(samples.af >= 0.05) AS common_variant, 9 | SUM(samples.af < 0.05 AND samples.af > 0.005) AS middle_variant, 10 | SUM(samples.af <= 0.005 AND samples.af > 0.001) AS rare_variant, 11 | SUM(samples.af <= 0.001) AS very_rare_variant, 12 | FROM 13 | FLATTEN(( 14 | SELECT 15 | af, 16 | vt, 17 | call.call_set_name, 18 | FROM 19 | [genomics-public-data:1000_genomes.variants] 20 | WHERE 21 | vt = 'SNP' 22 | OMIT call IF EVERY(call.genotype <= 0)), 23 | call) AS samples 24 | JOIN 25 | [genomics-public-data:1000_genomes.sample_info] p 26 | ON 27 | samples.call.call_set_name = p.sample 28 | GROUP BY 29 | sample_id, 30 | gender, 31 | population, 32 | super_population 33 | ORDER BY 34 | sample_id 35 | 
-------------------------------------------------------------------------------- /1000genomes/sql/variant-hotspots.sql: -------------------------------------------------------------------------------- 1 | # Summarize the variant counts by 10,000 start-wide windows in order to identify 2 | # variant hotspots within a chromosome for all samples. 3 | SELECT 4 | reference_name, 5 | window, 6 | window * 10000 AS window_start, 7 | ((window * 10000) + 9999) AS window_end, 8 | MIN(start) AS min_variant_start, 9 | MAX(start) AS max_variant_start, 10 | COUNT(sample_id) AS num_variants_in_window, 11 | FROM ( 12 | SELECT 13 | reference_name, 14 | start, 15 | INTEGER(FLOOR(start / 10000)) AS window, 16 | call.call_set_name AS sample_id, 17 | NTH(1, 18 | call.genotype) WITHIN call AS first_allele, 19 | NTH(2, 20 | call.genotype) WITHIN call AS second_allele, 21 | FROM 22 | [genomics-public-data:1000_genomes.variants] 23 | HAVING 24 | first_allele > 0 25 | OR (second_allele IS NOT NULL 26 | AND second_allele > 0)) 27 | GROUP BY 28 | reference_name, 29 | window, 30 | window_start, 31 | window_end, 32 | ORDER BY 33 | num_variants_in_window DESC, 34 | window 35 | -------------------------------------------------------------------------------- /1000genomes/sql/reproducing-vcfstats/ti-tv-ratio-brca1.sql: -------------------------------------------------------------------------------- 1 | # Compute the Ti/Tv ratio for BRCA1. 
2 | SELECT 3 | transitions, 4 | transversions, 5 | transitions/transversions AS titv 6 | FROM ( 7 | SELECT 8 | SUM(IF(mutation IN ('A->G', 9 | 'G->A', 10 | 'C->T', 11 | 'T->C'), 12 | INTEGER(num_snps), 13 | INTEGER(0))) AS transitions, 14 | SUM(IF(mutation IN ('A->C', 15 | 'C->A', 16 | 'G->T', 17 | 'T->G', 18 | 'A->T', 19 | 'T->A', 20 | 'C->G', 21 | 'G->C'), 22 | INTEGER(num_snps), 23 | INTEGER(0))) AS transversions, 24 | FROM ( 25 | SELECT 26 | CONCAT(reference_bases, 27 | CONCAT(STRING('->'), 28 | alternate_bases)) AS mutation, 29 | COUNT(alternate_bases) AS num_snps, 30 | FROM 31 | [genomics-public-data:1000_genomes.variants] 32 | WHERE 33 | reference_name = '17' 34 | AND start BETWEEN 41196311 35 | AND 41277499 36 | AND vt = 'SNP' 37 | GROUP BY 38 | mutation 39 | ORDER BY 40 | mutation)) -------------------------------------------------------------------------------- /platinumGenomes/sql/sample-snps-by-exonic-function.sql: -------------------------------------------------------------------------------- 1 | #standardSQL 2 | -- 3 | -- Count SNPs by functional impact for each sample in Platinum Genomes. 4 | -- 5 | WITH 6 | sample_variants AS ( 7 | SELECT 8 | REGEXP_EXTRACT(reference_name, r'chr(.+)') AS chr, 9 | start AS start, 10 | reference_bases, 11 | alt, 12 | call.call_set_name 13 | FROM 14 | `genomics-public-data.platinum_genomes.variants` v, 15 | v.call call, 16 | v.alternate_bases alt WITH OFFSET alt_offset 17 | WHERE 18 | -- Require that at least one genotype matches this alternate. 
19 | EXISTS (SELECT gt FROM UNNEST(call.genotype) gt WHERE gt = alt_offset+1) 20 | ) 21 | -- 22 | -- 23 | SELECT 24 | call_set_name, 25 | ExonicFunc, 26 | COUNT(ExonicFunc) AS variant_count 27 | FROM 28 | `silver-wall-555.TuteTable.hg19` AS annots 29 | JOIN sample_variants AS vars 30 | ON 31 | vars.chr = annots.Chr 32 | AND vars.start = annots.Start 33 | AND vars.reference_bases = annots.Ref 34 | AND vars.alt = annots.Alt 35 | WHERE 36 | ExonicFunc IS NOT NULL 37 | GROUP BY 38 | call_set_name, 39 | ExonicFunc 40 | ORDER BY 41 | call_set_name, 42 | ExonicFunc 43 | -------------------------------------------------------------------------------- /1000genomes/sql/sample-variant-hotspots.sql: -------------------------------------------------------------------------------- 1 | # Summarize the variant counts for a particular sample by 10,000 start-wide windows 2 | # in order to identify variant hotspots within a chromosome for a particular sample. 3 | SELECT 4 | reference_name, 5 | window, 6 | window * 10000 AS window_start, 7 | ((window * 10000) + 9999) AS window_end, 8 | MIN(start) AS min_variant_start, 9 | MAX(start) AS max_variant_start, 10 | sample_id, 11 | COUNT(sample_id) AS num_variants_in_window, 12 | FROM ( 13 | SELECT 14 | reference_name, 15 | start, 16 | INTEGER(FLOOR(start / 10000)) AS window, 17 | call.call_set_name AS sample_id, 18 | NTH(1, 19 | call.genotype) WITHIN call AS first_allele, 20 | NTH(2, 21 | call.genotype) WITHIN call AS second_allele, 22 | FROM 23 | [genomics-public-data:1000_genomes.variants] 24 | WHERE 25 | call.call_set_name = 'HG00096' 26 | HAVING 27 | first_allele > 0 28 | OR (second_allele IS NOT NULL 29 | AND second_allele > 0)) 30 | GROUP BY 31 | reference_name, 32 | window, 33 | window_start, 34 | window_end, 35 | sample_id 36 | ORDER BY 37 | num_variants_in_window DESC, 38 | window 39 | -------------------------------------------------------------------------------- 
/1000genomes/sql/minimum-allelic-frequency-by-ethnicity-no-sex-chromosomes.sql: -------------------------------------------------------------------------------- 1 | # Count the variation for each sample including phenotypic traits but excluding 2 | # sex chromosomes. 3 | SELECT 4 | samples.call.call_set_name AS sample_id, 5 | gender, 6 | population, 7 | super_population, 8 | COUNT(samples.call.call_set_name) AS num_variants_for_sample, 9 | SUM(samples.af >= 0.05) AS common_variant, 10 | SUM(samples.af < 0.05 AND samples.af > 0.005) AS middle_variant, 11 | SUM(samples.af <= 0.005 AND samples.af > 0.001) AS rare_variant, 12 | SUM(samples.af <= 0.001) AS very_rare_variant, 13 | FROM 14 | FLATTEN(( 15 | SELECT 16 | af, 17 | vt, 18 | call.call_set_name, 19 | FROM 20 | [genomics-public-data:1000_genomes.variants] 21 | WHERE 22 | vt = 'SNP' 23 | AND reference_name != 'X' 24 | AND reference_name != 'Y' 25 | OMIT call IF EVERY(call.genotype <= 0)), 26 | call) AS samples 27 | JOIN 28 | [genomics-public-data:1000_genomes.sample_info] p 29 | ON 30 | samples.call.call_set_name = p.sample 31 | GROUP BY 32 | sample_id, 33 | gender, 34 | population, 35 | super_population 36 | ORDER BY 37 | sample_id 38 | -------------------------------------------------------------------------------- /pgp/sql/gvcf_variants/allelic-frequency-comparison.sql: -------------------------------------------------------------------------------- 1 | # PGP vs. 1,000 Genomes allelic frequency comparison for BRCA1 variants. 
2 | SELECT 3 | contig_name, 4 | pgp.reference_bases AS reference_bases, 5 | start_pos, 6 | allele, 7 | pgp_freq, 8 | af, 9 | eur_af, 10 | afr_af, 11 | asn_af, 12 | amr_af 13 | # Flatten the repeated alternate_bases so each 1,000 Genomes alternate allele is its own row for the join. 14 | FROM ( 15 | FLATTEN(( 16 | SELECT 17 | reference_name, 18 | start, 19 | reference_bases, 20 | alternate_bases, 21 | AF, 22 | AFR_AF, 23 | AMR_AF, 24 | ASN_AF, 25 | EUR_AF 26 | FROM 27 | [genomics-public-data:1000_genomes.variants]), 28 | alternate_bases)) AS kg 29 | JOIN 30 | EACH ( 31 | SELECT 32 | contig_name, 33 | reference_bases, 34 | start_pos, 35 | allele, 36 | freq AS pgp_freq 37 | FROM 38 | [google.com:biggene:pgp_analysis_results.gvcf_variants_allelic_frequency] 39 | ) AS pgp 40 | ON 41 | pgp.contig_name = kg.reference_name 42 | AND pgp.start_pos = kg.start 43 | AND pgp.reference_bases = kg.reference_bases 44 | AND pgp.allele = kg.alternate_bases 45 | # NOTE(review): other BRCA1 queries against this 1,000 Genomes table filter start BETWEEN 41196311 AND 41277499; confirm this window and the start_pos = start join are not off by one. 46 | WHERE 47 | kg.reference_name = '17' 48 | AND kg.start BETWEEN 41196312 49 | AND 41277500 50 | -------------------------------------------------------------------------------- /pgp/sql/cgi_variants/allelic-frequency-comparison.sql: -------------------------------------------------------------------------------- 1 | # PGP vs. 1,000 Genomes allelic frequency comparison for BRCA1 variants. 
2 | SELECT 3 | chromosome, 4 | reference, 5 | locusBegin, 6 | locusEnd, 7 | allele, 8 | pgp_freq, 9 | af, 10 | eur_af, 11 | afr_af, 12 | asn_af, 13 | amr_af 14 | FROM ( 15 | FLATTEN(( 16 | SELECT 17 | reference_name, 18 | start, 19 | reference_bases, 20 | alternate_bases, 21 | AF, 22 | AFR_AF, 23 | AMR_AF, 24 | ASN_AF, 25 | EUR_AF 26 | FROM 27 | [genomics-public-data:1000_genomes.variants]), 28 | alternate_bases)) AS kg 29 | JOIN 30 | EACH ( 31 | SELECT 32 | chromosome, 33 | # Strip the 'chr' prefix to match 1,000 Genomes contig names; the pattern captures numeric chromosomes only. 34 | REGEXP_EXTRACT(chromosome, 35 | r'chr(\d+)') AS contig, 36 | reference, 37 | # NOTE(review): position is locusBegin shifted by one and is joined to kg.start below; confirm both use the same coordinate base. 38 | locusBegin + 1 AS position, 39 | locusBegin, 40 | locusEnd, 41 | allele, 42 | freq AS pgp_freq 43 | FROM 44 | [google.com:biggene:pgp_analysis_results.cgi_variants_allelic_frequency] 45 | ) AS pgp 46 | ON 47 | pgp.contig = kg.reference_name 48 | AND pgp.position = kg.start 49 | AND pgp.reference = kg.reference_bases 50 | AND pgp.allele = kg.alternate_bases 51 | # NOTE(review): other BRCA1 queries against this 1,000 Genomes table filter start BETWEEN 41196311 AND 41277499; confirm this window is not off by one. 52 | WHERE 53 | kg.reference_name = '17' 54 | AND kg.start BETWEEN 41196312 55 | AND 41277500 56 | -------------------------------------------------------------------------------- /pgp/sql/comparing-pgp-to-1000genomes/variant-counts-by-type-and-chromosome.sql: -------------------------------------------------------------------------------- 1 | # Count the number of variants by variant type and chromosome. 2 | SELECT 3 | reference_name, 4 | vt, 5 | cnt, 6 | dataset 7 | FROM ( 8 | SELECT 9 | # Normalize the reference_name to match that found in 1,000 Genomes. 10 | IF(reference_name = 'chrM', 'MT', SUBSTR(reference_name, 4)) AS reference_name, 11 | IF(ref_len = 1 AND alt_len = 1, "SNP", "INDEL") AS vt, 12 | COUNT(reference_name) AS cnt, 13 | 'PGP' AS dataset 14 | FROM ( 15 | SELECT 16 | reference_name, 17 | svtype, 18 | LENGTH(reference_bases) AS ref_len, 19 | MAX(LENGTH(alternate_bases)) WITHIN RECORD AS alt_len, 20 | FROM 21 | [google.com:biggene:pgp_20150205.genome_calls] 22 | # The source data was Complete Genomics which includes non-variant segments. 
23 | OMIT RECORD IF EVERY(alternate_bases IS NULL) 24 | ) 25 | GROUP BY 26 | reference_name, 27 | vt 28 | ), 29 | ( 30 | SELECT 31 | reference_name, 32 | IF(vt IS NULL, "not specified", vt) AS vt, 33 | COUNT(reference_name) AS cnt, 34 | '1000Genomes' AS dataset 35 | FROM 36 | [genomics-public-data:1000_genomes.variants] 37 | GROUP BY 38 | reference_name, 39 | vt 40 | ), 41 | ORDER BY 42 | reference_name, 43 | dataset, 44 | vt 45 | -------------------------------------------------------------------------------- /platinumGenomes/sql/cohort-rare-pathenogenic-snps.sql: -------------------------------------------------------------------------------- 1 | #standardSQL 2 | -- 3 | -- Return all SNPs from the Platinum Genomes cohort that are: 4 | -- annotated as 'pathogenic' in ClinVar 5 | -- with observed population frequency less than 1% 6 | -- 7 | WITH 8 | cohort_variants AS ( 9 | SELECT 10 | REGEXP_EXTRACT(reference_name, r'chr(.+)') AS chr, 11 | start AS start, 12 | reference_bases, 13 | alt 14 | FROM 15 | `genomics-public-data.platinum_genomes.variants` v, 16 | v.alternate_bases alt WITH OFFSET alt_offset 17 | WHERE 18 | -- Require that at least one sample in the cohort has this variant. 
19 | EXISTS(SELECT gt FROM UNNEST(v.call) call, UNNEST(call.genotype) gt WHERE gt = alt_offset+1) 20 | ) 21 | -- 22 | -- 23 | SELECT 24 | annots.Chr, 25 | annots.Start, 26 | Ref, 27 | annots.Alt, 28 | Func, 29 | Gene, 30 | PopFreqMax, 31 | ExonicFunc, 32 | ClinVar_SIG, 33 | ClinVar_DIS 34 | FROM 35 | `silver-wall-555.TuteTable.hg19` AS annots 36 | JOIN 37 | cohort_variants AS vars 38 | ON 39 | vars.chr = annots.Chr 40 | AND vars.start = annots.Start 41 | AND vars.reference_bases = annots.Ref 42 | AND vars.alt = annots.Alt 43 | WHERE 44 | PopFreqMax <= 0.01 45 | AND ClinVar_SIG LIKE '%pathogenic%' 46 | AND NOT CLinVar_SIG LIKE '%non-pathogenic%' 47 | ORDER BY 48 | Chr, 49 | Start, 50 | Ref, 51 | Alt 52 | -------------------------------------------------------------------------------- /1000genomes/sql/understanding-alternate-alleles/sample-chrom-pos-ref-dups.sql: -------------------------------------------------------------------------------- 1 | # Get sample alleles for some specific variants. 2 | # TODO(deflaux): update this to a user-defined function to generalize 3 | # across more than two alternates. 
For more info, see 4 | # https://www.youtube.com/watch?v=GrD7ymUPt3M#t=1377 5 | SELECT 6 | reference_name, 7 | start, 8 | alt, 9 | reference_bases, 10 | sample_id, 11 | CASE 12 | WHEN 0 = first_allele THEN reference_bases 13 | WHEN 1 = first_allele THEN alt1 14 | WHEN 2 = first_allele THEN alt2 END AS first_allele, 15 | CASE 16 | WHEN 0 = second_allele THEN reference_bases 17 | WHEN 1 = second_allele THEN alt1 18 | WHEN 2 = second_allele THEN alt2 END AS second_allele, 19 | FROM( 20 | SELECT 21 | reference_name, 22 | start, 23 | GROUP_CONCAT(alternate_bases) WITHIN RECORD AS alt, 24 | reference_bases, 25 | call.call_set_name AS sample_id, 26 | NTH(1, 27 | alternate_bases) WITHIN RECORD AS alt1, 28 | NTH(2, 29 | alternate_bases) WITHIN RECORD AS alt2, 30 | NTH(1, call.genotype) WITHIN call AS first_allele, 31 | NTH(2, call.genotype) WITHIN call AS second_allele, 32 | FROM 33 | [genomics-public-data:1000_genomes.variants] 34 | WHERE 35 | reference_name = '17' 36 | AND start = 48515942 37 | HAVING 38 | sample_id = 'HG00100' OR sample_id = 'HG00101') 39 | ORDER BY 40 | alt, 41 | sample_id 42 | -------------------------------------------------------------------------------- /platinumGenomes/sql/sample-rare-pathenogenic-snps.sql: -------------------------------------------------------------------------------- 1 | #standardSQL 2 | -- 3 | -- Return SNPs for sample NA12878 that are: 4 | -- annotated as 'pathogenic' in ClinVar 5 | -- with observed population frequency less than 1% 6 | -- 7 | WITH 8 | sample_variants AS ( 9 | SELECT 10 | REGEXP_EXTRACT(reference_name, r'chr(.+)') AS chr, 11 | start AS start, 12 | reference_bases, 13 | alt, 14 | call.call_set_name 15 | FROM 16 | `genomics-public-data.platinum_genomes.variants` v, 17 | v.call call, 18 | v.alternate_bases alt WITH OFFSET alt_offset 19 | WHERE 20 | call_set_name = 'NA12878' 21 | -- Require that at least one genotype matches this alternate. 
22 | AND EXISTS (SELECT gt FROM UNNEST(call.genotype) gt WHERE gt = alt_offset+1) ) 23 | -- 24 | -- 25 | SELECT 26 | call_set_name, 27 | annots.Chr, 28 | annots.Start, 29 | Ref, 30 | annots.Alt, 31 | Func, 32 | Gene, 33 | PopFreqMax, 34 | ExonicFunc, 35 | ClinVar_SIG, 36 | ClinVar_DIS 37 | FROM 38 | `silver-wall-555.TuteTable.hg19` AS annots 39 | JOIN 40 | sample_variants AS vars 41 | ON 42 | vars.chr = annots.Chr 43 | AND vars.start = annots.Start 44 | AND vars.reference_bases = annots.Ref 45 | AND vars.alt = annots.Alt 46 | WHERE 47 | PopFreqMax <= 0.01 48 | AND ClinVar_SIG LIKE '%pathogenic%' 49 | AND NOT CLinVar_SIG LIKE '%non-pathogenic%' 50 | ORDER BY 51 | Chr, 52 | Start, 53 | Ref, 54 | Alt, 55 | call_set_name 56 | -------------------------------------------------------------------------------- /pgp/sql/schema-comparisons/call-counts.sql: -------------------------------------------------------------------------------- 1 | # Call counts for the PGP data encoded four different ways. 
2 | SELECT 3 | chromosome, 4 | num_records, 5 | num_variants, 6 | dataset 7 | FROM 8 | ( 9 | SELECT 10 | SUBSTR(chromosome, 11 | 4) AS chromosome, 12 | COUNT(1) AS num_records, 13 | SUM(reference != '=') AS num_variants, 14 | 'cgi_variants' AS dataset 15 | FROM 16 | [google.com:biggene:pgp.cgi_variants] 17 | # Skip the genomes we were unable to convert to VCF/gVCF 18 | OMIT RECORD IF 19 | sample_id = 'huEDF7DA' OR sample_id = 'hu34D5B9' 20 | GROUP BY 21 | chromosome), 22 | ( 23 | SELECT 24 | contig_name AS chromosome, 25 | COUNT(1) AS num_records, 26 | SUM(reference_bases != 'N') AS num_variants, 27 | 'variants' AS dataset 28 | FROM 29 | [google.com:biggene:pgp.variants] 30 | GROUP BY 31 | chromosome), 32 | ( 33 | SELECT 34 | contig_name AS chromosome, 35 | COUNT(1) AS num_records, 36 | SUM(reference_bases != 'N') AS num_variants, 37 | 'gvcf_variants' AS dataset 38 | FROM 39 | [google.com:biggene:pgp.gvcf_variants] 40 | GROUP BY 41 | chromosome), 42 | ( 43 | SELECT 44 | contig_name AS chromosome, 45 | COUNT(1) AS num_records, 46 | SUM(reference_bases != 'N') AS num_variants, 47 | 'gvcf_variants_expanded' AS dataset 48 | FROM 49 | [google.com:biggene:test.pgp_gvcf_variants_expanded2] 50 | GROUP BY 51 | chromosome) 52 | ORDER BY 53 | chromosome, 54 | dataset 55 | -------------------------------------------------------------------------------- /pgp/sql/cgi_variants/allele-count.sql: -------------------------------------------------------------------------------- 1 | # Count the occurrence of each variant allele across all participants in the 2 | # dataset. This returns a large result so be sure to materialize it into a 3 | # table for subsequent use. 
4 | SELECT 5 | chromosome, 6 | reference, 7 | # This 'bin' (locusBegin in 5,000-position windows) can be used in subsequent interval JOINs 8 | INTEGER(FLOOR(locusBegin / 5000)) AS bin, 9 | locusBegin, 10 | locusEnd, 11 | allele, 12 | SUM(cnt) AS alternate_allele_count, 13 | FROM ( 14 | SELECT 15 | chromosome, 16 | reference, 17 | locusBegin, 18 | locusEnd, 19 | allele1Seq AS allele, 20 | COUNT(1) AS cnt 21 | FROM 22 | [google.com:biggene:pgp.cgi_variants] 23 | WHERE 24 | (reference != '=' OR reference IS NULL) 25 | AND allele1Seq != '?' 26 | AND (reference != allele1Seq OR reference IS NULL) 27 | GROUP EACH BY 28 | chromosome, 29 | reference, 30 | locusBegin, 31 | locusEnd, 32 | allele), 33 | ( 34 | SELECT 35 | chromosome, 36 | reference, 37 | locusBegin, 38 | locusEnd, 39 | allele2Seq AS allele, 40 | COUNT(1) AS cnt 41 | FROM 42 | [google.com:biggene:pgp.cgi_variants] 43 | WHERE 44 | (reference != '=' OR reference IS NULL) 45 | AND allele2Seq != '?' 46 | AND (reference != allele2Seq OR reference IS NULL) 47 | GROUP EACH BY 48 | chromosome, 49 | reference, 50 | locusBegin, 51 | locusEnd, 52 | allele) 53 | GROUP EACH BY 54 | chromosome, 55 | reference, 56 | bin, 57 | locusBegin, 58 | locusEnd, 59 | allele 60 | -------------------------------------------------------------------------------- /1000genomes/sql/allelic-frequency.sql: -------------------------------------------------------------------------------- 1 | # The following query computes the allelic frequency for BRCA1 variants in the 2 | # 1,000 Genomes dataset. 
3 | SELECT 4 | reference_name, 5 | start, 6 | reference_bases, 7 | alternate_bases, 8 | alt, 9 | SUM(ref_count)+SUM(alt_count) AS num_sample_alleles, 10 | SUM(ref_count) AS ref_cnt, 11 | SUM(alt_count) AS alt_cnt, 12 | SUM(ref_count)/(SUM(ref_count)+SUM(alt_count)) AS ref_freq, 13 | SUM(alt_count)/(SUM(ref_count)+SUM(alt_count)) AS alt_freq, 14 | FROM ( 15 | SELECT 16 | reference_name, 17 | start, 18 | reference_bases, 19 | alternate_bases, 20 | alt, 21 | SUM(INTEGER(0 = call.genotype)) WITHIN RECORD AS ref_count, 22 | SUM(INTEGER(alt = call.genotype)) WITHIN RECORD AS alt_count 23 | FROM 24 | FLATTEN( 25 | FLATTEN(( 26 | SELECT 27 | reference_name, 28 | start, 29 | reference_bases, 30 | alternate_bases, 31 | POSITION(alternate_bases) AS alt, 32 | call.call_set_name, 33 | call.genotype, 34 | FROM 35 | [genomics-public-data:1000_genomes.variants] 36 | WHERE 37 | reference_name = '17' 38 | AND start BETWEEN 41196311 39 | AND 41277499 40 | AND vt='SNP' 41 | ), 42 | call), 43 | alt)) 44 | GROUP BY 45 | reference_name, 46 | start, 47 | reference_bases, 48 | alternate_bases, 49 | alt 50 | ORDER BY 51 | reference_name, 52 | start, 53 | reference_bases, 54 | alt, 55 | alternate_bases 56 | -------------------------------------------------------------------------------- /pgp/sql/cgi_variants/ti-tv-ratio.sql: -------------------------------------------------------------------------------- 1 | # Compute the Ti/Tv ratio for each participant in the PGP dataset. 
2 | SELECT 3 | sample_id, 4 | transitions, 5 | transversions, 6 | transitions/transversions AS titv 7 | FROM ( 8 | SELECT 9 | sample_id, 10 | SUM(IF(mutation1 IN ('A->G', 11 | 'G->A', 12 | 'C->T', 13 | 'T->C'), 14 | 1, 15 | 0) + IF(mutation2 IN ('A->G', 16 | 'G->A', 17 | 'C->T', 18 | 'T->C'), 19 | 1, 20 | 0)) AS transitions, 21 | SUM(IF(mutation1 IN ('A->C', 22 | 'C->A', 23 | 'G->T', 24 | 'T->G', 25 | 'A->T', 26 | 'T->A', 27 | 'C->G', 28 | 'G->C'), 29 | 1, 30 | 0) + IF(mutation2 IN ('A->C', 31 | 'C->A', 32 | 'G->T', 33 | 'T->G', 34 | 'A->T', 35 | 'T->A', 36 | 'C->G', 37 | 'G->C'), 38 | 1, 39 | 0)) AS transversions, 40 | FROM ( 41 | SELECT 42 | sample_id, 43 | CONCAT(reference, 44 | CONCAT(STRING('->'), 45 | allele1Seq)) AS mutation1, 46 | CONCAT(reference, 47 | CONCAT(STRING('->'), 48 | allele2Seq)) AS mutation2, 49 | FROM 50 | [google.com:biggene:pgp.cgi_variants] 51 | WHERE 52 | # WHERE varType = 'snp' not correct since a row with both an indel 53 | # and a snp will be varType 'complex' 54 | reference != '=' 55 | AND LENGTH(reference) = 1 56 | ) 57 | GROUP BY 58 | sample_id) 59 | ORDER BY 60 | titv DESC 61 | -------------------------------------------------------------------------------- /pgp/sql/schema-comparisons/klotho-gvcf-expanded.sql: -------------------------------------------------------------------------------- 1 | # Sample level data for Klotho variant rs9536314 for use in the "amazing 2 | # intelligence of PGP participants" data story, specifically joining two 3 | # tables to compare the different encodings. 
4 | SELECT 5 | cgi.sample_id, 6 | chromosome, 7 | locusBegin, 8 | locusEnd, 9 | reference, 10 | allele1Seq, 11 | allele2Seq, 12 | contig_name, 13 | start_pos, 14 | end_pos, 15 | END, 16 | ref, 17 | alt, 18 | gvcf.sample_id, 19 | genotype 20 | FROM 21 | [google.com:biggene:pgp.cgi_variants] AS cgi 22 | left OUTER JOIN 23 | ( 24 | SELECT 25 | contig_name, 26 | start_pos, 27 | end_pos, 28 | END, 29 | ref, 30 | alt, 31 | sample_id, 32 | genotype 33 | FROM 34 | FLATTEN( 35 | SELECT 36 | contig_name, 37 | start_pos, 38 | end_pos, 39 | END, 40 | reference_bases AS ref, 41 | GROUP_CONCAT(alternate_bases) WITHIN RECORD AS alt, 42 | call.callset_name AS sample_id, 43 | GROUP_CONCAT(STRING(call.genotype), 44 | '/') WITHIN call AS genotype, 45 | FROM 46 | [google.com:biggene:test.pgp_gvcf_variants_expanded2] 47 | WHERE 48 | contig_name = '13' 49 | AND start_pos == 33628138 50 | , 51 | call)) AS gvcf 52 | ON 53 | cgi.sample_id = gvcf.sample_id 54 | WHERE 55 | chromosome = "chr13" 56 | AND locusBegin <= 33628137 57 | AND locusEnd >= 33628138 58 | # Skip the genomes we were unable to convert to VCF/gVCF 59 | OMIT RECORD IF 60 | cgi.sample_id = 'huEDF7DA' OR cgi.sample_id = 'hu34D5B9' 61 | ORDER BY 62 | cgi.sample_id 63 | -------------------------------------------------------------------------------- /pgp/sql/schema-comparisons/klotho-gvcf.sql: -------------------------------------------------------------------------------- 1 | # Sample level data for Klotho variant rs9536314 for use in the "amazing 2 | # intelligence of PGP participants" data story, specifically joining two 3 | # tables to compare the different encodings. 
4 | SELECT 5 | cgi.sample_id, 6 | chromosome, 7 | locusBegin, 8 | locusEnd, 9 | reference, 10 | allele1Seq, 11 | allele2Seq, 12 | contig_name, 13 | start_pos, 14 | end_pos, 15 | END, 16 | ref, 17 | alt, 18 | gvcf.sample_id, 19 | genotype 20 | FROM 21 | [google.com:biggene:pgp.cgi_variants] AS cgi 22 | left OUTER JOIN 23 | ( 24 | SELECT 25 | contig_name, 26 | start_pos, 27 | end_pos, 28 | END, 29 | ref, 30 | alt, 31 | sample_id, 32 | genotype 33 | FROM 34 | FLATTEN( 35 | SELECT 36 | contig_name, 37 | start_pos, 38 | end_pos, 39 | END, 40 | reference_bases AS ref, 41 | GROUP_CONCAT(alternate_bases) WITHIN RECORD AS alt, 42 | call.callset_name AS sample_id, 43 | GROUP_CONCAT(STRING(call.genotype), 44 | '/') WITHIN call AS genotype, 45 | FROM 46 | [google.com:biggene:pgp.gvcf_variants] 47 | WHERE 48 | contig_name = '13' 49 | AND start_pos <= 33628138 50 | AND (end_pos >= 33628139 51 | OR END >= 33628139) 52 | , 53 | call)) AS gvcf 54 | ON 55 | cgi.sample_id = gvcf.sample_id 56 | WHERE 57 | chromosome = "chr13" 58 | AND locusBegin <= 33628137 59 | AND locusEnd >= 33628138 60 | # Skip the genomes we were unable to convert to VCF/gVCF 61 | OMIT RECORD IF 62 | cgi.sample_id = 'huEDF7DA' OR cgi.sample_id = 'hu34D5B9' 63 | ORDER BY 64 | cgi.sample_id 65 | -------------------------------------------------------------------------------- /pgp/sql/gvcf_variants/allelic-frequency.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | 3 | # Copyright 2014 Google Inc. All Rights Reserved. 4 | # 5 | # Licensed under the Apache License, Version 2.0 (the "License"); 6 | # you may not use this file except in compliance with the License. 
7 | # You may obtain a copy of the License at 8 | # 9 | # http://www.apache.org/licenses/LICENSE-2.0 10 | # 11 | # Unless required by applicable law or agreed to in writing, software 12 | # distributed under the License is distributed on an "AS IS" BASIS, 13 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | # See the License for the specific language governing permissions and 15 | # limitations under the License. 16 | 17 | """Run allelic frequency upon one chromosome at a time, appending to the result table.""" 18 | 19 | import string, subprocess 20 | 21 | chromosomes = range(1,23) 22 | chromosomes.extend(['X', 'Y', 'M']) 23 | 24 | with open ("./allelic-frequency-chr1.sql", "r") as myfile: 25 | query=myfile.read().replace('"', '\\"') 26 | 27 | for chrom in chromosomes: 28 | q = string.replace(query, "WHERE contig_name = '1'", "WHERE contig_name = '%s'" % chrom) 29 | cmd = [ 30 | 'bq', 31 | '--project_id', 'google.com:biggene', 32 | '--nosync', 33 | 'query', 34 | '--allow_large_results', 35 | '--append_table', 36 | '--destination_table', 'pgp_analysis_results.gvcf_variants_allelic_frequency', 37 | '--batch', '"' + q + '"'] 38 | print " ".join(cmd) 39 | print subprocess.check_output(" ".join(cmd), 40 | stderr=subprocess.STDOUT, 41 | shell=True) 42 | 43 | -------------------------------------------------------------------------------- /pgp/sql/cgi_variants/allelic-frequency.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | 3 | # Copyright 2014 Google Inc. All Rights Reserved. 4 | # 5 | # Licensed under the Apache License, Version 2.0 (the "License"); 6 | # you may not use this file except in compliance with the License. 
7 | # You may obtain a copy of the License at 8 | # 9 | # http://www.apache.org/licenses/LICENSE-2.0 10 | # 11 | # Unless required by applicable law or agreed to in writing, software 12 | # distributed under the License is distributed on an "AS IS" BASIS, 13 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | # See the License for the specific language governing permissions and 15 | # limitations under the License. 16 | 17 | """Run allelic frequency upon one chromosome at a time, appending to the result table.""" 18 | 19 | import string, subprocess 20 | 21 | chromosomes = range(1,23) 22 | chromosomes.extend(['X', 'Y', 'M']) 23 | 24 | with open ("./allelic-frequency-chr1.sql", "r") as myfile: 25 | query=myfile.read().replace('"', '\\"') 26 | 27 | for chrom in chromosomes: 28 | q = string.replace(query, "WHERE chromosome = 'chr1'", "WHERE chromosome = 'chr%s'" % chrom) 29 | cmd = [ 30 | 'bq', 31 | '--project_id', 'google.com:biggene', 32 | '--nosync', 33 | 'query', 34 | '--allow_large_results', 35 | '--append_table', 36 | '--destination_table', 'pgp_analysis_results.cgi_variants_allelic_frequency', 37 | '--batch', '"' + q + '"'] 38 | print " ".join(cmd) 39 | print subprocess.check_output(" ".join(cmd), 40 | stderr=subprocess.STDOUT, 41 | shell=True) 42 | 43 | -------------------------------------------------------------------------------- /1000genomes/data-stories/reproducing-vcfstats/vcfstats-output/stats.shared: -------------------------------------------------------------------------------- 1 | #Shared SNPs Frequency 2 | 0 10 3 | 1 243 4 | 2 103 5 | 3 45 6 | 4 40 7 | 5 27 8 | 6 21 9 | 7 19 10 | 8 24 11 | 9 10 12 | 10 14 13 | 11 17 14 | 12 4 15 | 13 3 16 | 14 8 17 | 15 5 18 | 16 2 19 | 17 5 20 | 18 4 21 | 19 4 22 | 20 3 23 | 21 2 24 | 22 4 25 | 23 1 26 | 24 8 27 | 25 11 28 | 26 5 29 | 27 4 30 | 28 3 31 | 29 4 32 | 30 2 33 | 31 7 34 | 32 7 35 | 33 5 36 | 34 3 37 | 35 1 38 | 39 1 39 | 42 7 40 | 43 2 41 | 44 1 42 | 45 1 43 | 46 1 44 | 47 
1 45 | 48 1 46 | 49 1 47 | 50 2 48 | 51 1 49 | 56 1 50 | 57 2 51 | 60 1 52 | 61 2 53 | 62 2 54 | 73 4 55 | 76 1 56 | 80 3 57 | 81 4 58 | 82 1 59 | 94 1 60 | 95 1 61 | 105 1 62 | 107 1 63 | 121 1 64 | 124 1 65 | 125 1 66 | 141 1 67 | 150 1 68 | 166 1 69 | 168 2 70 | 169 1 71 | 170 1 72 | 174 1 73 | 176 1 74 | 195 1 75 | 197 1 76 | 198 2 77 | 200 2 78 | 207 1 79 | 218 1 80 | 222 1 81 | 252 1 82 | 259 1 83 | 268 1 84 | 269 1 85 | 314 1 86 | 409 1 87 | 415 1 88 | 427 1 89 | 431 1 90 | 464 1 91 | 481 1 92 | 492 1 93 | 498 1 94 | 522 6 95 | 523 1 96 | 527 1 97 | 528 1 98 | 532 2 99 | 534 1 100 | 535 2 101 | 536 6 102 | 537 5 103 | 538 7 104 | 539 3 105 | 540 2 106 | 541 1 107 | 549 1 108 | 555 1 109 | 557 1 110 | 561 1 111 | 562 7 112 | 563 5 113 | 566 2 114 | 569 1 115 | 570 2 116 | 573 1 117 | 574 1 118 | 576 1 119 | 577 2 120 | 578 13 121 | 579 8 122 | 580 4 123 | 581 2 124 | 582 6 125 | 584 1 126 | 611 1 127 | 613 1 128 | 685 1 129 | 698 1 130 | 703 1 131 | 704 1 132 | 718 1 133 | 720 4 134 | 721 1 135 | 728 2 136 | 738 2 137 | 740 1 138 | 741 1 139 | 742 1 140 | 743 2 141 | 745 1 142 | 783 1 143 | 1088 1 144 | 1091 1 145 | -------------------------------------------------------------------------------- /1000genomes/sql/reproducing-allelic-frequencies/reproducing-allelic-frequency.sql: -------------------------------------------------------------------------------- 1 | # The following query computes the allelic frequency for BRCA1 variants in the 2 | # 1,000 Genomes dataset and also includes the pre-computed value from the dataset. 
3 | SELECT 4 | reference_name, 5 | start, 6 | reference_bases, 7 | alternate_bases, 8 | SUM(ref_count)+SUM(alt_count) AS num_sample_alleles, 9 | SUM(ref_count) AS ref_cnt, 10 | SUM(alt_count) AS alt_cnt, 11 | SUM(ref_count)/(SUM(ref_count)+SUM(alt_count)) AS ref_freq, 12 | SUM(alt_count)/(SUM(ref_count)+SUM(alt_count)) AS alt_freq, 13 | alt_freq_from_1KG 14 | FROM ( 15 | SELECT 16 | reference_name, 17 | start, 18 | reference_bases, 19 | alternate_bases, 20 | alt, 21 | SUM(INTEGER(0 = call.genotype)) WITHIN RECORD AS ref_count, # genotype values equal to 0 22 | SUM(INTEGER(alt = call.genotype)) WITHIN RECORD AS alt_count, # genotype values equal to this alternate's position 23 | alt_freq_from_1KG 24 | FROM 25 | FLATTEN( 26 | FLATTEN(( 27 | SELECT 28 | reference_name, 29 | start, 30 | reference_bases, 31 | alternate_bases, 32 | POSITION(alternate_bases) AS alt, # position of each value within the repeated alternate_bases field 33 | af AS alt_freq_from_1KG, # value already stored in the dataset, reported next to the computed alt_freq 34 | call.call_set_name, 35 | call.genotype, 36 | FROM 37 | [genomics-public-data:1000_genomes.variants] 38 | WHERE 39 | reference_name = '17' 40 | AND start BETWEEN 41196311 41 | AND 41277499 42 | AND vt='SNP' 43 | ), 44 | call), 45 | alt)) 46 | GROUP BY 47 | reference_name, 48 | start, 49 | reference_bases, 50 | alternate_bases, 51 | alt, 52 | alt_freq_from_1KG 53 | ORDER BY 54 | reference_name, 55 | start, 56 | reference_bases, 57 | alt, 58 | alternate_bases 59 | -------------------------------------------------------------------------------- /1000genomes/sql/allelic-frequency-by-gender.sql: -------------------------------------------------------------------------------- 1 | # The following query computes the allelic frequency for BRCA1 variants in the 2 | # 1,000 Genomes dataset further classified by gender from the phenotypic data.
3 | SELECT 4 | reference_name, 5 | start, 6 | gender, 7 | reference_bases, 8 | alternate_bases, 9 | alt, # position of the alternate allele, computed by POSITION() in the innermost query 10 | SUM(ref_count)+SUM(alt_count) AS num_sample_alleles, 11 | SUM(ref_count) AS ref_cnt, 12 | SUM(alt_count) AS alt_cnt, 13 | SUM(ref_count)/(SUM(ref_count)+SUM(alt_count)) AS ref_freq, 14 | SUM(alt_count)/(SUM(ref_count)+SUM(alt_count)) AS alt_freq, 15 | FROM ( 16 | SELECT 17 | reference_name, 18 | start, 19 | gender, 20 | reference_bases, 21 | alternate_bases, 22 | alt, 23 | SUM(INTEGER(0 = call.genotype)) WITHIN RECORD AS ref_count, # genotype values equal to 0 24 | SUM(INTEGER(alt = call.genotype)) WITHIN RECORD AS alt_count # genotype values equal to this alternate's position 25 | FROM 26 | FLATTEN(FLATTEN(( 27 | SELECT 28 | reference_name, 29 | start, 30 | reference_bases, 31 | alternate_bases, 32 | POSITION(alternate_bases) AS alt, 33 | call.call_set_name, 34 | call.genotype, 35 | FROM 36 | [genomics-public-data:1000_genomes.variants] 37 | WHERE 38 | reference_name = '17' 39 | AND start BETWEEN 41196311 40 | AND 41277499 41 | AND vt='SNP' 42 | ), 43 | call), 44 | alt) AS g 45 | JOIN 46 | [genomics-public-data:1000_genomes.sample_info] p 47 | ON 48 | g.call.call_set_name = p.sample) 49 | GROUP BY 50 | reference_name, 51 | start, 52 | gender, 53 | reference_bases, 54 | alternate_bases, 55 | alt 56 | ORDER BY 57 | reference_name, 58 | start, 59 | gender, 60 | reference_bases, 61 | alt, 62 | alternate_bases 63 | -------------------------------------------------------------------------------- /1000genomes/sql/allelic-frequency-by-ethnicity.sql: -------------------------------------------------------------------------------- 1 | # The following query computes the allelic frequency for BRCA1 variants in the 2 | # 1,000 Genomes dataset further classified by ethnicity from the phenotypic data.
3 | SELECT 4 | reference_name, 5 | start, 6 | population, 7 | reference_bases, 8 | alternate_bases, 9 | alt, # position of the alternate allele, computed by POSITION() in the innermost query 10 | SUM(ref_count)+SUM(alt_count) AS num_sample_alleles, 11 | SUM(ref_count) AS ref_cnt, 12 | SUM(alt_count) AS alt_cnt, 13 | SUM(ref_count)/(SUM(ref_count)+SUM(alt_count)) AS ref_freq, 14 | SUM(alt_count)/(SUM(ref_count)+SUM(alt_count)) AS alt_freq, 15 | FROM ( 16 | SELECT 17 | reference_name, 18 | start, 19 | population, 20 | reference_bases, 21 | alternate_bases, 22 | alt, 23 | SUM(INTEGER(0 = call.genotype)) WITHIN RECORD AS ref_count, # genotype values equal to 0 24 | SUM(INTEGER(alt = call.genotype)) WITHIN RECORD AS alt_count # genotype values equal to this alternate's position 25 | FROM 26 | FLATTEN(FLATTEN(( 27 | SELECT 28 | reference_name, 29 | start, 30 | reference_bases, 31 | alternate_bases, 32 | POSITION(alternate_bases) AS alt, 33 | call.call_set_name, 34 | call.genotype, 35 | FROM 36 | [genomics-public-data:1000_genomes.variants] 37 | WHERE 38 | reference_name = '17' 39 | AND start BETWEEN 41196311 40 | AND 41277499 41 | AND vt='SNP' 42 | ), 43 | call), 44 | alt) AS g 45 | JOIN 46 | [genomics-public-data:1000_genomes.sample_info] p 47 | ON 48 | g.call.call_set_name = p.sample) 49 | GROUP BY 50 | reference_name, 51 | start, 52 | population, 53 | reference_bases, 54 | alternate_bases, 55 | alt 56 | ORDER BY 57 | reference_name, 58 | start, 59 | population, 60 | reference_bases, 61 | alt, 62 | alternate_bases 63 | -------------------------------------------------------------------------------- /pgp/provenance/gvcf-expand-mapper.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | 3 | # Copyright 2014 Google Inc. All Rights Reserved. 4 | # 5 | # Licensed under the Apache License, Version 2.0 (the "License"); 6 | # you may not use this file except in compliance with the License.
7 | # You may obtain a copy of the License at 8 | # 9 | # http://www.apache.org/licenses/LICENSE-2.0 10 | # 11 | # Unless required by applicable law or agreed to in writing, software 12 | # distributed under the License is distributed on an "AS IS" BASIS, 13 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | # See the License for the specific language governing permissions and 15 | # limitations under the License. 16 | 17 | """A mapper for expansion of gVCF data. 18 | """ 19 | 20 | import json 21 | import sys 22 | 23 | from gvcf_expander import GvcfExpander 24 | 25 | 26 | def main(): 27 | """Entry point: reads one JSON record per input line and emits the key/value pairs returned by the expander.""" 28 | 29 | # Basic parsing of command line arguments to allow a filename 30 | # to be passed when running this code in the debugger. 31 | file_handle = sys.stdin 32 | if 2 <= len(sys.argv): 33 | file_handle = open(sys.argv[1], "r") 34 | 35 | expander = GvcfExpander() 36 | 37 | line = file_handle.readline() 38 | while line: # readline() returns an empty string at end of input 39 | line = line.strip() 40 | if not line: # skip blank lines 41 | line = file_handle.readline() 42 | continue 43 | 44 | fields = json.loads(line) 45 | 46 | pairs = expander.map(fields=fields) 47 | for pair in pairs: 48 | emit(pair.k, pair.v) 49 | 50 | line = file_handle.readline() 51 | 52 | 53 | def emit(key, fields): 54 | """Emits a key/value pair to stdout. 55 | 56 | Args: 57 | key: (string) 58 | fields: (dictionary) 59 | 60 | Returns: n/a 61 | 62 | Side Effects: 63 | a line holding the key, a tab, and the JSON encoding of fields is written to stdout 64 | """ 65 | print "%s\t%s" % (key, json.dumps(fields)) 66 | 67 | 68 | if __name__ == "__main__": 69 | main() 70 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | bigquery-examples 2 | ================= 3 | 4 | The data stories and queries in this repository demonstrate working with genomic data via [Google BigQuery](https://cloud.google.com/bigquery/).
All examples are built upon public datasets. 5 | 6 | Have other data stories you would like to see here? Have any data stories you would like to *share*? Have *corrections to the biology* covered in this material? Have query *simplifications* or *speed improvements*? Let us know by [filing an issue](https://github.com/googlegenomics/bigquery-examples/issues) or [contacting us directly](mailto:google-genomics-contact@googlegroups.com). 7 | 8 | Getting Started 9 | ----------------- 10 | 11 | If you are new to BigQuery, start here instead: [Analyze Variants Using BigQuery](https://cloud.google.com/genomics/v1/analyze-variants). 12 | 13 | Otherwise, navigate through the tree of content in this repository. You will find queries, RMarkdown, rendered analyses, and provenance details. 14 | 15 | Loading your own Variant Data into BigQuery 16 | ------------------------------------------- 17 | 18 | After trying these queries on public data, you can [load your own variant data into BigQuery](https://cloud.google.com/genomics/v1/load-variants). 19 | 20 | For other types of data, such as variant annotations, see [Preparing Data for BigQuery](https://cloud.google.com/bigquery/preparing-data-for-bigquery) and also [BigQuery in Practice : Loading Data Sets That are Terabytes and Beyond](https://cloud.google.com/developers/articles/bigquery-in-practice) for more detail. 21 | 22 | The mailing list 23 | ---------------- 24 | 25 | The [Google Genomics Discuss mailing list](https://groups.google.com/forum/#!forum/google-genomics-discuss) is a good 26 | way to sync up with other people who use googlegenomics including the core developers. You can subscribe 27 | by sending an email to ``google-genomics-discuss+subscribe@googlegroups.com`` or just post using 28 | the [web forum page](https://groups.google.com/forum/#!forum/google-genomics-discuss). 
29 | -------------------------------------------------------------------------------- /pgp/sql/gvcf_variants/allele-count.sql: -------------------------------------------------------------------------------- 1 | # Count the occurrence of each variant allele across all participants in the 2 | # dataset. This returns a large result so be sure to materialize it into a 3 | # table for subsequent use. 4 | # 5 | # Note that the new BigQuery feature of user-defined javascript 6 | # functions is in limited preview. For more info, see 7 | # https://www.youtube.com/watch?v=GrD7ymUPt3M#t=1377 8 | SELECT 9 | contig_name, 10 | start_pos, 11 | # This 'bin' can be used in subsequent interval JOINs 12 | INTEGER(FLOOR(start_pos / 5000)) AS bin, 13 | reference_bases, 14 | alternate_bases, 15 | SUM(alternate_allele_count) AS alternate_allele_count, 16 | FROM ( 17 | SELECT contig_name, start_pos, reference_bases, alternate_bases, alternate_allele_count 18 | FROM js( 19 | [google.com:biggene:pgp.gvcf_variants], 20 | contig_name, start_pos, reference_bases, alternate_bases, call.genotype, 21 | "[{name: 'contig_name', type: 'string'}, 22 | {name: 'start_pos', type: 'integer'}, 23 | {name: 'reference_bases', type: 'string'}, 24 | {name: 'alternate_bases', type: 'string'}, 25 | {name: 'alternate_allele_count', type: 'integer'}]", 26 | "function(r, emit) { 27 | for(var a in r.alternate_bases) { 28 | var alt_gt = parseInt(a, 10) + 1; // for-in keys are strings, so convert before adding 1 29 | var alt_count = 0; 30 | for(var c in r.call) { 31 | for(var g in r.call[c].genotype) { 32 | if(alt_gt == r.call[c].genotype[g]) { 33 | alt_count++; 34 | } 35 | } 36 | } 37 | // Emit one record per alt 38 | emit({ 39 | contig_name: r.contig_name, 40 | start_pos: r.start_pos, 41 | reference_bases: r.reference_bases, 42 | alternate_bases: r.alternate_bases[a], 43 | alternate_allele_count: alt_count 44 | }); 45 | } 46 | }")) 47 | GROUP EACH BY 48 | contig_name, 49 | start_pos, 50 | bin, 51 | reference_bases, 52 | alternate_bases 53 |
-------------------------------------------------------------------------------- /pgp/sql/schema-comparisons/sample-call-counts.sql: -------------------------------------------------------------------------------- 1 | # Sample call counts for the PGP data encoded several different ways. 2 | # NOTE: table pgp.variants was left out of this example since it's more trouble 3 | # than it's worth to parse the GT field into its components. 4 | SELECT 5 | sample_id, 6 | num_records, 7 | num_variant_alleles, 8 | dataset 9 | FROM # legacy SQL treats the comma-separated subqueries below as a union of their rows 10 | ( 11 | SELECT 12 | sample_id, 13 | COUNT(sample_id) AS num_records, 14 | INTEGER(SUM(allele1_is_variant + allele2_is_variant)) AS num_variant_alleles, 15 | 'cgi_variants' AS dataset 16 | FROM ( 17 | SELECT 18 | sample_id, 19 | allele1Seq != reference 20 | AND allele1Seq != '=' 21 | AND allele1Seq != '?' AS allele1_is_variant, 22 | allele2Seq != reference 23 | AND allele2Seq != '=' 24 | AND allele2Seq != '?' AS allele2_is_variant, 25 | FROM 26 | [google.com:biggene:pgp.cgi_variants] 27 | # Skip the genomes we were unable to convert to VCF/gVCF 28 | OMIT 29 | RECORD IF 30 | sample_id = 'huEDF7DA' 31 | OR sample_id = 'hu34D5B9') 32 | GROUP BY 33 | sample_id), 34 | ( 35 | SELECT 36 | sample_id, 37 | COUNT(sample_id) AS num_records, 38 | INTEGER(SUM(num_variant_alleles)) AS num_variant_alleles, 39 | 'gvcf_variants' AS dataset 40 | FROM ( 41 | SELECT 42 | call.callset_name AS sample_id, 43 | SUM(call.genotype > 0) WITHIN call AS num_variant_alleles, 44 | FROM 45 | [google.com:biggene:pgp.gvcf_variants]) 46 | GROUP BY 47 | sample_id), 48 | ( 49 | SELECT 50 | sample_id, 51 | COUNT(sample_id) AS num_records, 52 | INTEGER(SUM(num_variant_alleles)) AS num_variant_alleles, 53 | 'gvcf_variants_expanded' AS dataset 54 | FROM 55 | ( 56 | SELECT 57 | call.callset_name AS sample_id, 58 | SUM(call.genotype > 0) WITHIN call AS num_variant_alleles, 59 | FROM 60 | [google.com:biggene:test.pgp_gvcf_variants_expanded2]) 61 | GROUP BY 62 | sample_id) 63 | ORDER BY 64 |
sample_id, 65 | dataset 66 | -------------------------------------------------------------------------------- /1000genomes/sql/gender-het-hom-ratio.sql: -------------------------------------------------------------------------------- 1 | # The following query uses the homozygous and heterozygous variant counts within 2 | # chromosome X to help determine whether the gender phenotype values are correct 3 | # for the samples. 4 | SELECT 5 | sample_id, 6 | gender, 7 | reference_name, 8 | (hom_AA_count + het_RA_count + hom_RR_count) AS all_callable_sites, 9 | hom_AA_count, 10 | het_RA_count, 11 | hom_RR_count, 12 | (hom_AA_count + het_RA_count) AS all_snvs, 13 | ROUND((het_RA_count/(hom_AA_count + het_RA_count))*1000)/1000 AS perct_het_alt_in_snvs, # a fraction rounded to 3 decimal places 14 | ROUND((hom_AA_count/(hom_AA_count + het_RA_count))*1000)/1000 AS perct_hom_alt_in_snvs 15 | FROM 16 | ( 17 | SELECT 18 | reference_name, 19 | sample_id, 20 | SUM(IF(0 = first_allele 21 | AND 0 = second_allele, 22 | 1, 23 | 0)) AS hom_RR_count, 24 | SUM(IF(first_allele = second_allele 25 | AND first_allele > 0, 26 | 1, 27 | 0)) AS hom_AA_count, 28 | SUM(IF((first_allele != second_allele OR second_allele IS NULL) # a call with no second genotype value is also counted here when its first value is > 0 29 | AND (first_allele > 0 30 | OR second_allele > 0), 31 | 1, 32 | 0)) AS het_RA_count 33 | FROM ( 34 | SELECT 35 | reference_name, 36 | call.call_set_name AS sample_id, 37 | NTH(1, 38 | call.genotype) WITHIN call AS first_allele, 39 | NTH(2, 40 | call.genotype) WITHIN call AS second_allele, 41 | FROM 42 | [genomics-public-data:1000_genomes.variants] 43 | WHERE 44 | reference_name = 'X' 45 | AND vt = 'SNP' 46 | AND start NOT BETWEEN 59999 47 | AND 2699519 # NOTE(review): this excluded range and the next look like the chrX pseudoautosomal regions - confirm 48 | AND start NOT BETWEEN 154931042 49 | AND 155260559) 50 | GROUP BY 51 | sample_id, 52 | reference_name 53 | ) AS g 54 | JOIN 55 | [genomics-public-data:1000_genomes.sample_info] p 56 | ON 57 | g.sample_id = p.sample 58 | GROUP BY 59 | sample_id, 60 | gender, 61 | reference_name, 62 | all_callable_sites, 63 | hom_AA_count, 64 | het_RA_count, 65 |
hom_RR_count, 66 | all_snvs, 67 | perct_het_alt_in_snvs, 68 | perct_hom_alt_in_snvs 69 | ORDER BY 70 | perct_het_alt_in_snvs DESC, 71 | sample_id 72 | 73 | -------------------------------------------------------------------------------- /pgp/provenance/gvcf-expand-reducer.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | 3 | # Copyright 2014 Google Inc. All Rights Reserved. 4 | # 5 | # Licensed under the Apache License, Version 2.0 (the "License"); 6 | # you may not use this file except in compliance with the License. 7 | # You may obtain a copy of the License at 8 | # 9 | # http://www.apache.org/licenses/LICENSE-2.0 10 | # 11 | # Unless required by applicable law or agreed to in writing, software 12 | # distributed under the License is distributed on an "AS IS" BASIS, 13 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | # See the License for the specific language governing permissions and 15 | # limitations under the License. 16 | 17 | """A reducer for expansion of gVCF data. 18 | """ 19 | 20 | import json 21 | import os 22 | import sys 23 | 24 | from gvcf_expander import GvcfExpander 25 | from gvcf_expander import Pair 26 | 27 | FILTER_ENV_KEY = "FILTER_REF_MATCHES" # if this environment variable is set (any value), the expander is built with filter_ref_matches=True 28 | 29 | 30 | def main(): 31 | """Entry point: reads key<TAB>JSON lines, feeds them to the expander, and emits its results as JSON lines.""" 32 | 33 | # Basic parsing of command line arguments to allow a filename 34 | # to be passed when running this code in the debugger.
35 | file_handle = sys.stdin 36 | if 2 <= len(sys.argv): 37 | file_handle = open(sys.argv[1], "r") 38 | 39 | if FILTER_ENV_KEY in os.environ: 40 | expander = GvcfExpander(filter_ref_matches=True) 41 | else: 42 | expander = GvcfExpander() 43 | 44 | line = file_handle.readline() 45 | while line: # readline() returns an empty string at end of input 46 | line = line.strip() 47 | if not line: # skip blank lines 48 | line = file_handle.readline() 49 | continue 50 | 51 | (key, value) = line.split("\t") # each input line must hold exactly one tab: key<TAB>JSON value 52 | fields = json.loads(value) 53 | results = expander.reduce(pair=Pair(key, fields)) 54 | 55 | for result in results: 56 | emit(result) 57 | 58 | line = file_handle.readline() 59 | 60 | results = expander.finalize() # NOTE(review): presumably returns results still held by the expander - confirm in gvcf_expander 61 | 62 | for result in results: 63 | emit(result) 64 | 65 | 66 | def emit(fields): 67 | """Emits a reduced value to stdout. 68 | 69 | Args: 70 | fields: (dict) 71 | 72 | Returns: n/a 73 | 74 | Side Effects: 75 | the JSON encoding of fields is written to stdout as one line 76 | """ 77 | print "%s" % (json.dumps(fields)) 78 | 79 | 80 | if __name__ == "__main__": 81 | main() 82 | -------------------------------------------------------------------------------- /1000genomes/sql/reproducing-allelic-frequencies/reproducing-allelic-frequency-by-ethnicity.sql: -------------------------------------------------------------------------------- 1 | # The following query computes the allelic frequency for BRCA1 variants in the 2 | # 1,000 Genomes dataset further classified by ethnicity from the phenotypic data 3 | # and also includes the pre-computed value from the dataset.
4 | SELECT 5 | reference_name, 6 | start, 7 | super_population, 8 | reference_bases, 9 | alternate_bases, 10 | SUM(ref_count)+SUM(alt_count) AS num_sample_alleles, 11 | SUM(ref_count) AS sample_allele_ref_cnt, 12 | SUM(alt_count) AS sample_allele_alt_cnt, 13 | SUM(ref_count)/(SUM(ref_count)+SUM(alt_count)) AS ref_freq, 14 | SUM(alt_count)/(SUM(ref_count)+SUM(alt_count)) AS alt_freq, 15 | alt_freq_from_1KG 16 | FROM ( 17 | SELECT 18 | reference_name, 19 | start, 20 | super_population, 21 | reference_bases, 22 | alternate_bases, 23 | alt, 24 | SUM(INTEGER(0 = call.genotype)) WITHIN RECORD AS ref_count, # genotype values equal to 0 25 | SUM(INTEGER(alt = call.genotype)) WITHIN RECORD AS alt_count, # genotype values equal to this alternate's position 26 | CASE # pick the stored frequency field that corresponds to this row's super population 27 | WHEN super_population = 'EAS' 28 | THEN asn_af # EAS rows read the field named asn_af 29 | WHEN super_population= 'EUR' 30 | THEN eur_af 31 | WHEN super_population = 'AFR' 32 | THEN afr_af 33 | WHEN super_population = 'AMR' 34 | THEN amr_af 35 | END AS alt_freq_from_1KG # NULL when super_population is none of the four values above 36 | FROM 37 | FLATTEN(FLATTEN(( 38 | SELECT 39 | reference_name, 40 | start, 41 | reference_bases, 42 | alternate_bases, 43 | POSITION(alternate_bases) AS alt, 44 | call.call_set_name, 45 | call.genotype, 46 | afr_af, 47 | amr_af, 48 | asn_af, 49 | eur_af, 50 | FROM 51 | [genomics-public-data:1000_genomes.variants] 52 | WHERE 53 | reference_name = '17' 54 | AND start BETWEEN 41196311 55 | AND 41277499 56 | AND vt='SNP' 57 | ), 58 | call), 59 | alt) AS g 60 | JOIN 61 | [genomics-public-data:1000_genomes.sample_info] p 62 | ON 63 | g.call.call_set_name = p.sample) 64 | GROUP BY 65 | reference_name, 66 | start, 67 | super_population, 68 | reference_bases, 69 | alternate_bases, 70 | alt_freq_from_1KG 71 | ORDER BY 72 | reference_name, 73 | start, 74 | super_population 75 | -------------------------------------------------------------------------------- /pgp/sql/gvcf_variants/ti-tv-ratio.sql: -------------------------------------------------------------------------------- 1 | # Compute the Ti/Tv ratio for each participant in the PGP dataset.
A user-defined 2 | # function is used here since it's difficult in SQL to join the genotype array in 3 | # each call with alternate_bases at the variant level. 4 | # 5 | # Note that the new BigQuery feature of user-defined javascript 6 | # functions is in limited preview. For more info, see 7 | # https://www.youtube.com/watch?v=GrD7ymUPt3M#t=1377 8 | SELECT 9 | sample_id, 10 | transitions, 11 | transversions, 12 | transitions/transversions AS titv # NOTE(review): no guard for transversions = 0 - confirm how the engine handles it 13 | FROM ( 14 | SELECT 15 | sample_id, 16 | SUM(IF(mutation IN ('A->G', 17 | 'G->A', 18 | 'C->T', 19 | 'T->C'), 20 | 1, 21 | 0)) AS transitions, 22 | SUM(IF(mutation IN ('A->C', 23 | 'C->A', 24 | 'G->T', 25 | 'T->G', 26 | 'A->T', 27 | 'T->A', 28 | 'C->G', 29 | 'G->C'), 30 | 1, 31 | 0)) AS transversions, 32 | FROM ( 33 | SELECT sample_id, mutation 34 | FROM js( 35 | [google.com:biggene:pgp.gvcf_variants], 36 | reference_bases, alternate_bases, call.callset_name, call.genotype, 37 | "[{name: 'sample_id', type: 'string'}, 38 | {name: 'mutation', type: 'string'}]", 39 | "function(r, emit) { 40 | var hasSNP = false; 41 | var isSNP = [false]; 42 | for(var i in r.alternate_bases) { 43 | if(1 == r.alternate_bases[i].length) { 44 | isSNP[isSNP.length] = true; 45 | hasSNP = true; 46 | } 47 | else { 48 | isSNP[isSNP.length] = false; 49 | } 50 | } 51 | if (hasSNP && 1 == r.reference_bases.length) { 52 | for(var i in r.call) { 53 | for(var j in r.call[i].genotype) { 54 | if(0 < r.call[i].genotype[j] && isSNP[r.call[i].genotype[j]]) { 55 | emit({ 56 | sample_id: r.call[i].callset_name, 57 | mutation: r.reference_bases + '->' + r.alternate_bases[r.call[i].genotype[j] - 1] 58 | }); 59 | } 60 | } 61 | } 62 | } 63 | }")) 64 | GROUP BY 65 | sample_id) 66 | ORDER BY 67 | titv DESC 68 | -------------------------------------------------------------------------------- /pgp/sql/gvcf_variants_expanded/ti-tv-ratio.sql: -------------------------------------------------------------------------------- 1 | # Compute the Ti/Tv ratio for each
participant in the PGP dataset. A user-defined 2 | # function is used here since it's difficult in SQL to join the genotype array in 3 | # each call with alternate_bases at the variant level. 4 | # 5 | # Note that the new BigQuery feature of user-defined javascript 6 | # functions is in limited preview. For more info, see 7 | # https://www.youtube.com/watch?v=GrD7ymUPt3M#t=1377 8 | SELECT 9 | sample_id, 10 | transitions, 11 | transversions, 12 | transitions/transversions AS titv # NOTE(review): no guard for transversions = 0 - confirm how the engine handles it 13 | FROM ( 14 | SELECT 15 | sample_id, 16 | SUM(IF(mutation IN ('A->G', 17 | 'G->A', 18 | 'C->T', 19 | 'T->C'), 20 | 1, 21 | 0)) AS transitions, 22 | SUM(IF(mutation IN ('A->C', 23 | 'C->A', 24 | 'G->T', 25 | 'T->G', 26 | 'A->T', 27 | 'T->A', 28 | 'C->G', 29 | 'G->C'), 30 | 1, 31 | 0)) AS transversions, 32 | FROM ( 33 | SELECT sample_id, mutation 34 | FROM js( 35 | [google.com:biggene:test.pgp_gvcf_variants_expanded], 36 | reference_bases, alternate_bases, call.callset_name, call.genotype, 37 | "[{name: 'sample_id', type: 'string'}, 38 | {name: 'mutation', type: 'string'}]", 39 | "function(r, emit) { 40 | var hasSNP = false; 41 | var isSNP = [false]; 42 | for(var i in r.alternate_bases) { 43 | if(1 == r.alternate_bases[i].length) { 44 | isSNP[isSNP.length] = true; 45 | hasSNP = true; 46 | } 47 | else { 48 | isSNP[isSNP.length] = false; 49 | } 50 | } 51 | if (hasSNP && 1 == r.reference_bases.length) { 52 | for(var i in r.call) { 53 | for(var j in r.call[i].genotype) { 54 | if(0 < r.call[i].genotype[j] && isSNP[r.call[i].genotype[j]]) { 55 | emit({ 56 | sample_id: r.call[i].callset_name, 57 | mutation: r.reference_bases + '->' + r.alternate_bases[r.call[i].genotype[j] - 1] 58 | }); 59 | } 60 | } 61 | } 62 | } 63 | }") 64 | ) 65 | GROUP BY 66 | sample_id) 67 | ORDER BY 68 | titv DESC 69 | -------------------------------------------------------------------------------- /pgp/sql/gvcf_variants/allelic-frequency-brca1-no-udf.sql:
-------------------------------------------------------------------------------- 1 | # The following query computes the allelic frequency for BRCA1 variants in the 2 | # PGP dataset _without_ using a user-defined function. 3 | # 4 | # Since without UDFs we cannot count the other reference calls, just assume 5 | # the total number of alleles is the number of samples times 2 (thereby losing the 6 | # distinction between reference calls and no-calls, unfortunately). 7 | SELECT 8 | contig_name, 9 | start_pos, 10 | reference_bases, 11 | alt AS allele, 12 | (174 * 2) AS num_alleles_called, # hard-coded: the assumed total described in the header (samples times 2) 13 | ROUND(alt_allele_count / (174 * 2), 14 | 4) AS freq, 15 | FROM ( 16 | SELECT 17 | contig_name, 18 | start_pos, 19 | reference_bases, 20 | alt, 21 | SUM(ref_allele_count) AS ref_allele_count, 22 | SUM(alt_allele_count) AS alt_allele_count, 23 | SUM(other_alt_allele_count) AS other_alt_allele_count, 24 | FROM ( 25 | SELECT 26 | contig_name, 27 | start_pos, 28 | reference_bases, 29 | NTH(1, 30 | alternate_bases) WITHIN RECORD AS alt, 31 | SUM(IF(0 = call.genotype, 32 | 1, 33 | 0)) WITHIN RECORD AS ref_allele_count, 34 | SUM(IF(1 = call.genotype, 35 | 1, 36 | 0)) WITHIN RECORD AS alt_allele_count, 37 | SUM(IF(0 != call.genotype 38 | AND 1 != call.genotype, 39 | 1, 40 | 0)) WITHIN RECORD AS other_alt_allele_count, 41 | FROM 42 | [google.com:biggene:pgp.gvcf_variants] 43 | WHERE 44 | reference_bases != 'N' 45 | AND contig_name = '17' 46 | AND start_pos BETWEEN 41196312 47 | AND 41277500 48 | ), 49 | ( 50 | SELECT 51 | contig_name, 52 | start_pos, 53 | reference_bases, 54 | NTH(2, 55 | alternate_bases) WITHIN RECORD AS alt, 56 | SUM(IF(0 = call.genotype, 57 | 1, 58 | 0)) WITHIN RECORD AS ref_allele_count, 59 | SUM(IF(2 = call.genotype, 60 | 1, 61 | 0)) WITHIN RECORD AS alt_allele_count, 62 | SUM(IF(0 != call.genotype 63 | AND 2 != call.genotype, 64 | 1, 65 | 0)) WITHIN RECORD AS other_alt_allele_count, 66 | FROM 67 | [google.com:biggene:pgp.gvcf_variants] 68 | WHERE 69 | reference_bases !=
'N' 70 | AND contig_name = '17' 71 | AND start_pos BETWEEN 41196312 72 | AND 41277500 73 | ) 74 | WHERE 75 | alt IS NOT NULL 76 | GROUP BY 77 | contig_name, 78 | start_pos, 79 | reference_bases, 80 | alt 81 | ) -------------------------------------------------------------------------------- /1000genomes/sql/shared-variant-counts-by-ethnicity.sql: -------------------------------------------------------------------------------- 1 | #standardSQL 2 | -- 3 | -- We'd like to see how the members of each super population share variation. 4 | -- 5 | -- Let's generate a table where the records indicate: 6 | -- 7 | -- For the variants that appear in a given super-population: 8 | -- how many variants are singletons (not shared)? 9 | -- how many variants are shared by exactly 2 individuals? 10 | -- how many variants are shared by exactly 3 individuals? 11 | -- etc ... 12 | -- how many variants are shared by all members of the super population? 13 | -- 14 | -- The variants and counts are further partitioned by whether the variant is common or rare. 15 | -- 16 | WITH 17 | population_counts AS ( 18 | SELECT 19 | super_population, 20 | COUNT(population) AS super_population_count 21 | FROM 22 | `genomics-public-data.1000_genomes.sample_info` 23 | WHERE 24 | In_Phase1_Integrated_Variant_Set = TRUE 25 | GROUP BY 26 | super_population), 27 | -- 28 | autosome_calls AS ( 29 | SELECT 30 | reference_name, 31 | start, 32 | `end`, 33 | reference_bases, 34 | alternate_bases[ORDINAL(1)] AS alt, -- 1000 Genomes is biallelic. 
35 | vt, 36 | af IS NOT NULL 37 | AND af >= 0.05 AS is_common_variant, -- common means af is present and at least 0.05 38 | call.call_set_name, 39 | super_population 40 | FROM 41 | `genomics-public-data.1000_genomes.variants` AS v, v.call AS call 42 | JOIN 43 | `genomics-public-data.1000_genomes.sample_info` AS p 44 | ON 45 | call.call_set_name = p.sample 46 | WHERE 47 | reference_name NOT IN ("X", "Y", "MT") 48 | AND EXISTS (SELECT gt FROM UNNEST(call.genotype) gt WHERE gt > 0)), -- keep only calls with at least one genotype value above 0 49 | -- 50 | super_population_autosome_variants AS ( 51 | SELECT 52 | reference_name, 53 | start, 54 | `end`, 55 | reference_bases, 56 | alt, 57 | vt, 58 | super_population, 59 | is_common_variant, 60 | COUNT(call_set_name) AS num_samples 61 | FROM 62 | autosome_calls 63 | GROUP BY 64 | reference_name, 65 | start, 66 | `end`, 67 | reference_bases, 68 | alt, 69 | vt, 70 | super_population, 71 | is_common_variant ) 72 | -- 73 | -- 74 | SELECT 75 | p.super_population AS super_population, 76 | super_population_count, 77 | is_common_variant, 78 | num_samples, 79 | num_samples / super_population_count AS percent_samples, -- a fraction of the super population; not multiplied by 100 80 | COUNT(1) AS num_variants_shared_by_this_many_samples 81 | FROM 82 | super_population_autosome_variants AS v 83 | JOIN population_counts AS p 84 | ON 85 | v.super_population = p.super_population 86 | GROUP BY 87 | super_population, 88 | super_population_count, 89 | is_common_variant, 90 | num_samples, 91 | percent_samples 92 | ORDER BY 93 | num_samples, 94 | super_population, 95 | is_common_variant 96 | -------------------------------------------------------------------------------- /pgp/sql/schema-comparisons/missingness-brca1.sql: -------------------------------------------------------------------------------- 1 | # Missingness rate for variants within BRCA1.
2 | SELECT 3 | vars.contig_name AS contig_name, 4 | vars.start_pos AS start_pos, 5 | reference_bases, 6 | variant_called_count, 7 | SUM(refs.called_count) AS reference_called_count, 8 | variant_called_count + SUM(refs.called_count) AS num_alleles_called_for_position, 9 | 1 - ((variant_called_count + SUM(refs.called_count))/(172*2)) AS missingness_rate # NOTE(review): 172*2 is hard-coded, presumably samples times 2 alleles - confirm 10 | FROM ( 11 | # Join our variant sample counts with the corresponding reference-matching blocks 12 | SELECT 13 | vars.contig_name, 14 | vars.start_pos, 15 | refs.start_pos, 16 | vars.end_pos, 17 | refs.END, 18 | reference_bases, 19 | variant_called_count, 20 | refs.called_count 21 | FROM ( 22 | # Constrain the left hand side of the join to reference-matching blocks 23 | SELECT 24 | contig_name, 25 | start_pos, 26 | END, 27 | IF(alternate_bases IS NULL, 28 | FALSE, 29 | TRUE) AS is_variant_call, 30 | SUM(call.genotype >= 0) WITHIN RECORD AS called_count, 31 | FROM 32 | [google.com:biggene:pgp.gvcf_variants] 33 | WHERE 34 | contig_name = '17' 35 | HAVING 36 | is_variant_call = FALSE) AS refs 37 | JOIN ( 38 | # Constrain the right hand side of the join to variants. 39 | # Group our variant sample counts together since a single SNP may be in more than 40 | # one row due to 1/2 genotypes 41 | SELECT 42 | contig_name, 43 | start_pos, 44 | end_pos, 45 | reference_bases, 46 | SUM(called_count) AS variant_called_count, 47 | FROM ( 48 | # Limit the query to SNPs on chromosome 17 within BRCA1 49 | SELECT 50 | contig_name, 51 | start_pos, 52 | end_pos, 53 | reference_bases, 54 | LENGTH(reference_bases) AS ref_len, 55 | MIN(LENGTH(alternate_bases)) WITHIN RECORD AS alt_len, 56 | IF(alternate_bases IS NULL, 57 | FALSE, 58 | TRUE) AS is_variant_call, 59 | SUM(call.genotype >= 0) WITHIN RECORD AS called_count, 60 | FROM 61 | [google.com:biggene:pgp.gvcf_variants] 62 | WHERE 63 | contig_name = '17' 64 | AND start_pos BETWEEN 41196312 65 | AND 41277500 66 | HAVING 67 | ref_len = 1 68 | AND alt_len = 1 69 | AND is_variant_call)
70 | GROUP BY 71 | contig_name, 72 | start_pos, 73 | end_pos, 74 | reference_bases) AS vars 75 | # The join criteria are complicated since we are trying to see if a SNP overlaps an interval 76 | ON 77 | vars.contig_name = refs.contig_name 78 | WHERE 79 | refs.start_pos <= vars.start_pos 80 | AND refs.END >= vars.end_pos 81 | ) 82 | GROUP BY 83 | contig_name, 84 | start_pos, 85 | reference_bases, 86 | variant_called_count 87 | -------------------------------------------------------------------------------- /pgp/sql/gvcf_variants_expanded/allelic-frequency.sql: -------------------------------------------------------------------------------- 1 | # This is busted. It over counts ref calls due to the GROUP BY operation. It 2 | # would work if we grouped all of the same variant into the same row prior to 3 | # loading to BigQuery because then we would not need the GROUP BY operation. 4 | # 5 | # Note that the new BigQuery feature of user-defined javascript 6 | # functions is in limited preview.
# For more info on BigQuery user-defined JavaScript functions, see
# https://www.youtube.com/watch?v=GrD7ymUPt3M#t=1377
#
# The UDF emits one row per alternate allele of each input record, holding the
# number of called alleles that match the reference (ref_count), that
# alternate (alt_count), or any other alternate (other_count).  The outer
# queries sum those counts per (contig, position, ref, alt) and derive the
# alternate allele frequency.
SELECT
  contig_name,
  start_pos,
  reference_bases,
  alternate_bases,
  ref_count + alt_count + other_count AS num_sample_alleles,
  alt_count/(ref_count + alt_count + other_count) AS alt_freq,
FROM (
  SELECT
    contig_name,
    start_pos,
    reference_bases,
    alternate_bases,
    SUM(alt_count) AS alt_count,
    SUM(ref_count) AS ref_count,
    SUM(other_count) AS other_count,
  FROM (
    SELECT contig_name, start_pos, reference_bases, alternate_bases, alt_count, ref_count, other_count
    FROM js(
      [google.com:biggene:test.pgp_gvcf_variants_expanded],
      contig_name, start_pos, reference_bases, alternate_bases, call.genotype,
      "[{name: 'contig_name', type: 'string'},
        {name: 'start_pos', type: 'integer'},
        {name: 'reference_bases', type: 'string'},
        {name: 'alternate_bases', type: 'string'},
        {name: 'alt_count', type: 'integer'},
        {name: 'ref_count', type: 'integer'},
        {name: 'other_count', type: 'integer'}]",
      "function(r, emit) {
         for(var a in r.alternate_bases) {
           // for...in yields string keys, so convert before adding one;
           // otherwise '1' + 1 is the string '11' and the second and later
           // alternates never match their genotype number.
           var alt_gt = parseInt(a, 10) + 1;
           var ref_count = 0;
           var alt_count = 0;
           var other_count = 0;
           for(var c in r.call) {
             for(var g in r.call[c].genotype) {
               if(0 > r.call[c].genotype[g]) {
                 // Don't count no-calls
                 continue;
               } else if (0 == r.call[c].genotype[g]) {
                 ref_count++;
               } else if (alt_gt == r.call[c].genotype[g]) {
                 alt_count++;
               } else {
                 other_count++;
               }
             }
           }
           // Emit one record per alt
           emit({
             contig_name: r.contig_name,
             start_pos: r.start_pos,
             reference_bases: r.reference_bases,
             alternate_bases: r.alternate_bases[a],
             alt_count: alt_count,
             ref_count: ref_count,
             other_count: other_count
           });
         }
       }"))
  GROUP EACH BY
    contig_name,
    start_pos,
    reference_bases,
    alternate_bases)
-------------------------------------------------------------------------------- /pgp/README.md: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | 10 | 11 | 12 | 13 | 14 | 15 | 16 | 17 | Personal Genomes Project 18 | ================= 19 | 20 | ### Additional Resources 21 | * [Schema](https://bigquery.cloud.google.com/table/google.com:biggene:pgp.variants?pli=1) 22 | * [Provenance](./provenance) 23 | * [Data Stories](./data-stories) such as 24 | * [Comparing PGP to 1000 Genomes](./data-stories/comparing-pgp-to-1000genomes) 25 | * [Issues with the Variant-Centric Approach](./data-stories/issues-with-the-variant-centric-approach) 26 | 27 | 28 | 29 | 30 | **See [PGP Public data](http://googlegenomics.readthedocs.org/en/latest/use_cases/discover_public_data/pgp_public_data.html#bigquery-pgp-tables) for provenance details of the most recent import of the PGP data which has the up-to-date schema.** The other tables you see here comprise a variety of schema experiments. Some of the column names for common data may differ from those of your own variants data exported to BigQuery. 31 | 32 | Here is an initial query joining the variant data with the phenotypic data. See the [phenotypes schema](https://bigquery.cloud.google.com/table/google.com:biggene:pgp.phenotypes?pli=1) for more detail. 33 | 34 | 35 | ``` 36 | # Compute sample count by gender 37 | SELECT 38 | Sex_Gender, 39 | COUNT(1) AS cnt 40 | FROM 41 | ( 42 | SELECT 43 | call.callset_name, 44 | Sex_Gender 45 | FROM 46 | FLATTEN([google.com:biggene:pgp.variants], 47 | call) AS var 48 | JOIN 49 | [google.com:biggene:pgp.phenotypes] AS pheno 50 | ON 51 | pheno.Participant = var.call.callset_name 52 | GROUP BY 53 | call.callset_name, 54 | Sex_Gender) 55 | GROUP BY 56 | Sex_Gender Running query: RUNNING 2.1s 57 | ``` 58 | 59 | 60 | 61 | 62 | 63 | 64 | 65 | 66 |
Sex_Gender cnt
Female 53
Male 112
6
# Constants
# Name of the environment variable from which the input file path is read
# when no path is given on the command line.
INPUT_FILE_KEY = "map_input_file"
# Input path whose header lines are not counted (see main()).
DUPLICATE_GENOME = "gs://pgp-harvard-data-public/hu34D5B9/GS000012763-DID/GS000010327-ASM/GS01173-DNA_C07/ASM/masterVarBeta-GS000010327-ASM.tsv.bz2"


def generate_long_count_token(value):
  """Formats result for the Hadoop Aggregate package.

  For more detail, see
  http://hadoop.apache.org/docs/r1.2.1/streaming.html#Hadoop+Aggregate+Package

  Args:
    value: (string) the value to emit

  Returns:
    (string) the formatted value: "LongValueSum:" + value + tab + "1"
  """
  return "LongValueSum:" + value + "\t" + "1"


def main():
  """Entry point to the script.

  Reads CGI data from the file named by the first command line argument when
  one is given, otherwise from stdin, and writes one LongValueSum token to
  stdout for every line that starts with "#" (tabs replaced by spaces).
  Blank lines and all other lines produce no output.  Nothing is emitted at
  all when the input path equals DUPLICATE_GENOME.
  """

  # Basic parsing of command line arguments to allow a filename
  # to be passed when running this code in the debugger.
  path = None
  file_handle = sys.stdin
  if 2 <= len(sys.argv):
    path = sys.argv[1]
    file_handle = open(path, "r")
  elif INPUT_FILE_KEY in os.environ:
    # Only consult the environment when the variable is actually set, so that
    # "cat file.tsv | ./cgi-header-mapper.py" works instead of raising KeyError.
    path = os.environ[INPUT_FILE_KEY]
    sys.stderr.write(str(path) + "\n")
    sys.stderr.write(str(os.environ) + "\n")

  # hu34D5B9 was sequenced twice, skip the older genome.  The input is still
  # read to the end in that case, exactly as before.
  skip_file = (DUPLICATE_GENOME == path)

  try:
    for line in iter(file_handle.readline, ""):
      line = line.rstrip("\n")
      if skip_file or not line:
        # Duplicate genome or blank line, skip it
        continue
      if "#" == line[0]:
        # This is a header line, count it
        token = generate_long_count_token(line.replace("\t", " "))
        sys.stdout.write(token + "\n")
  finally:
    # Close the file we opened ourselves; leave stdin alone.
    if file_handle is not sys.stdin:
      file_handle.close()


if __name__ == "__main__":
  main()
# Read the SQL text stored at queryUri (up to 1e6 characters), print it, and
# run it with query_exec() under the global `project`; the query result is
# returned to the caller.
DisplayAndDispatchQuery <- function(queryUri) {
  sqlText <- readChar(queryUri, nchars=1e6)
  cat(sqlText)
  queryResult <- query_exec(sqlText, project)
  queryResult
}
HG00152 6 | 1 HG00160 7 | 1 HG00186 8 | 1 HG00231 9 | 1 HG00234 10 | 2 HG00235 11 | 1 HG00236 12 | 2 HG00237 13 | 1 HG00244 14 | 1 HG00246 15 | 1 HG00247 16 | 1 HG00249 17 | 2 HG00329 18 | 1 HG00342 19 | 1 HG00355 20 | 1 HG00367 21 | 2 HG00384 22 | 1 HG00422 23 | 1 HG00442 24 | 1 HG00452 25 | 1 HG00475 26 | 1 HG00534 27 | 1 HG00556 28 | 1 HG00560 29 | 2 HG00611 30 | 1 HG00641 31 | 1 HG00654 32 | 1 HG00656 33 | 1 HG00671 34 | 1 HG00693 35 | 1 HG00699 36 | 1 HG00701 37 | 1 HG00707 38 | 1 HG00708 39 | 3 HG00737 40 | 1 HG00740 41 | 1 HG01048 42 | 1 HG01060 43 | 1 HG01069 44 | 2 HG01108 45 | 2 HG01124 46 | 1 HG01148 47 | 1 HG01149 48 | 1 HG01171 49 | 1 HG01191 50 | 1 HG01272 51 | 1 HG01356 52 | 1 HG01375 53 | 1 HG01378 54 | 3 HG01390 55 | 1 HG01456 56 | 1 HG01462 57 | 1 HG01465 58 | 1 HG01488 59 | 1 HG01489 60 | 1 HG01491 61 | 1 HG01495 62 | 8 HG01551 63 | 1 HG01624 64 | 1 NA07051 65 | 1 NA12342 66 | 2 NA12383 67 | 2 NA12400 68 | 1 NA18507 69 | 2 NA18510 70 | 1 NA18519 71 | 1 NA18523 72 | 1 NA18527 73 | 1 NA18528 74 | 1 NA18532 75 | 1 NA18534 76 | 2 NA18535 77 | 1 NA18536 78 | 1 NA18539 79 | 1 NA18548 80 | 1 NA18557 81 | 2 NA18562 82 | 2 NA18573 83 | 1 NA18596 84 | 1 NA18605 85 | 1 NA18606 86 | 3 NA18616 87 | 1 NA18622 88 | 1 NA18628 89 | 1 NA18631 90 | 1 NA18634 91 | 1 NA18638 92 | 1 NA18641 93 | 1 NA18856 94 | 3 NA18868 95 | 1 NA18907 96 | 1 NA18924 97 | 2 NA18939 98 | 3 NA18956 99 | 1 NA18957 100 | 2 NA18962 101 | 1 NA18976 102 | 1 NA18990 103 | 1 NA18992 104 | 1 NA18995 105 | 1 NA19002 106 | 1 NA19005 107 | 1 NA19020 108 | 1 NA19046 109 | 1 NA19056 110 | 1 NA19059 111 | 2 NA19063 112 | 1 NA19068 113 | 1 NA19074 114 | 2 NA19077 115 | 1 NA19084 116 | 2 NA19087 117 | 1 NA19093 118 | 14 NA19096 119 | 1 NA19099 120 | 1 NA19131 121 | 1 NA19147 122 | 1 NA19149 123 | 1 NA19150 124 | 1 NA19197 125 | 1 NA19236 126 | 1 NA19248 127 | 1 NA19316 128 | 1 NA19318 129 | 1 NA19319 130 | 1 NA19324 131 | 1 NA19332 132 | 2 NA19346 133 | 1 NA19351 134 | 1 NA19360 135 | 1 NA19372 136 | 1 
NA19395 137 | 1 NA19398 138 | 1 NA19401 139 | 1 NA19437 140 | 1 NA19439 141 | 1 NA19440 142 | 1 NA19457 143 | 1 NA19463 144 | 1 NA19467 145 | 1 NA19474 146 | 1 NA19661 147 | 1 NA19701 148 | 2 NA19704 149 | 1 NA19716 150 | 1 NA19717 151 | 1 NA19719 152 | 1 NA19734 153 | 1 NA19740 154 | 1 NA19749 155 | 1 NA19752 156 | 1 NA19755 157 | 1 NA19758 158 | 1 NA19761 159 | 1 NA19762 160 | 1 NA19774 161 | 2 NA19780 162 | 1 NA19782 163 | 1 NA19819 164 | 1 NA19901 165 | 1 NA19904 166 | 1 NA19921 167 | 1 NA20294 168 | 1 NA20296 169 | 2 NA20322 170 | 1 NA20342 171 | 1 NA20344 172 | 1 NA20351 173 | 1 NA20505 174 | 1 NA20506 175 | 2 NA20521 176 | 1 NA20581 177 | 1 NA20582 178 | 1 NA20589 179 | 1 NA20756 180 | 1 NA20760 181 | 1 NA20768 182 | 1 NA20792 183 | 1 NA20796 184 | 1 NA20803 185 | 1 NA20805 186 | 1 NA20809 187 | 2 NA20819 188 | 1 NA20826 189 | -------------------------------------------------------------------------------- /CONTRIBUTING.rst: -------------------------------------------------------------------------------- 1 | How to contribute 2 | =================================== 3 | 4 | First of all, thank you for contributing! 5 | 6 | The mailing list 7 | ---------------- 8 | 9 | For general questions or if you are having trouble getting started, try the 10 | `Google Genomics Discuss mailing list `_. 11 | It's a good way to sync up with other people who use googlegenomics including the core developers. You can subscribe 12 | by sending an email to ``google-genomics-discuss+subscribe@googlegroups.com`` or just post using 13 | the `web forum page `_. 14 | 15 | 16 | Submitting issues 17 | ----------------- 18 | 19 | If you are encountering a bug in the code or have a feature request in mind - file away! 20 | 21 | 22 | Submitting a pull request 23 | ------------------------- 24 | 25 | If you are ready to contribute code, Github provides a nice `overview on how to create a pull request 26 | `_. 
27 | 28 | Some general rules to follow: 29 | 30 | * Do your work in `a fork `_ of this repo. 31 | * Create a branch for each update that you're working on. 32 | These branches are often called "feature" or "topic" branches. Any changes 33 | that you push to your feature branch will automatically be shown in the pull request. 34 | * Keep your pull requests as small as possible. Large pull requests are hard to review. 35 | Try to break up your changes into self-contained and incremental pull requests. 36 | * The first line of commit messages should be a short (<80 character) summary, 37 | followed by an empty line and then any details that you want to share about the commit. 38 | * Please try to follow the existing syntax style 39 | 40 | When you submit or change your pull request, the Travis build system will automatically run tests. 41 | If your pull request fails to pass tests, review the test log, make changes and 42 | then push them to your feature branch to be tested again. 43 | 44 | 45 | Contributor License Agreements 46 | ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ 47 | 48 | All pull requests are welcome. Before we can submit them though, there is a legal hurdle we have to jump. 49 | You'll need to fill out either the individual or corporate Contributor License Agreement 50 | (CLA). 51 | 52 | * If you are an individual writing original source code and you're sure you 53 | own the intellectual property, then you'll need to sign an `individual CLA 54 | `_. 55 | * If you work for a company that wants to allow you to contribute your work, 56 | then you'll need to sign a `corporate CLA 57 | `_. 58 | 59 | Follow either of the two links above to access the appropriate CLA and 60 | instructions for how to sign and return it. Once we receive it, we'll be able to 61 | accept your pull requests. 
62 | -------------------------------------------------------------------------------- /pgp/data-stories/schema-comparisons/schema-comparison-observations.csv: -------------------------------------------------------------------------------- 1 | tables,table_size,description,code,runtime,data_processed,notes 2 | cgi_variants,433GB,sample-level data for a particular variant,klotho.sql,4.1s elapsed,117 GB processed, 3 | gvcf_variants,235GB,sample-level data for a particular variant,klotho.sql,6.9s elapsed,76.8 GB processed, 4 | gvcf_variants_expanded,506GB,sample-level data for a particular variant,klotho.sql,7.7s elapsed,196 GB processed, 5 | cgi_variants,433GB,per sample Ti/Tv ratio,ti-tv-ratio.sql,3.8s elapsed,53.7 GB processed, 6 | gvcf_variants,235GB,per sample Ti/Tv ratio,ti-tv-ratio.sql,29.4s elapsed,59.8 GB processed, 7 | gvcf_variants_expanded,506GB,per sample Ti/Tv ratio,ti-tv-ratio.sql,83.5s elapsed,185 GB processed, 8 | cgi_variants,433GB,allelic frequency on a small region of the genome,allelic-frequency-brca1.sql,50.3s elapsed,117 GB processed, 9 | gvcf_variants,235GB,allelic frequency on a small region of the genome,allelic-frequency-brca1.sql,15.7s elapsed,53.9 GB processed, 10 | gvcf_variants_expanded,506GB,allelic frequency on a small region of the genome,NA,NA,NA,"the pattern is correct but the result will be wrong until records for ""the same"" variant are merged together" 11 | cgi_variants,433GB,"allele counts for the full dataset, as step one out of two to compute allelic frequency for the full dataset",allele-count.sql,"70.2s elapsed 12 | ","88.8 GB processed 13 | 14 | ",result materialized to table google.com:biggene:pgp_analysis_results.cgi_variants_allele_counts 15 | gvcf_variants,235GB,"allele counts for the full dataset, as step one out of two to compute allelic frequency for the full dataset",allele-count.sql,51.6s elapsed,44.1 GB processed,result materialized to table google.com:biggene:pgp_analysis_results.gvcf_variants_allele_counts 16 | 
gvcf_variants_expanded,506GB,"allele counts for the full dataset, as step one out of two to compute allelic frequency for the full dataset",NA,NA,NA,"not necessary, the data encoding allows us to do allelic frequency in a single step" 17 | cgi_variants,433GB,allelic frequency as step two of two,allelic-frequency-chr1.sql,118.4s elapsed,90.4 GB processed,results for all chromosomes materialized to table google.com:biggene:pgp_analysis_results.cgi_variants_allelic_frequency 18 | gvcf_variants,235GB,allelic frequency as step two of two,allelic-frequency-chr1.sql,96.4s elapsed,55.2 GB processed,results for all chromosomes materialized to table google.com:biggene:pgp_analysis_results.gvcf_variants_allelic_frequency 19 | gvcf_variants_expanded,506GB,allelic frequency,allelic-frequency.sql,318.7s elapsed,121 GB processed,"the pattern is correct but the result will be wrong until records for ""the same"" variant are merged together" 20 | cgi_variants,433GB,"allelic frequency compared to 1,000 genomes",allelic-frequency-comparison.sql,20.7s elapsed,2.93 GB processed, 21 | gvcf_variants,235GB,"allelic frequency compared to 1,000 genomes",allelic-frequency-comparison.sql,12.5s elapsed,2.72 GB processed, 22 | gvcf_variants_expanded,506GB,"allelic frequency compared to 1,000 genomes",NA,NA,NA,"the pattern is correct but the result will be wrong until records for ""the same"" variant are merged together" -------------------------------------------------------------------------------- /1000genomes/provenance/README.md: -------------------------------------------------------------------------------- 1 | Provenance 2 | ======================================================== 3 | 4 | Source Variant Data 5 | ------------------------------ 6 | 7 | ### variants table 8 | 9 | See [Google Genomics Public Data](http://googlegenomics.readthedocs.io/en/latest/use_cases/discover_public_data/1000_genomes.html) for provenance details for this data. 
10 | 11 | Source Sample Information 12 | -------------------------------- 13 | [Ethnicity, gender, and family relationship](http://www.1000genomes.org/faq/can-i-get-phenotype-gender-and-family-relationship-information-samples) information is available for the 1,000 Genomes dataset. Super population groupings are described in the [FAQ](http://www.1000genomes.org/category/frequently-asked-questions/population). 14 | 15 | Note: information for sample NA12236 is present in the pedigree table but not sample_info table. Also sample NA12236 is not a member of the samples within table variants1kg. 16 | 17 | ### sample_info table 18 | 19 | Description: 20 | * http://ftp.1000genomes.ebi.ac.uk/vol1/ftp/technical/working/20130606_sample_info/README_20130606_sample_info 21 | * http://ftp.1000genomes.ebi.ac.uk/vol1/ftp/README.populations 22 | * [BigQuery table](https://bigquery.cloud.google.com/table/genomics-public-data:1000_genomes.sample_info?pli=1) 23 | 24 | Source: 25 | * http://ftp.1000genomes.ebi.ac.uk/vol1/ftp/technical/working/20130606_sample_info/20130606_sample_info.txt 26 | * http://ftp.1000genomes.ebi.ac.uk/vol1/ftp/20131219.populations.tsv 27 | * http://ftp.1000genomes.ebi.ac.uk/vol1/ftp/20131219.superpopulations.tsv 28 | 29 | Status: 30 | * complete, see script [sample-info-prep.R](./sample-info-prep.R) to see how the data was cleaned and transformed prior to the upload to BigQuery 31 | 32 | To load the script output via the [bq command line tool](https://cloud.google.com/bigquery/bq-command-line-tool#creatingtablefromfile), run: 33 | ``` 34 | bq load --project_id --source_format=CSV \ 35 | --skip_leading_rows=1 \ 36 | gs://genomics-public-data/1000-genomes/other/sample_info/sample_info.csv \ 37 | gs://genomics-public-data/1000-genomes/other/sample_info/sample_info.schema 38 | ``` 39 | 40 | ### pedigree table 41 | 42 | Description: 43 | * http://ftp.1000genomes.ebi.ac.uk/vol1/ftp/technical/working/20130606_sample_info/README_20130606_sample_info 44 | * [BigQuery 
table](https://bigquery.cloud.google.com/table/genomics-public-data:1000_genomes.pedigree?pli=1) 45 | 46 | Source: 47 | * http://ftp.1000genomes.ebi.ac.uk/vol1/ftp/technical/working/20130606_sample_info/20130606_g1k.ped 48 | 49 | Status: 50 | * complete, no cleaning or transformation needed 51 | 52 | To load the source file via the [bq command line tool](https://cloud.google.com/bigquery/bq-command-line-tool#creatingtablefromfile), download it to your local system and run: 53 | ``` 54 | bq load --project_id --source_format=CSV \ 55 | --field_delimiter=tab --skip_leading_rows=1 \ 56 | ./20130606_g1k.ped \ 57 | Family_ID:STRING,Individual_ID:STRING,Paternal_ID:STRING,Maternal_ID:STRING,Gender:INTEGER,Phenotype:INTEGER,Population:STRING,Relationship:STRING,Siblings:STRING,Second_Order:STRING,Third_Order:STRING,Other_Comments:STRING 58 | ``` 59 | -------------------------------------------------------------------------------- /pgp/sql/cgi_variants/allelic-frequency-chr1.sql: -------------------------------------------------------------------------------- 1 | # Compute allelic frequency for chromosome 1 by counting the number of called 2 | # alleles (reference-calls and variant-calls, but leave out no-calls) that 3 | # overlap each variant allele for which we previously counted its occurrence 4 | # in this dataset. This returns a large result which should be materialized to 5 | # a table. 6 | # 7 | # Note that the new BigQuery feature of user-defined javascript 8 | # functions is in limited preview.
# For more info on BigQuery user-defined JavaScript functions, see
# https://www.youtube.com/watch?v=GrD7ymUPt3M#t=1377
#
# Outer query: frequency = alternate allele count / number of called alleles
# overlapping the variant, rounded to 4 decimal places.
SELECT
  vars.chromosome AS chromosome,
  vars.reference AS reference,
  vars.locusBegin AS locusBegin,
  vars.locusEnd AS locusEnd,
  vars.allele AS allele,
  alternate_allele_count,
  num_alleles_called,
  ROUND(alternate_allele_count / num_alleles_called,
    4) AS freq,
FROM (
  # Per variant allele, total the called alleles of every call that overlaps it.
  SELECT
    vars.chromosome,
    vars.reference,
    vars.locusBegin,
    vars.locusEnd,
    vars.allele,
    alternate_allele_count,
    SUM(num_alleles_called) AS num_alleles_called,
  FROM (
    # The left hand side of our JOIN is all the calls, including
    # reference calls (but not no-calls)
    SELECT
      SUM(num_alleles_called) AS num_alleles_called,
      chromosome,
      reference,
      bin,
      locusBegin,
      locusEnd
    # This User-defined function helps us reduce the size of the cross product
    # considered by this JOIN thereby greatly speeding up the query.  It emits
    # each call once for every 5000-wide bin between locusBegin and locusEnd,
    # so the JOIN below can match on (chromosome, bin) before the interval
    # test is applied.  An allele sequence equal to '?' is not counted as called.
    FROM js(
      (SELECT chromosome, reference, locusBegin, locusEnd, allele1Seq, allele2Seq,
       FROM [google.com:biggene:pgp.cgi_variants]
       WHERE chromosome = 'chr1'),
      chromosome, reference, locusBegin, locusEnd, allele1Seq, allele2Seq,
      "[{name: 'num_alleles_called', type: 'integer'},
        {name: 'chromosome', type: 'string'},
        {name: 'reference', type: 'string'},
        {name: 'bin', type: 'integer'},
        {name: 'locusBegin', type: 'integer'},
        {name: 'locusEnd', type: 'integer'}]",
      "function(r, emit) {
         var num_alleles_called = 0;
         if('?' != r.allele1Seq) { num_alleles_called++; }
         if('?' != r.allele2Seq) { num_alleles_called++; }
         var binSize = 5000
         var startBin = Math.floor(r.locusBegin / binSize);
         var endBin = Math.floor(r.locusEnd / binSize);
         for(var bin = startBin; bin <= endBin; bin++) {
           emit({
             num_alleles_called: num_alleles_called,
             chromosome: r.chromosome,
             reference: r.reference,
             bin: bin,
             locusBegin: r.locusBegin,
             locusEnd: r.locusEnd,
           });
         }
       }")
    GROUP EACH BY
      chromosome,
      reference,
      bin,
      locusBegin,
      locusEnd
    ) AS all
  JOIN
    EACH
    # The right hand side of our JOIN are counts of alternate allele values at
    # a particular locus
    [google.com:biggene:pgp_analysis_results.cgi_variants_allele_counts] AS vars
  ON
    vars.chromosome = all.chromosome
    AND vars.bin = all.bin
  WHERE
    # Further constrain the JOIN to calls that overlapped the first base pair
    # of this variant
    all.locusBegin <= vars.locusBegin
    AND all.locusEnd >= vars.locusBegin+1
  GROUP EACH BY
    vars.chromosome,
    vars.reference,
    vars.locusBegin,
    vars.locusEnd,
    vars.allele,
    alternate_allele_count
  )
# Constants
# Name of the environment variable from which the input file path is read
# when no path is given on the command line.
INPUT_FILE_KEY = "map_input_file"
# The sample id is the "huXXXXXX" directory component of the input path.
SAMPLE_ID_PATTERN = "/(hu[A-F0-9]{6})/"
# Input path whose rows are not emitted (see main()).
DUPLICATE_GENOME = "gs://pgp-harvard-data-public/hu34D5B9/GS000012763-DID/GS000010327-ASM/GS01173-DNA_C07/ASM/masterVarBeta-GS000010327-ASM.tsv.bz2"


def main():
  """Entry point to the script.

  Reads CGI data from the file named by the first command line argument when
  one is given, otherwise from stdin, and writes every data row to stdout
  prefixed with the sample id and a tab.  Blank lines, header lines (starting
  with "#") and the column header line (starting with ">") are dropped.
  Nothing is emitted when the input path equals DUPLICATE_GENOME.

  The sample id is parsed from the input path (command line argument, or the
  map_input_file environment variable when reading stdin).  When no path is
  known or it does not contain a sample id, rows are prefixed with the text
  "None".
  """

  # Basic parsing of command line arguments to allow a filename
  # to be passed when running this code in the debugger.
  path = None
  file_handle = sys.stdin
  if 2 <= len(sys.argv):
    path = sys.argv[1]
    file_handle = open(path, "r")
  elif INPUT_FILE_KEY in os.environ:
    path = os.environ[INPUT_FILE_KEY]
    sys.stderr.write(str(path) + "\n")
    sys.stderr.write(str(os.environ) + "\n")

  sample_id = None
  if path is not None:
    match = re.search(SAMPLE_ID_PATTERN, path)
    if match:
      sample_id = match.group(1)

  # hu34D5B9 was sequenced twice, skip the older genome.  The input is still
  # read to the end in that case, exactly as before.
  skip_file = (DUPLICATE_GENOME == path)

  try:
    for line in iter(file_handle.readline, ""):
      line = line.rstrip("\n")
      if skip_file or not line:
        # Duplicate genome or blank line, skip it
        continue
      if line[0] in ("#", ">"):
        # Header line or column header line, skip it
        continue
      # Splitting on tabs and re-joining with tabs reproduces the line, so the
      # row is written through unchanged after the sample id column.
      sys.stdout.write("%s\t%s\n" % (sample_id, line))
  finally:
    # Close the file we opened ourselves; leave stdin alone.
    if file_handle is not sys.stdin:
      file_handle.close()


if __name__ == "__main__":
  main()
# http://www.statisticslectures.com/topics/ztestproportions/
# two-proportion z-test
# z-score critical value for p-value=5*10^-8 is +/-5.45131
#
# > qnorm(1 - ((5e-8)/2), lower.tail=T)
# [1] 5.45131
# > qnorm(1 - ((5e-8)/2), lower.tail=F)
# [1] -5.45131

SELECT
  reference_name,
  start,
  END,
  reference_bases,
  alternate_bases,
  vt,
  case_count,
  control_count,
  allele_count,
  ref_count,
  alt_count,
  case_ref_count,
  case_alt_count,
  control_ref_count,
  control_alt_count,
  # z = (p_case - p_control) / SQRT(p*(1-p)/n_case + p*(1-p)/n_control)
  # where p_case and p_control are the alternate allele proportions within
  # each group, p is the pooled alternate allele proportion, (1-p) is the
  # pooled reference allele proportion, and n_case / n_control are the numbers
  # of alleles counted in each group.  The result is rounded to 3 decimals.
  ROUND(
    (case_alt_count/case_count - control_alt_count/control_count)
    /
    SQRT(
      ((((case_alt_count+control_alt_count)/allele_count) *
        ((case_ref_count+control_ref_count)/allele_count))
        / case_count
      )
      +
      ((((case_alt_count+control_alt_count)/allele_count) *
        ((case_ref_count+control_ref_count)/allele_count))
        / control_count
      )
    )
    ,
    3)
  AS z_score
FROM (
  # Per variant, total the reference and alternate allele counts overall and
  # separately for the case and control groups.
  SELECT
    reference_name,
    start,
    end,
    reference_bases,
    alternate_bases,
    vt,
    SUM(ref_count + alt_count) AS allele_count,
    SUM(ref_count) AS ref_count,
    SUM(alt_count) AS alt_count,
    SUM(IF(TRUE = is_case, INTEGER(ref_count + alt_count), 0)) AS case_count,
    SUM(IF(FALSE = is_case, INTEGER(ref_count + alt_count), 0)) AS control_count,
    SUM(IF(TRUE = is_case, ref_count, 0)) AS case_ref_count,
    SUM(IF(TRUE = is_case, alt_count, 0)) AS case_alt_count,
    SUM(IF(FALSE = is_case, ref_count, 0)) AS control_ref_count,
    SUM(IF(FALSE = is_case, alt_count, 0)) AS control_alt_count,
  FROM (
    # One row per (variant, sample): the sample's reference and alternate
    # allele counts, flagged as a case when the sample's super population is
    # EAS and as a control otherwise.
    SELECT
      reference_name,
      start,
      ('EAS' = super_population) AS is_case,
      reference_bases,
      alternate_bases,
      END,
      vt,
      # 1000 genomes phase 1 data is bi-allelic so there is only ever a single alt
      SUM(0 = call.genotype) WITHIN RECORD AS ref_count,
      SUM(1 = call.genotype) WITHIN RECORD AS alt_count,
    FROM
      # Flatten the repeated call field so that each call can be joined to
      # its sample's row in sample_info; only chromosome 12 is considered.
      FLATTEN((
        SELECT
          reference_name,
          start,
          reference_bases,
          alternate_bases,
          END,
          vt,
          call.call_set_name,
          call.genotype,
        FROM
          [genomics-public-data:1000_genomes.variants]
        WHERE
          reference_name = '12'
        ),
        call) AS g
    JOIN
      [genomics-public-data:1000_genomes.sample_info] p
    ON
      g.call.call_set_name = p.sample
    )
  GROUP BY
    reference_name,
    start,
    end,
    reference_bases,
    alternate_bases,
    vt)
# Keep only variants whose z-score is at or beyond the critical value in
# either direction.
HAVING
  z_score >= 5.45131
  OR z_score <= -5.45131
ORDER BY
  z_score DESC,
  allele_count DESC
18 | setwd("/YOUR/PATH/TO/bigquery-examples/platinumGenomes") 19 | 20 | # Set the Google Cloud Platform project id under which these queries will run. 21 | project <- "YOUR-PROJECT-ID" 22 | ##################################################################### 23 | 24 | ### Install the bigrquery package. The currently released version 0.3.0 does not yet 25 | ### have the parameter to use Standard SQL instead of Legacy SQL, so we install from github. 26 | library(devtools) 27 | install_github('rstats-db/bigrquery') 28 | ``` 29 | 30 | ```{r init, echo=FALSE, message=FALSE, warning=FALSE, comment=NA} 31 | library(bigrquery) 32 | library(ggplot2) 33 | library(scales) 34 | library(dplyr) 35 | library(testthat) 36 | DisplayAndDispatchQuery <- function(queryUri) { 37 | querySql <- readChar(queryUri, nchars=1e6) 38 | cat(querySql) 39 | query_exec(querySql, project, use_legacy_sql = FALSE) 40 | } 41 | ``` 42 | 43 | ### SNP Annotation 44 | 45 | Let's annotate variants in the [Illumina Platinum Genomes dataset](http://googlegenomics.readthedocs.io/en/latest/use_cases/discover_public_data/platinum_genomes.html) 46 | using Tute Genomics' table of annotations for hg19 SNPs. Please see [Google Genomics Public Data](http://googlegenomics.readthedocs.io/en/latest/use_cases/discover_public_data/tute_genomics_public_data.html) 47 | for more detail about these annotations. 
48 | 49 | First we'll count variants by exonic functional impact: 50 | ```{r echo=FALSE, message=FALSE, warning=FALSE, comment=NA, cache=FALSE} 51 | result <- DisplayAndDispatchQuery("./sql/sample-snps-by-exonic-function.sql") 52 | ``` 53 | 54 | Results: 55 | ```{r echo=FALSE, message=FALSE, warning=FALSE, comment=NA, results="asis"} 56 | knitr::kable(head(result), digits=6) 57 | ``` 58 | 59 | Visualized: 60 | ```{r function, echo=FALSE, message=FALSE, warning=FALSE, comment=NA, fig.align="center", fig.width=12, fig.height=8} 61 | ggplot(result, aes(y=variant_count, x=ExonicFunc)) + 62 | geom_boxplot() + 63 | scale_y_log10(labels=comma) + 64 | ylab("Number of variants (log scale)") + 65 | xlab("Exonic Function") + 66 | ggtitle("Functional impact of Platinum Genomes SNPs") + 67 | theme(axis.text.x=element_text(angle=50, hjust=1)) 68 | ``` 69 | 70 | Next we'll identify rare variants across the cohort indicated as pathogenic 71 | by [ClinVar](https://www.ncbi.nlm.nih.gov/clinvar/): 72 | ```{r echo=FALSE, message=FALSE, warning=FALSE, comment=NA, cache=FALSE} 73 | result <- DisplayAndDispatchQuery("./sql/cohort-rare-pathenogenic-snps.sql") 74 | ``` 75 | 76 | Results: 77 | ```{r echo=FALSE, message=FALSE, warning=FALSE, comment=NA, results="asis"} 78 | knitr::kable(result, digits=6) 79 | ``` 80 | 81 | And finally we'll re-run this analysis using only the variants for one specific individual: 82 | ```{r echo=FALSE, message=FALSE, warning=FALSE, comment=NA, cache=FALSE} 83 | result <- DisplayAndDispatchQuery("./sql/sample-rare-pathenogenic-snps.sql") 84 | ``` 85 | 86 | Results: 87 | ```{r echo=FALSE, message=FALSE, warning=FALSE, comment=NA, results="asis"} 88 | knitr::kable(result, digits=6) 89 | ``` 90 | -------------------------------------------------------------------------------- /pgp/sql/gvcf_variants/allelic-frequency-chr1.sql: -------------------------------------------------------------------------------- 1 | # Compute allelic frequency for chromosome 1 by 
counting the number of called 2 | # alleles (reference-calls and variant-calls, but leave out no-calls) that 3 | # overlap each variant allele for which we previously counted its occurrence 4 | # in this dataset. This returns a large result which should be materialized to 5 | # a table. 6 | # 7 | # Note that the new BigQuery feature of user-defined javascript 8 | # functions is in limited preview. For more info, see 9 | # https://www.youtube.com/watch?v=GrD7ymUPt3M#t=1377 10 | SELECT 11 | vars.contig_name AS contig_name, 12 | vars.reference_bases AS reference_bases, 13 | vars.start_pos AS start_pos, 14 | vars.alternate_bases AS allele, 15 | alternate_allele_count, 16 | num_alleles_called, 17 | ROUND(alternate_allele_count / num_alleles_called, 18 | 4) AS freq, 19 | FROM ( 20 | SELECT 21 | vars.contig_name, 22 | vars.reference_bases, 23 | vars.start_pos, 24 | vars.alternate_bases, 25 | alternate_allele_count, 26 | SUM(num_alleles_called) AS num_alleles_called, 27 | FROM ( 28 | # The left hand side of our JOIN is all the calls, including 29 | # reference_bases calls and no-calls 30 | SELECT 31 | SUM(num_alleles_called) AS num_alleles_called, 32 | contig_name, 33 | reference_bases, 34 | bin, 35 | start_pos, 36 | the_end, 37 | # This User-defined function helps us reduce the size of the cross product 38 | # considered by this JOIN thereby greatly speeding up the query 39 | FROM js( 40 | (SELECT contig_name, reference_bases, start_pos, end_pos, END, call.genotype, 41 | FROM [google.com:biggene:pgp.gvcf_variants] 42 | WHERE contig_name = '1'), 43 | contig_name, reference_bases, start_pos, end_pos, END, call.genotype, 44 | "[{name: 'num_alleles_called', type: 'integer'}, 45 | {name: 'contig_name', type: 'string'}, 46 | {name: 'reference_bases', type: 'string'}, 47 | {name: 'bin', type: 'integer'}, 48 | {name: 'start_pos', type: 'integer'}, 49 | {name: 'the_end', type: 'integer'}]", 50 | "function(r, emit) { 51 | var num_alleles_called = 0; 52 | for(var c in r.call) { 
53 | for(var g in r.call[c].genotype) { 54 | if(0 <= r.call[c].genotype[g]) { 55 | num_alleles_called++; 56 | } 57 | } 58 | } 59 | var binSize = 5000 60 | var startBin = Math.floor(r.start_pos / binSize); 61 | var theEnd = (r.END === null) ? r.end_pos : r.END; 62 | var endBin = Math.floor(theEnd / binSize); 63 | for(var bin = startBin; bin <= endBin; bin++) { 64 | emit({ 65 | num_alleles_called: num_alleles_called, 66 | contig_name: r.contig_name, 67 | reference_bases: r.reference_bases, 68 | bin: bin, 69 | start_pos: r.start_pos, 70 | the_end: theEnd 71 | }); 72 | } 73 | }") 74 | GROUP EACH BY 75 | contig_name, 76 | reference_bases, 77 | bin, 78 | start_pos, 79 | the_end 80 | ) AS all 81 | JOIN 82 | EACH 83 | # The right hand side of our JOIN are counts of alternate allele values at 84 | # a particular locus 85 | [google.com:biggene:pgp_analysis_results.gvcf_variants_allele_counts] AS vars 86 | ON 87 | vars.contig_name = all.contig_name 88 | AND vars.bin = all.bin 89 | WHERE 90 | # Further constrain the JOIN to calls that overlapped the first base pair 91 | # of this variant 92 | all.start_pos <= vars.start_pos 93 | AND all.the_end >= vars.start_pos+1 94 | GROUP EACH BY 95 | vars.contig_name, 96 | vars.reference_bases, 97 | vars.start_pos, 98 | vars.alternate_bases, 99 | alternate_allele_count 100 | ) 101 | -------------------------------------------------------------------------------- /sgdp/provenance/wrangle-simons-sample-attributes.R: -------------------------------------------------------------------------------- 1 | # Copyright 2017 Google Inc. All rights reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 
5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | 15 | # Wrangle sample attributes for Simons Genome Diversity Project Data. See also: 16 | # https://www.simonsfoundation.org/life-sciences/simons-genome-diversity-project-dataset/ 17 | # http://googlegenomics.readthedocs.io/en/latest/use_cases/discover_public_data/simons_foundation.html 18 | 19 | library(testthat) 20 | library(XML) 21 | library(reshape2) 22 | library(dplyr) 23 | library(stringr) 24 | 25 | study <- read.delim("https://www.ebi.ac.uk/ena/data/warehouse/filereport?accession=PRJEB9586&result=read_run&fields=study_accession,sample_accession,secondary_sample_accession,experiment_accession,run_accession,tax_id,scientific_name,instrument_model,library_layout,fastq_ftp,fastq_galaxy,submitted_ftp,submitted_galaxy,sra_ftp,sra_galaxy,cram_index_ftp,cram_index_galaxy&download=txt", 26 | header=T) 27 | 28 | accessions <- unique(study$sample_accession) 29 | expect_that(length(accessions), equals(279)) 30 | 31 | simons_attributes <- function(x) { 32 | raw <- xmlParse(paste0("http://www.ebi.ac.uk/ena/data/view/", x, "%26display%3Dxml")) 33 | parsed <- xpathApply(raw, "//ROOT/SAMPLE/SAMPLE_ATTRIBUTES", xmlToDataFrame) 34 | long <- mutate(parsed[[1]], 35 | era_id=x) 36 | } 37 | 38 | all <- do.call(rbind, lapply(accessions, simons_attributes)) 39 | expect_that(ncol(all), equals(3)) 40 | 41 | # Fix attribute name "Sex" versus "sex" by lower casing all attribute names. 42 | all$TAG = tolower(all$TAG) 43 | 44 | # Fix attribute name "geographic location (country and/or sea)" versus "country". 
45 | all$TAG = gsub("geographic location (country and/or sea)", 46 | "country", 47 | all$TAG, 48 | fixed=TRUE) 49 | 50 | # Reshape long attribute list into wide format. 51 | wide <- reshape(all, idvar = "era_id", timevar="TAG", direction = "wide") 52 | expect_that(nrow(wide), equals(279)) 53 | 54 | # Tidy up the column names. 55 | colnames(wide) <- gsub("VALUE.", "", colnames(wide)) 56 | colnames(wide) <- gsub("-", "_", colnames(wide)) 57 | colnames(wide) <- gsub(" ", "_", colnames(wide)) 58 | 59 | # In two cases the library name instead of the Illumina ID is what ended up in the VCF file. 60 | # SS6004478 == LP6005442-DNA_A09 per http://www.ebi.ac.uk/ena/data/view/SAMEA3302719 61 | # SS6004477 == LP6005442-DNA_B09 per http://www.ebi.ac.uk/ena/data/view/SAMEA3302681 62 | 63 | # There is one final case where the Illumina ID does not match the id in any VCF, so by 64 | # process of elimination, remapping that one too. 65 | # LP6005443-DNA_C01 == LP6005441-DNA_A09 per process of elimination 66 | 67 | # Add a new column holding repaired values. 
68 | wide_remapped = mutate(wide, 69 | id_from_vcf=illumina_id) 70 | wide_remapped$id_from_vcf = gsub("LP6005442-DNA_A09", 71 | "SS6004478", 72 | wide_remapped$id_from_vcf, 73 | fixed=TRUE) 74 | wide_remapped$id_from_vcf = gsub("LP6005442-DNA_B09", 75 | "SS6004477", 76 | wide_remapped$id_from_vcf, 77 | fixed=TRUE) 78 | wide_remapped$id_from_vcf = gsub("LP6005441-DNA_A09", 79 | "LP6005443-DNA_C01", 80 | wide_remapped$id_from_vcf, 81 | fixed=TRUE) 82 | 83 | write.csv(wide_remapped, "simons-sample-attributes.csv", row.names=FALSE, na="") 84 | 85 | # Then load the resulting file to BigQuery via: 86 | # bq load --autodetect THE_DATASET.THE_TABLE simons-sample-attributes.csv 87 | -------------------------------------------------------------------------------- /1000genomes/sql/gwas-pattern-chi-squared-test.sql: -------------------------------------------------------------------------------- 1 | # An example of a pattern one might use for GWAS queries upon 1,000 2 | # Genomes variants. It is specifically examining differences in allelic 3 | # frequency for variants upon chromosome 12 between the EAS super 4 | # population versus all other individuals, returning a ranked list of 5 | # variants by decreasing variation between groups. Note that this 6 | # particular query below is naive in many, many respects and is merely 7 | # meant as an over-simplified example that might help domain experts 8 | # translate their scientifically correct data filtering and 9 | # statistical methods to BigQuery. Feedback to improve this query is 10 | # most welcome! 
11 | 12 | # http://www.statisticslectures.com/topics/goodnessoffit/ 13 | # http://homes.cs.washington.edu/~suinlee/genome560/lecture7.pdf 14 | # http://bioinformatics.ca/files/Statistics/Statistics_Day2-Module8.pdf 15 | # Chi-squared critical value for df=1, p-value=5*10^-8 is 29.71679 16 | # > qchisq(1 - 5e-08, df=1) 17 | # [1] 29.71679 18 | 19 | SELECT 20 | reference_name, 21 | start, 22 | end, 23 | reference_bases, 24 | alternate_bases, 25 | vt, 26 | case_count, 27 | control_count, 28 | allele_count, 29 | ref_count, 30 | alt_count, 31 | case_ref_count, 32 | case_alt_count, 33 | control_ref_count, 34 | control_alt_count, 35 | # https://en.wikipedia.org/wiki/Yates%27s_correction_for_continuity 36 | ROUND( 37 | POW(ABS(case_ref_count - (ref_count/allele_count)*case_count) - 0.5, 38 | 2)/((ref_count/allele_count)*case_count) + 39 | POW(ABS(control_ref_count - (ref_count/allele_count)*control_count) - 0.5, 40 | 2)/((ref_count/allele_count)*control_count) + 41 | POW(ABS(case_alt_count - (alt_count/allele_count)*case_count) - 0.5, 42 | 2)/((alt_count/allele_count)*case_count) + 43 | POW(ABS(control_alt_count - (alt_count/allele_count)*control_count) - 0.5, 44 | 2)/((alt_count/allele_count)*control_count), 45 | 3) AS chi_squared_score 46 | FROM ( 47 | SELECT 48 | reference_name, 49 | start, 50 | end, 51 | reference_bases, 52 | alternate_bases, 53 | vt, 54 | SUM(ref_count + alt_count) AS allele_count, 55 | SUM(ref_count) AS ref_count, 56 | SUM(alt_count) AS alt_count, 57 | SUM(IF(TRUE = is_case, INTEGER(ref_count + alt_count), 0)) AS case_count, 58 | SUM(IF(FALSE = is_case, INTEGER(ref_count + alt_count), 0)) AS control_count, 59 | SUM(IF(TRUE = is_case, ref_count, 0)) AS case_ref_count, 60 | SUM(IF(TRUE = is_case, alt_count, 0)) AS case_alt_count, 61 | SUM(IF(FALSE = is_case, ref_count, 0)) AS control_ref_count, 62 | SUM(IF(FALSE = is_case, alt_count, 0)) AS control_alt_count, 63 | FROM ( 64 | SELECT 65 | reference_name, 66 | start, 67 | ('EAS' = super_population) AS 
is_case, 68 | reference_bases, 69 | alternate_bases, 70 | END, 71 | vt, 72 | # 1000 genomes phase 1 data is bi-allelic so there is only ever a single alt 73 | SUM(0 = call.genotype) WITHIN RECORD AS ref_count, 74 | SUM(1 = call.genotype) WITHIN RECORD AS alt_count, 75 | FROM 76 | FLATTEN(( 77 | SELECT 78 | reference_name, 79 | start, 80 | reference_bases, 81 | alternate_bases, 82 | END, 83 | vt, 84 | call.call_set_name, 85 | call.genotype, 86 | FROM 87 | [genomics-public-data:1000_genomes.variants] 88 | WHERE 89 | reference_name = '12' 90 | ), 91 | call) AS g 92 | JOIN 93 | [genomics-public-data:1000_genomes.sample_info] p 94 | ON 95 | g.call.call_set_name = p.sample 96 | ) 97 | GROUP BY 98 | reference_name, 99 | start, 100 | end, 101 | reference_bases, 102 | alternate_bases, 103 | vt) 104 | WHERE 105 | # For chi-squared, expected counts must be at least 5 for each group 106 | (ref_count/allele_count)*case_count >= 5.0 107 | AND (ref_count/allele_count)*control_count >= 5.0 108 | AND (alt_count/allele_count)*case_count >= 5.0 109 | AND (alt_count/allele_count)*control_count >= 5.0 110 | HAVING 111 | # Chi-squared critical value for df=1, p-value=5*10^-8 is 29.71679 112 | chi_squared_score >= 29.71679 113 | ORDER BY 114 | chi_squared_score DESC, 115 | allele_count DESC 116 | -------------------------------------------------------------------------------- /1000genomes/sql/hardy-weinberg-equilibrium.sql: -------------------------------------------------------------------------------- 1 | # An example of a pattern one might use for Hardy-Weinberg Equilibrium 2 | # queries upon 1,000 Genomes variants. It is specifically computing 3 | # the Hardy-Weinberg Equilibrium for the variants found in BRCA1 and 4 | # then computing the chi-squared score for the observed versus 5 | # expected counts for the calls. 
6 | 7 | # http://scienceprimer.com/hardy-weinberg-equilibrium-calculator 8 | # http://www.nfstc.org/pdi/Subject07/pdi_s07_m01_02.htm 9 | # http://www.nfstc.org/pdi/Subject07/pdi_s07_m01_02.p.htm 10 | 11 | SELECT 12 | reference_name, 13 | start, 14 | END, 15 | reference_bases, 16 | alt, 17 | vt, 18 | ROUND(POW(hom_ref_count - expected_hom_ref_count, 19 | 2)/expected_hom_ref_count + 20 | POW(hom_alt_count - expected_hom_alt_count, 21 | 2)/expected_hom_alt_count + 22 | POW(het_count - expected_het_count, 23 | 2)/expected_het_count, 24 | 3) AS chi_squared_score, 25 | total_count, 26 | hom_ref_count, 27 | ROUND(expected_hom_ref_count, 28 | 2) AS expected_hom_ref_count, 29 | het_count, 30 | ROUND(expected_het_count, 31 | 2) AS expected_het_count, 32 | hom_alt_count, 33 | ROUND(expected_hom_alt_count, 34 | 2) AS expected_hom_alt_count, 35 | ROUND(alt_freq, 36 | 4) AS alt_freq, 37 | alt_freq_from_1KG, 38 | FROM ( 39 | SELECT 40 | reference_name, 41 | start, 42 | END, 43 | reference_bases, 44 | alt, 45 | vt, 46 | alt_freq_from_1KG, 47 | hom_ref_freq + (.5 * het_freq) AS hw_ref_freq, 48 | 1 - (hom_ref_freq + (.5 * het_freq)) AS alt_freq, 49 | POW(hom_ref_freq + (.5 * het_freq), 50 | 2) * total_count AS expected_hom_ref_count, 51 | POW(1 - (hom_ref_freq + (.5 * het_freq)), 52 | 2) * total_count AS expected_hom_alt_count, 53 | 2 * (hom_ref_freq + (.5 * het_freq)) 54 | * (1 - (hom_ref_freq + (.5 * het_freq))) 55 | * total_count AS expected_het_count, 56 | total_count, 57 | hom_ref_count, 58 | het_count, 59 | hom_alt_count, 60 | hom_ref_freq, 61 | het_freq, 62 | hom_alt_freq, 63 | FROM ( 64 | SELECT 65 | reference_name, 66 | start, 67 | END, 68 | reference_bases, 69 | alt, 70 | vt, 71 | alt_freq_from_1KG, 72 | # 1000 genomes data is bi-allelic so there is only ever a single alt 73 | # We also exclude calls where one or both alleles were not called (-1) 74 | SUM((0 = first_allele 75 | OR 1 = first_allele) 76 | AND (0 = second_allele 77 | OR 1 = second_allele)) WITHIN RECORD AS 
total_count, 78 | SUM(0 = first_allele 79 | AND 0 = second_allele) WITHIN RECORD AS hom_ref_count, 80 | SUM((0 = first_allele 81 | AND 1 = second_allele) 82 | OR (1 = first_allele 83 | AND 0 = second_allele)) WITHIN RECORD AS het_count, 84 | SUM(1 = first_allele 85 | AND 1 = second_allele) WITHIN RECORD AS hom_alt_count, 86 | SUM(0 = first_allele 87 | AND 0 = second_allele) / SUM((0 = first_allele 88 | OR 1 = first_allele) 89 | AND (0 = second_allele 90 | OR 1 = second_allele)) WITHIN RECORD AS hom_ref_freq, 91 | SUM((0 = first_allele 92 | AND 1 = second_allele) 93 | OR (1 = first_allele 94 | AND 0 = second_allele)) / SUM((0 = first_allele 95 | OR 1 = first_allele) 96 | AND (0 = second_allele 97 | OR 1 = second_allele)) WITHIN RECORD AS het_freq, 98 | SUM(1 = first_allele 99 | AND 1 = second_allele) / SUM((0 = first_allele 100 | OR 1 = first_allele) 101 | AND (0 = second_allele 102 | OR 1 = second_allele)) WITHIN RECORD AS hom_alt_freq, 103 | FROM ( 104 | SELECT 105 | reference_name, 106 | start, 107 | END, 108 | reference_bases, 109 | GROUP_CONCAT(alternate_bases) WITHIN RECORD AS alt, 110 | vt, 111 | # Also return the pre-computed allelic frequency to help us check our work 112 | af AS alt_freq_from_1KG, 113 | NTH(1, 114 | call.genotype) WITHIN call AS first_allele, 115 | NTH(2, 116 | call.genotype) WITHIN call AS second_allele, 117 | FROM 118 | [genomics-public-data:1000_genomes.variants] 119 | WHERE 120 | reference_name = '17' 121 | AND start BETWEEN 41196311 122 | AND 41277499 123 | ))) 124 | ORDER BY 125 | reference_name, 126 | start 127 | -------------------------------------------------------------------------------- /1000genomes/data-stories/reproducing-hardy-weinberg-equilibrium/README.Rmd: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | 10 | 11 | 12 | 13 | 14 | 15 | 16 | 17 | Reproducing the Hardy-Weinberg Equilibrium test for BRCA1 in 1,000 Genomes 18 | 
======================================================== 19 | 20 | Provenance for the expected result 21 | --------------------------- 22 | First get a slice of the VCF containing just the variants within BRCA1: 23 | ``` 24 | vcftools --gzvcf ALL.chr17.phase1_release_v3.20101123.snps_indels_svs.genotypes.vcf.gz --chr 17 --from-bp 41196312 --to-bp 41277500 --out brca1 --recode-INFO-all --recode 25 | 26 | VCFtools - v0.1.11 27 | (C) Adam Auton 2009 28 | 29 | Parameters as interpreted: 30 | --gzvcf ALL.chr17.phase1_release_v3.20101123.snps_indels_svs.genotypes.vcf.gz 31 | --chr 17 32 | --to-bp 41277500 33 | --recode-INFO-all 34 | --out brca1 35 | --recode 36 | --from-bp 41196312 37 | 38 | Using zlib version: 1.2.3.4 39 | Versions of zlib >= 1.2.4 will be *much* faster when reading zipped VCF files. 40 | Reading Index file. 41 | File contains 1046733 entries and 1092 individuals. 42 | Filtering by chromosome. 43 | Chromosome: 17 44 | Keeping 1046733 entries on specified chromosomes. 45 | Applying Required Filters. 46 | Filtering sites by chromosome and/or position 47 | After filtering, kept 1092 out of 1092 Individuals 48 | After filtering, kept 879 out of a possible 1046733 Sites 49 | Outputting VCF file... Done 50 | Run Time = 200.00 seconds 51 | ``` 52 | Then run vcftools: 53 | ``` 54 | vcftools --vcf brca1.recode.vcf --hardy 55 | ``` 56 | Producing output file: [out.hwe](./vcftools-output/out.hwe) 57 | 58 | See [details](http://vcftools.sourceforge.net/man_latest.html#OUTPUT%20OPTIONS) about the --hardy option for vcftools for more detail about the calculation. 59 | 60 | Reproducing the result via BigQuery 61 | ------------------------------------ 62 | [BRCA1](http://www.genecards.org/cgi-bin/carddisp.pl?gene=BRCA1) resides on chromosome 17 from position 41196312 to 41277500. 
63 | 64 | ```{r init, echo=FALSE, message=FALSE, warning=FALSE, comment=NA} 65 | require(bigrquery) 66 | require(ggplot2) 67 | require(dplyr) 68 | require(xtable) 69 | require(testthat) 70 | project <- "google.com:biggene" # put your projectID here 71 | DisplayAndDispatchQuery <- function(queryUri) { 72 | querySql <- readChar(queryUri, nchars=1e6) 73 | cat(querySql) 74 | query_exec(querySql, project) 75 | } 76 | ``` 77 | 78 | Let’s compute the Hardy-Weinberg Equilibrium test for each variant within BRCA1: 79 | ```{r echo=FALSE, message=FALSE, warning=FALSE, comment=NA} 80 | result <- DisplayAndDispatchQuery("../../sql/hardy-weinberg-equilibrium.sql") 81 | ``` 82 | Number of rows returned by this query: `r nrow(result)`. 83 | 84 | Displaying the first few rows of our result: 85 | ```{r echo=FALSE, message=FALSE, warning=FALSE, comment=NA, results="asis"} 86 | print(head(xtable(result), 10), type="html", include.rownames=F) 87 | ``` 88 | and the last few rows: 89 | ```{r echo=FALSE, message=FALSE, warning=FALSE, comment=NA, results="asis"} 90 | print(tail(xtable(result), 10), type="html", include.rownames=F) 91 | ``` 92 | 93 | And do our results match the precomputed values resident in the AF INFO field? 94 | ```{r} 95 | print(expect_equal(object=result$alt_freq, 96 | expected=result$alt_freq_from_1KG, 97 | tolerance=0.005, 98 | scale=1)) 99 | ``` 100 | We can see from the results that when the computed frequency values in column alt_freq are rounded, they exactly match the alternate allele frequencies as reported in the AF INFO field from the 1,000 Genomes VCF data. 101 | 102 | Most importantly, comparing these to the results in [out.hwe](./vcftools-output/out.hwe) from vcftools we see that the test scores match. 
103 | -------------------------------------------------------------------------------- /pgp/sql/schema-comparisons/missingness-udf.sql: -------------------------------------------------------------------------------- 1 | # Missingness rate summarized per chromosome. To see it per variant, materialize 2 | # the large result from the inner query to a table. 3 | # 4 | # Note that the new BigQuery feature of user-defined javascript 5 | # functions is in limited preview. For more info, see 6 | # https://www.youtube.com/watch?v=GrD7ymUPt3M#t=1377 7 | SELECT 8 | contig_name, 9 | MIN(missingness_rate) AS min_missingness, 10 | AVG(missingness_rate) AS avg_missingness, 11 | MAX(missingness_rate) AS max_missingness, 12 | STDDEV(missingness_rate) AS stddev_missingness, 13 | FROM ( 14 | SELECT 15 | vars.contig_name AS contig_name, 16 | vars.start_pos AS start_pos, 17 | reference_bases, 18 | variant_called_count, 19 | SUM(refs.called_count) AS reference_called_count, 20 | variant_called_count + SUM(refs.called_count) AS num_alleles_called_for_position, 21 | 1 - ((variant_called_count + SUM(refs.called_count))/(172*2)) AS missingness_rate 22 | FROM ( 23 | # JOIN our variant sample counts with the corresponding reference-matching blocks 24 | SELECT 25 | vars.contig_name, 26 | vars.start_pos, 27 | refs.start_pos, 28 | vars.end_pos, 29 | refs.the_end, 30 | reference_bases, 31 | variant_called_count, 32 | refs.called_count 33 | FROM js( 34 | # Constrain the left hand side of the JOIN to reference-matching blocks 35 | (SELECT 36 | contig_name, 37 | start_pos, 38 | END, 39 | IF(alternate_bases IS NULL, 40 | FALSE, 41 | TRUE) AS is_variant_call, 42 | SUM(call.genotype >= 0) WITHIN RECORD AS called_count, 43 | FROM 44 | [google.com:biggene:pgp.gvcf_variants] 45 | HAVING 46 | is_variant_call = FALSE), 47 | contig_name, start_pos, END, called_count, 48 | # This User-defined function helps us reduce the size of the cross product 49 | # considered by this JOIN thereby greatly speeding up 
the query. 50 | "[{name: 'contig_name', type: 'string'}, 51 | {name: 'start_pos', type: 'integer'}, 52 | {name: 'the_end', type: 'integer'}, 53 | {name: 'bin', type: 'integer'}, 54 | {name: 'called_count', type: 'integer'}]", 55 | "function(r, emit) { 56 | var binSize = 5000 57 | var startBin = Math.floor(r.start_pos / binSize); 58 | var endBin = Math.floor(r.END / binSize); 59 | // Since a reference-matching block can span multiple bins, emit 60 | // a record for each bin. 61 | for(var bin = startBin; bin <= endBin; bin++) { 62 | emit({ 63 | contig_name: r.contig_name, 64 | start_pos: r.start_pos, 65 | the_end: r.END, 66 | bin: bin, 67 | called_count: r.called_count 68 | }); 69 | } 70 | }") AS refs 71 | JOIN EACH ( 72 | # Constrain the right hand side of the JOIN to variants 73 | # Group our variant sample counts together since a single SNP may be in more than 74 | # one row due to 1/2 genotypes 75 | SELECT 76 | contig_name, 77 | start_pos, 78 | end_pos, 79 | INTEGER(FLOOR(start_pos / 5000)) AS bin, 80 | reference_bases, 81 | SUM(called_count) AS variant_called_count, 82 | FROM ( 83 | # Limit the query to SNPs 84 | SELECT 85 | contig_name, 86 | start_pos, 87 | end_pos, 88 | reference_bases, 89 | LENGTH(reference_bases) AS ref_len, 90 | MIN(LENGTH(alternate_bases)) WITHIN RECORD AS alt_len, 91 | IF(alternate_bases IS NULL, 92 | FALSE, 93 | TRUE) AS is_variant_call, 94 | SUM(call.genotype >= 0) WITHIN RECORD AS called_count, 95 | FROM 96 | [google.com:biggene:pgp.gvcf_variants] 97 | HAVING 98 | ref_len = 1 99 | AND alt_len = 1 100 | AND is_variant_call) 101 | GROUP EACH BY 102 | contig_name, 103 | start_pos, 104 | end_pos, 105 | bin, 106 | reference_bases) AS vars 107 | # The JOIN criteria are complicated since we are trying to see if a SNP overlaps an interval 108 | ON 109 | vars.contig_name = refs.contig_name 110 | AND vars.bin = refs.bin 111 | WHERE 112 | refs.start_pos <= vars.start_pos 113 | AND refs.the_end >= vars.end_pos 114 | ) 115 | GROUP EACH BY 116 | 
contig_name, 117 | start_pos, 118 | reference_bases, 119 | variant_called_count 120 | ) 121 | GROUP BY 122 | contig_name 123 | ORDER BY 124 | contig_name -------------------------------------------------------------------------------- /pgp/sql/cgi_variants/allelic-frequency-brca1.sql: -------------------------------------------------------------------------------- 1 | # The following query computes the allelic frequency for BRCA1 variants in the 2 | # PGP dataset. 3 | # 4 | # Note that the new BigQuery feature of user-defined javascript 5 | # functions is in limited preview. For more info, see 6 | # https://www.youtube.com/watch?v=GrD7ymUPt3M#t=1377 7 | SELECT 8 | vars.chromosome AS chromosome, 9 | vars.reference AS reference, 10 | vars.locusBegin AS locusBegin, 11 | vars.locusEnd AS locusEnd, 12 | vars.allele AS allele, 13 | alternate_allele_count, 14 | num_samples_called, 15 | ROUND(alternate_allele_count / (2*num_samples_called), 16 | 4) AS freq, 17 | FROM ( 18 | SELECT 19 | vars.chromosome, 20 | vars.reference, 21 | vars.locusBegin, 22 | vars.locusEnd, 23 | vars.allele, 24 | alternate_allele_count, 25 | SUM(num_samples) AS num_samples_called 26 | FROM ( 27 | # The left hand side of our JOIN are counts of alternate allele values at 28 | # a particular locus 29 | SELECT 30 | chromosome, 31 | reference, 32 | INTEGER(FLOOR(locusBegin / 5000)) AS bin, 33 | locusBegin, 34 | locusEnd, 35 | allele, 36 | SUM(cnt) AS alternate_allele_count, 37 | FROM ( 38 | SELECT 39 | chromosome, 40 | reference, 41 | locusBegin, 42 | locusEnd, 43 | allele1Seq AS allele, 44 | COUNT(1) AS cnt 45 | FROM 46 | [google.com:biggene:pgp.cgi_variants] 47 | WHERE 48 | chromosome = 'chr17' 49 | AND locusBegin BETWEEN 41196311 50 | AND 41277499 51 | AND (reference != '=' OR reference IS NULL) 52 | AND allele1Seq != '?' 
53 | AND (reference != allele1Seq OR reference IS NULL) 54 | GROUP BY 55 | chromosome, 56 | reference, 57 | locusBegin, 58 | locusEnd, 59 | allele), 60 | ( 61 | SELECT 62 | chromosome, 63 | reference, 64 | locusBegin, 65 | locusEnd, 66 | allele2Seq AS allele, 67 | COUNT(1) AS cnt 68 | FROM 69 | [google.com:biggene:pgp.cgi_variants] 70 | WHERE 71 | chromosome = 'chr17' 72 | AND locusBegin BETWEEN 41196311 73 | AND 41277499 74 | AND (reference != '=' OR reference IS NULL) 75 | AND allele2Seq != '?' 76 | AND (reference != allele2Seq OR reference IS NULL) 77 | GROUP BY 78 | chromosome, 79 | reference, 80 | locusBegin, 81 | locusEnd, 82 | allele) 83 | GROUP BY 84 | chromosome, 85 | reference, 86 | bin, 87 | locusBegin, 88 | locusEnd, 89 | allele) AS vars 90 | JOIN 91 | EACH ( 92 | # The right hand side of our JOIN is all the calls, including 93 | # reference calls and no-calls 94 | SELECT 95 | num_samples, 96 | chromosome, 97 | bin, 98 | locusBegin, 99 | locusEnd 100 | FROM ( 101 | SELECT 102 | COUNT(sample_id) AS num_samples, 103 | chromosome, 104 | reference, 105 | bin, 106 | locusBegin, 107 | locusEnd 108 | # This User-defined function helps us reduce the size of the cross product 109 | # considered by this JOIN thereby greatly speeding up the query 110 | FROM js( 111 | (SELECT sample_id, chromosome, reference, locusBegin, locusEnd, 112 | FROM [google.com:biggene:pgp.cgi_variants] 113 | WHERE chromosome = 'chr17'), 114 | sample_id, chromosome, reference, locusBegin, locusEnd, 115 | "[{name: 'sample_id', type: 'string'}, 116 | {name: 'chromosome', type: 'string'}, 117 | {name: 'reference', type: 'string'}, 118 | {name: 'bin', type: 'integer'}, 119 | {name: 'locusBegin', type: 'integer'}, 120 | {name: 'locusEnd', type: 'integer'}]", 121 | "function(r, emit) { 122 | var binSize = 5000 123 | var startBin = Math.floor(r.locusBegin / binSize); 124 | var endBin = Math.floor(r.locusEnd / binSize); 125 | for(var bin = startBin; bin <= endBin; bin++) { 126 | emit({ 127 | 
sample_id: r.sample_id, 128 | chromosome: r.chromosome, 129 | reference: r.reference, 130 | bin: bin, 131 | locusBegin: r.locusBegin, 132 | locusEnd: r.locusEnd, 133 | }); 134 | } 135 | }") 136 | GROUP EACH BY 137 | chromosome, 138 | reference, 139 | bin, 140 | locusBegin, 141 | locusEnd 142 | )) AS all 143 | ON 144 | vars.chromosome = all.chromosome 145 | AND vars.bin = all.bin 146 | WHERE 147 | # Further constrain the JOIN to calls that overlapped the first base pair 148 | # of this variant 149 | all.locusBegin <= vars.locusBegin 150 | AND all.locusEnd >= vars.locusBegin+1 151 | GROUP BY 152 | vars.chromosome, 153 | vars.reference, 154 | vars.locusBegin, 155 | vars.locusEnd, 156 | vars.allele, 157 | alternate_allele_count 158 | ) 159 | ORDER BY 160 | chromosome, 161 | locusBegin 162 | -------------------------------------------------------------------------------- /pgp/provenance/cgi-ref-blocks-mapper.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | 3 | # Copyright 2014 Google Inc. All Rights Reserved. 4 | # 5 | # Licensed under the Apache License, Version 2.0 (the "License"); 6 | # you may not use this file except in compliance with the License. 7 | # You may obtain a copy of the License at 8 | # 9 | # http://www.apache.org/licenses/LICENSE-2.0 10 | # 11 | # Unless required by applicable law or agreed to in writing, software 12 | # distributed under the License is distributed on an "AS IS" BASIS, 13 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | # See the License for the specific language governing permissions and 15 | # limitations under the License. 16 | 17 | """Add extract reference-matching records from CGI data and conver to VCF. 
18 | 19 | Assumptions: 20 | - one sample per input file 21 | 22 | 23 | This script can be run standalone: 24 | cat masterVarBeta-GS000016446-ASM.tsv | ./cgi-ref-blocks-mapper.py 25 | 26 | Or via the debugger: 27 | python -mpdb ./cgi-ref-blocks-mapper.py masterVarBeta-GS000016446-ASM.tsv 28 | 29 | To have the sample id correctly parsed when input is from stdin, set the 30 | environment variable that Hadoop would set: 31 | export map_input_file=./huDBF9DD/masterVarBeta-GS000016446-ASM.tsv.bz2 32 | bzcat ./huDBF9DD/masterVarBeta-GS000016446-ASM.tsv.bz2 | ./cgi-ref-blocks-mapper.py 33 | 34 | To have the sample id correctly parsed when input is from a file, ensure that it 35 | is in the file path: 36 | python -mpdb ./cgi-ref-blocks-mapper.py ./huDBF9DD/masterVarBeta-GS000016446-ASM.tsv 37 | 38 | It should be run as a mapper-only Hadoop Streaming job: 39 | hadoop jar /path/to/your/hadoop-streaming-*.jar \ 40 | -libjars /home/deflaux/custom.jar \ 41 | -outputformat com.custom.CustomMultiOutputFormat \ 42 | -mapper cgi-ref-blocks-mapper.py -file cgi-ref-blocks-mapper.py \ 43 | --numReduceTasks 0 -input inputpath -output outputpath 44 | 45 | Notice that there is a special output format to put the VCF header 46 | back into the output files including the specific sample id.
def main():
  """Entry point to the script.

  Reads one CGI masterVar file (tab separated) and writes one VCF-style
  reference-block record to stdout for every row in which the reference and
  both alleles are recorded as "=" (the second allele may also be empty).

  Input selection:
    - if a command line argument is given, it is the path of the file to
      read (handy in the debugger);
    - otherwise stdin is read and, when the environment variable named by
      INPUT_FILE_KEY is set, its value is used as the path and is echoed to
      stderr.

  The sample id is the first match of SAMPLE_ID_PATTERN in that path.  When
  no path is known, or it does not match, the sample id stays None and the
  emitted key is the literal text "None".

  Output: lines of the form "<sample_id>\\t<VCF record>".  Nothing is emitted
  for the genome at DUPLICATE_GENOME or for samples in MKVCF_FAILED_GENOMES.

  Raises:
    IOError: if the file named on the command line cannot be opened.
    IndexError, ValueError: if a data row has fewer columns than expected or
      non-integer locus fields (unchanged from the previous behavior).
  """
  sample_id = None
  sample_id_re = re.compile(SAMPLE_ID_PATTERN)

  # Basic parsing of command line arguments to allow a filename
  # to be passed when running this code in the debugger.
  path = None
  file_handle = sys.stdin
  opened_here = False
  if 2 <= len(sys.argv):
    path = sys.argv[1]
    file_handle = open(path, "r")
    opened_here = True
  elif INPUT_FILE_KEY in os.environ:
    path = os.environ[INPUT_FILE_KEY]
    # Same bytes as the former `print >> sys.stderr, path`, but valid on
    # both Python 2 and Python 3.
    sys.stderr.write("%s\n" % path)

  try:
    if path is not None:
      match = sample_id_re.search(path)
      if match:
        sample_id = match.group(1)

    # Neither test depends on the current line, so decide once up front:
    #  - hu34D5B9 was sequenced twice, skip the older genome;
    #  - don't bother extracting ref-matching blocks for the genomes for
    #    which we were unable to run cgatools mkvcf.
    skip_genome = (DUPLICATE_GENOME == path
                   or sample_id in MKVCF_FAILED_GENOMES)

    # readline() rather than file iteration keeps the original line-at-a-time
    # reading.  The input is still read to the end for skipped genomes, as
    # before -- presumably so the streaming framework never sees its pipe
    # closed early; confirm before turning this into an early return.
    for line in iter(file_handle.readline, ""):
      if skip_genome:
        continue

      line = line.rstrip("\n")
      if not line or line[0] in ("#", ">"):
        # Blank line, "#" header line or ">" column header line: skip it.
        continue

      fields = line.split("\t")
      if ("=" == fields[REFERENCE] and "=" == fields[ALLELE1SEQ]
          and ("=" == fields[ALLELE2SEQ] or "" == fields[ALLELE2SEQ])):
        # This is a reference-matching record, emit it.
        contig = fields[CHROMOSOME].replace("chr", "", 1)
        # The VCF start is the masterVar begin plus one; the end is copied
        # unchanged.
        start_pos = int(fields[LOCUS_BEGIN]) + 1
        end = int(fields[LOCUS_END])
        # The key is used by the custom output format to put the
        # resulting files in a subdirectory specific to the sample
        # and also as part of one of the VCF header lines.
        key = sample_id
        value = "%s\t%d\t.\tN\t.\t.\t.\tNS=1;AN=0;END=%d\tGT:PS\t0/0:." % (
            contig, start_pos, end)
        sys.stdout.write("%s\t%s\n" % (key, value))
  finally:
    # Only close what this function opened; stdin belongs to the caller.
    if opened_here:
      file_handle.close()


if __name__ == "__main__":
  main()
'bin', type: 'integer'}, 45 | {name: 'start_pos', type: 'integer'}, 46 | {name: 'the_end', type: 'integer'}]", 47 | "function(r, emit) { 48 | var num_alleles_called = 0; 49 | for(var c in r.call) { 50 | for(var g in r.call[c].genotype) { 51 | if(0 <= r.call[c].genotype[g]) { 52 | num_alleles_called++; 53 | } 54 | } 55 | } 56 | var binSize = 5000 57 | var startBin = Math.floor(r.start_pos / binSize); 58 | var theEnd = (r.END === null) ? r.end_pos : r.END; 59 | var endBin = Math.floor(theEnd / binSize); 60 | for(var bin = startBin; bin <= endBin; bin++) { 61 | emit({ 62 | num_alleles_called: num_alleles_called, 63 | contig_name: r.contig_name, 64 | reference_bases: r.reference_bases, 65 | bin: bin, 66 | start_pos: r.start_pos, 67 | the_end: theEnd 68 | }); 69 | } 70 | }") 71 | GROUP EACH BY 72 | contig_name, 73 | reference_bases, 74 | bin, 75 | start_pos, 76 | the_end 77 | ) AS all 78 | JOIN 79 | EACH 80 | # The right hand side of our JOIN are counts of alternate allele values at 81 | # a particular locus 82 | (SELECT 83 | contig_name, 84 | start_pos, 85 | # This 'bin' can be use in subsequent interval JOINs 86 | INTEGER(FLOOR(start_pos / 5000)) AS bin, 87 | reference_bases, 88 | alternate_bases, 89 | SUM(alternate_allele_count) AS alternate_allele_count, 90 | FROM ( 91 | SELECT contig_name, start_pos, reference_bases, alternate_bases, alt_count 92 | FROM js( 93 | (SELECT contig_name, start_pos, reference_bases, alternate_bases, call.genotype, 94 | FROM [google.com:biggene:pgp.gvcf_variants] 95 | WHERE contig_name = '17'), 96 | contig_name, start_pos, reference_bases, alternate_bases, call.genotype, 97 | "[{name: 'contig_name', type: 'string'}, 98 | {name: 'start_pos', type: 'integer'}, 99 | {name: 'reference_bases', type: 'string'}, 100 | {name: 'alternate_bases', type: 'string'}, 101 | {name: 'alternate_allele_count', type: 'integer'}]", 102 | "function(r, emit) { 103 | for(var a in r.alternate_bases) { 104 | var alt_gt = a + 1; 105 | var alt_count = 0; 106 | 
for(var c in r.call) { 107 | for(var g in r.call[c].genotype) { 108 | if(alt_gt == r.call[c].genotype[g]) { 109 | alt_count++; 110 | } 111 | } 112 | } 113 | // Emit one record per alt 114 | emit({ 115 | contig_name: r.contig_name, 116 | start_pos: r.start_pos, 117 | reference_bases: r.reference_bases, 118 | alternate_bases: r.alternate_bases[a], 119 | alternate_allele_count: alt_count 120 | }); 121 | } 122 | }")) 123 | WHERE 124 | contig_name = '17' 125 | AND start_pos BETWEEN 41196312 126 | AND 41277500 127 | GROUP EACH BY 128 | contig_name, 129 | start_pos, 130 | bin, 131 | reference_bases, 132 | alternate_bases) AS vars 133 | ON 134 | vars.contig_name = all.contig_name 135 | AND vars.bin = all.bin 136 | WHERE 137 | # Further constrain the JOIN to calls that overlapped the first base pair 138 | # of this variant 139 | all.start_pos <= vars.start_pos 140 | AND all.the_end >= vars.start_pos+1 141 | GROUP EACH BY 142 | vars.contig_name, 143 | vars.reference_bases, 144 | vars.start_pos, 145 | vars.alternate_bases, 146 | alternate_allele_count 147 | ) 148 | -------------------------------------------------------------------------------- /1000genomes_phase3/README.Rmd: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | 10 | 11 | 12 | 13 | 14 | 15 | 16 | 17 | 1,000 Genomes Phase 3 Variants 18 | ============================== 19 | 20 | ### Additional Resources 21 | * [Schema](https://bigquery.cloud.google.com/table/genomics-public-data:1000_genomes_phase_3.variants_20150220_release?pli=1) 22 | * [Data Provenance](http://googlegenomics.readthedocs.io/en/latest/use_cases/discover_public_data/1000_genomes.html) 23 | * [Index of variant analyses](./sql) 24 | 25 | ### Metrics 26 | 27 | In the following plots we display metrics computed on both 1,000 Genomes phase 1 and phase 3 variants. 
```{r echo=FALSE, eval=FALSE}
######################[ CHANGE ME ]##################################
# This codelab assumes that the current working directory is where the Rmd file resides.
setwd("/YOUR/PATH/TO/bigquery-examples/1000genomes_phase3")

# Set the Google Cloud Platform project id under which these queries will run.
project <- "YOUR-PROJECT-ID"
#####################################################################

# The currently released version 0.3.0 does not yet have the parameter
# to use Standard SQL instead of Legacy SQL, so we install from github.
library(devtools)
install_github('rstats-db/bigrquery')
```

```{r init, echo=FALSE, message=FALSE, warning=FALSE, comment=NA}
library(bigrquery)
library(ggplot2)
library(scales)
library(dplyr)
```

```{r echo=FALSE, message=FALSE, warning=FALSE}
# Per-sample phenotype table; it is joined to the metrics through its
# `Sample` column below.
sample_info <- read.csv("http://storage.googleapis.com/genomics-public-data/1000-genomes/other/sample_info/sample_info.csv")

# Previously materialized per-sample metrics for each release.
phase3 <- query_exec(
  "SELECT * FROM `google.com:biggene.1000genomes_analysis_results.phase3_metrics`",
  project, use_legacy_sql = FALSE)
phase1 <- query_exec(
  "SELECT * FROM `google.com:biggene.1000genomes_analysis_results.phase1_metrics`",
  project, use_legacy_sql = FALSE)

# Stack the two result sets, tagging every row with the release it came from.
# bind_rows() replaces rbind_list(), which dplyr has deprecated.
metrics <- bind_rows(mutate(phase3, dataset = "phase3"),
                     mutate(phase1, dataset = "phase1"))

# NOTE(review): the join key assumes the materialized tables expose the
# sample name as `call_call_set_name` -- confirm against the table schema.
results <- inner_join(sample_info,
                      metrics,
                      by=c("Sample" = "call_call_set_name"))
```
ylab("Ti/Tv Ratio") + 75 | xlab("Dataset") + 76 | ggtitle("Transition/Transversion SNP Ratio") 77 | ``` 78 | 79 | ```{r hethom_metrics, echo=FALSE, message=FALSE, warning=FALSE, comment=NA, fig.align="center", fig.width=12, fig.height=8} 80 | ggplot(results, aes(dataset, het_hom_ratio)) + 81 | geom_boxplot(aes(colour = Super_Population)) + 82 | ylab("Het/Hom Ratio") + 83 | xlab("Dataset") + 84 | ggtitle("Heterozygous/Homozygous Variant Ratio") 85 | 86 | ggplot(results, aes(dataset, het_hom_ratio)) + 87 | geom_boxplot(aes(colour = Gender)) + 88 | ylab("Het/Hom Ratio") + 89 | xlab("Dataset") + 90 | ggtitle("Heterozygous/Homozygous Variant Ratio") 91 | ``` 92 | 93 | ```{r indel_metrics, echo=FALSE, message=FALSE, warning=FALSE, comment=NA, fig.align="center", fig.width=12, fig.height=8} 94 | ggplot(results, aes(dataset, ins_del_ratio)) + 95 | geom_boxplot(aes(colour = Super_Population)) + 96 | ylab("Indel Ratio") + 97 | xlab("Dataset") + 98 | ggtitle("Insertion/Deletion Ratio") 99 | 100 | ggplot(results, aes(dataset, ins_del_ratio)) + 101 | geom_boxplot(aes(colour = Gender)) + 102 | ylab("Indel Ratio") + 103 | xlab("Dataset") + 104 | ggtitle("Insertion/Deletion Ratio") 105 | ``` 106 | 107 | ### Analysis Provenance 108 | 109 | The following query was run over 1,000 Genomes data: 110 | 111 | 1. phase 3 variants for 2,504 individuals and materialized to table [google.com:biggene:1000genomes_analysis_results.phase3_metrics](https://bigquery.cloud.google.com/table/google.com:biggene:1000genomes_analysis_results.phase3_metrics?pli=1) 112 | 2.
#standardSQL
--
-- Compute several common metrics on multi-allelic data.  This will work on data
-- in either "genome call" format or in "multi-sample variants" format since hom_RR_count
-- is not used in any of the ratios computed.
-- http://googlegenomics.readthedocs.io/en/latest/use_cases/load_data/multi_sample_variants.html
--
-- Edit the BigQuery table name below to run this query on other data such as 1,000
-- Genomes phase 1 variants.
-- http://googlegenomics.readthedocs.io/en/latest/use_cases/discover_public_data/1000_genomes.html
--
-- The query runs in three stages and returns one row per call_set_name:
--   calls            one row per (variant, call, alternate allele)
--   compute_metrics  boolean flags classifying each of those rows
--   compute_sums     per call set totals of the flags
--
WITH calls AS (
  -- For multi-allelic data we FLATTEN by both alternate_bases and call which yields the
  -- cross product of those values.  We provide the alt_num value so that queries downstream
  -- of this can correctly identify calls with genotypes that match the particular alternate.
  SELECT
    reference_name,
    start,
    reference_bases,
    alt,
    alt_offset + 1 AS alt_num,  -- The number corresponding to the alternate_bases value.
    CONCAT(reference_bases, '->', alt) AS mutation,
    -- True when alt consists only of the letters A, C, G and T.
    REGEXP_CONTAINS(alt, r'^[ACGT]+$') AS is_sequence,
    call.call_set_name,
    -- True when every genotype value in the call is 0.
    (SELECT LOGICAL_AND(gt = 0) FROM UNNEST(call.genotype) gt) AS reference_match_call,
    -- SAFE_ORDINAL yields NULL instead of an error when the genotype array is
    -- shorter than the requested position.
    call.genotype[SAFE_ORDINAL(1)] AS first_allele,
    call.genotype[SAFE_ORDINAL(2)] AS second_allele
  FROM
    -- To run on phase 1 variants, update the following line to change the source table.
    `genomics-public-data.1000_genomes_phase_3.variants_20150220_release` v,
    v.call call, v.alternate_bases alt WITH OFFSET alt_offset
  -- Use this WHERE clause for fast testing of the query.  Remove it for the full analysis.
  # WHERE
  #   reference_name IN ('chr17','17')
  #   AND start BETWEEN 41196311 AND 41277499 # per GRCh37
),

compute_metrics AS (
  SELECT
    call_set_name,
    -- Anchor on alt_num=1 so that we don't over count hom_RR for multi-allelic sites.
    alt_num = 1 AND reference_match_call AS is_hom_RR,
    -- Otherwise, check whether the genotypes in the call match this alt_num.
    -- A call with a single genotype value (second_allele IS NULL) that equals
    -- alt_num is also counted as hom_AA.
    first_allele = alt_num AND (second_allele = alt_num OR second_allele IS NULL) AS is_hom_AA,
    (first_allele = 0 AND second_allele = alt_num) OR (first_allele = alt_num AND second_allele = 0) AS is_het_RA,
    -- One allele is this alternate and the other is a different alternate.
    (first_allele > 0 AND first_allele != alt_num AND second_allele = alt_num)
      OR (first_allele = alt_num AND second_allele > 0 AND second_allele != alt_num) AS is_het_AA,
    -- To prevent over counting of variant types due to the FLATTENED data, we make sure the genotype
    -- in the call corresponds to the alt_num in this row and use this boolean in the downstream
    -- query.
    first_allele = alt_num OR second_allele = alt_num AS call_has_alternate_bases,
    -- Variant type flags, derived from the lengths of reference_bases and alt.
    NOT is_sequence AS is_sv,
    is_sequence AND LENGTH(reference_bases) = 1 AND LENGTH(alt) = 1 AS is_snp,
    is_sequence AND LENGTH(reference_bases) > 1 AND LENGTH(reference_bases) = LENGTH(alt) AS is_expanded_snp,
    is_sequence AND LENGTH(reference_bases) < LENGTH(alt) AS is_insertion,
    is_sequence AND LENGTH(reference_bases) > LENGTH(alt) AS is_deletion,
    -- Only single base substitutions can match the lists below.
    mutation IN ('A->G','G->A','C->T','T->C') AS is_transition,
    mutation IN ('A->C','C->A','G->T','T->G','A->T','T->A','C->G','G->C') AS is_transversion
  FROM calls
),

compute_sums AS (
  SELECT
    call_set_name,
    SUM(CAST(is_hom_RR AS INT64)) AS hom_RR_count,
    SUM(CAST(is_hom_AA AS INT64)) AS hom_AA_count,
    SUM(CAST(is_het_RA AS INT64)) AS het_RA_count,
    -- Divide het_AA by two since we have two rows for this sample's alleles because we
    -- FLATTENED by alternate_bases.
    SUM(CAST(is_het_AA AS INT64))/2 AS het_AA_count,
    SUM(CAST(call_has_alternate_bases AS INT64)) AS calls_has_alternate_bases_count,
    SUM(CAST(call_has_alternate_bases AND is_sv AS INT64)) AS sv_count,
    SUM(CAST(call_has_alternate_bases AND is_snp AS INT64)) AS snp_count,
    SUM(CAST(call_has_alternate_bases AND is_expanded_snp AS INT64)) AS expanded_snp_count,
    SUM(CAST(call_has_alternate_bases AND is_insertion AS INT64)) AS insertion_count,
    SUM(CAST(call_has_alternate_bases AND is_deletion AS INT64)) AS deletion_count,
    SUM(CAST(call_has_alternate_bases AND is_transition AS INT64)) AS transitions_count,
    SUM(CAST(call_has_alternate_bases AND is_transversion AS INT64)) AS transversions_count
  FROM compute_metrics
  GROUP BY
    call_set_name
)

SELECT
  call_set_name,
  -- Ratios.  SAFE_DIVIDE returns NULL rather than failing when the
  -- denominator is zero.
  SAFE_DIVIDE(transitions_count, transversions_count) AS ti_tv_ratio,
  -- NOTE(review): het_AA_count was already halved in compute_sums and is
  -- doubled again here -- confirm this weighting is the intended one.
  SAFE_DIVIDE((het_RA_count + 2 * het_AA_count), hom_AA_count) AS het_hom_ratio,
  SAFE_DIVIDE(insertion_count, deletion_count) AS ins_del_ratio,
  -- Call type counts.
  hom_RR_count,
  hom_AA_count,
  het_RA_count,
  het_AA_count,
  -- Alternate allele type counts.
  sv_count,
  snp_count,
  expanded_snp_count,
  insertion_count,
  deletion_count,
  -- SNP type counts.
  transitions_count,
  transversions_count,
  -- Let's check our work for over/under counting.
  calls_has_alternate_bases_count,
  transitions_count + transversions_count AS check_snp_count,
  sv_count + snp_count + expanded_snp_count + insertion_count + deletion_count AS check_calls_has_alternate_bases_count,
  hom_RR_count + hom_AA_count + het_RA_count + het_AA_count AS check_total_num_calls
FROM compute_sums
ORDER BY
  call_set_name
# Print the SQL stored at `queryUri` (at most 1e6 characters are read), then
# run that same text with query_exec() under the globally defined `project`
# and return whatever query_exec() returns.
DisplayAndDispatchQuery <- function(queryUri) {
  sql_text <- readChar(queryUri, nchars = 1e6)
  # Echo the query so that it shows up in the rendered document.
  cat(sql_text)
  return(query_exec(sql_text, project))
}
60 | 61 | Displaying the first few rows of our result: 62 | ```{r echo=FALSE, message=FALSE, warning=FALSE, comment=NA, results="asis"} 63 | print(xtable(head(result), digits=6), type="html", include.rownames=F) 64 | ``` 65 | 66 | And do our results match the precomputed values resident in the superpopulation-specific AF INFO fields? 67 | ```{r} 68 | # coerce NAs to be zero 69 | result$alt_freq_from_1KG[is.na(result$alt_freq_from_1KG)] <- 0.0 70 | print(expect_equal(object=result$alt_freq, 71 | expected=result$alt_freq_from_1KG, 72 | tolerance=0.005, 73 | scale=1)) 74 | ``` 75 | We can see from the results that when the computed frequency values in column alt_freq are rounded, they exactly match the alternate allele frequencies as reported in the AFR_AF, ASN_AF, AMR_AF, EUR_AF INFO fields from the 1,000 Genomes VCF data. 76 | 77 | Moving on to other results regarding rates of variation across populations: 78 | ```{r sql maf, echo=FALSE, message=FALSE, warning=FALSE, error=FALSE, comment=NA, cache=FALSE} 79 | result <- DisplayAndDispatchQuery("../../sql/minimum-allelic-frequency-by-ethnicity.sql") 80 | ``` 81 | Number of rows returned by this query: `r nrow(result)`. 82 | 83 | Displaying the first few rows of our result: 84 | ```{r echo=FALSE, message=FALSE, warning=FALSE, comment=NA, results="asis"} 85 | print(xtable(head(result), digits=6), type="html", include.rownames=F) 86 | ``` 87 | 88 | Some data visualization will help us to see more clearly the pattern resident within the results: 89 | ```{r maf, echo=FALSE, fig.align="center", fig.width=12, fig.height=8} 90 | ggplot(result, aes(x=population, y=common_variant, fill=super_population)) + geom_boxplot() + ylab("Count of common variants per sample") + ggtitle("Common Variants (Minimum Allelic Frequency 5%)") 91 | ``` 92 | and now it is clear to see that the ethnicities within the African super population have a much higher rate of mutation compared to the other ethnicities for the common variants.
93 | 94 | This difference is even more notable when looking at all variants: 95 | ```{r all variants, echo=FALSE, fig.align="center", fig.width=12, fig.height=8} 96 | ggplot(result, aes(x=population, y=num_variants_for_sample, fill=super_population)) + geom_boxplot() + ylab("Count variants per sample") + ggtitle("All Variants") 97 | ``` 98 | 99 | Now let's examine the rate of variation across genders: 100 | ```{r common variants by gender, echo=FALSE, fig.align="center", fig.width=12, fig.height=8} 101 | ggplot(result, aes(x=super_population, y=common_variant, fill=gender)) + geom_boxplot() + ylab("Count of common variants per sample") + ggtitle("Common Variants (Minimum Allelic Frequency 5%)") 102 | ``` 103 | We see a noticeable difference, BUT this query included variants within chromosome X. Updating the query to ignore sex chromosomes: 104 | ```{r sql maf no X, echo=FALSE, message=FALSE, warning=FALSE, error=FALSE, comment=NA, cache=FALSE} 105 | result <- DisplayAndDispatchQuery("../../sql/minimum-allelic-frequency-by-ethnicity-no-sex-chromosomes.sql") 106 | ``` 107 | We see that the genders are quite close in their rate of variation. 108 | ```{r viz maf no X, echo=FALSE, fig.align="center", fig.width=12, fig.height=8} 109 | ggplot(result, aes(x=super_population, y=common_variant, fill=gender)) + geom_boxplot() + ylab("Count of common variants per sample") + ggtitle("Common Variants (Minimum Allelic Frequency 5%)") 110 | ``` 111 | --------------------------------------------------------------------------------