├── pgp
    ├── .gitignore
    ├── data-stories
    │   ├── schema-comparisons
    │   │   ├── .gitignore
    │   │   ├── figure
    │   │   │   ├── call_cnt.png
    │   │   │   ├── variant_cnt.png
    │   │   │   ├── sample_call_cnt.png
    │   │   │   └── sample_variant_cnt.png
    │   │   └── schema-comparison-observations.csv
    │   ├── comparing-pgp-to-1000genomes
    │   │   ├── .gitignore
    │   │   └── figure
    │   │   │   ├── variant counts-1.png
    │   │   │   ├── genotype heatmap-1.png
    │   │   │   ├── variant type counts-1.png
    │   │   │   └── pgp variant type counts-1.png
    │   ├── issues-with-the-variant-centric-approach
    │   │   └── .gitignore
    │   └── README.md
    ├── figure
    │   └── gender-1.png
    ├── sql
    │   ├── schema-comparisons
    │   │   ├── record-sample-counts.sql
    │   │   ├── missingness-klotho.sql
    │   │   ├── call-counts.sql
    │   │   ├── klotho-gvcf-expanded.sql
    │   │   ├── klotho-gvcf.sql
    │   │   ├── sample-call-counts.sql
    │   │   ├── missingness-brca1.sql
    │   │   └── missingness-udf.sql
    │   ├── comparing-pgp-to-1000genomes
    │   │   ├── genotype-counts.sql
    │   │   ├── sample-counts-minmax-by-chromosome.sql
    │   │   ├── variant-counts-by-chromosome.sql
    │   │   ├── parsed-genotype-counts.sql
    │   │   ├── taking-a-closer-look-at-variant-types.sql
    │   │   └── variant-counts-by-type-and-chromosome.sql
    │   ├── cgi_variants
    │   │   ├── klotho.sql
    │   │   ├── allelic-frequency-comparison.sql
    │   │   ├── allele-count.sql
    │   │   ├── ti-tv-ratio.sql
    │   │   ├── allelic-frequency.py
    │   │   ├── allelic-frequency-chr1.sql
    │   │   └── allelic-frequency-brca1.sql
    │   ├── gender-count.sql
    │   ├── issues-with-the-variant-centric-approach
    │   │   ├── klotho-summary.sql
    │   │   ├── factor-v-leiden.sql
    │   │   └── factor-v-leiden-summary.sql
    │   ├── gvcf_variants_expanded
    │   │   ├── klotho.sql
    │   │   ├── ti-tv-ratio.sql
    │   │   └── allelic-frequency.sql
    │   └── gvcf_variants
    │   │   ├── klotho.sql
    │   │   ├── allelic-frequency-comparison.sql
    │   │   ├── allelic-frequency.py
    │   │   ├── allele-count.sql
    │   │   ├── ti-tv-ratio.sql
    │   │   ├── allelic-frequency-brca1-no-udf.sql
    │   │   ├── allelic-frequency-chr1.sql
    │   │   └── allelic-frequency-brca1.sql
    ├── provenance
    │   ├── gvcf-expand-mapper.py
    │   ├── gvcf-expand-reducer.py
    │   ├── cgi-header-mapper.py
    │   ├── cgi-mapper.py
    │   └── cgi-ref-blocks-mapper.py
    ├── README.md
    └── README.Rmd
├── 1000genomes
    ├── .gitignore
    ├── data-stories
    │   ├── exploring-the-variant-data
    │   │   ├── .gitignore
    │   │   └── figure
    │   │   │   ├── unnamed-chunk-11-1.png
    │   │   │   ├── unnamed-chunk-13-1.png
    │   │   │   ├── unnamed-chunk-15-1.png
    │   │   │   ├── unnamed-chunk-16-1.png
    │   │   │   ├── unnamed-chunk-3-1.png
    │   │   │   ├── unnamed-chunk-5-1.png
    │   │   │   ├── unnamed-chunk-7-1.png
    │   │   │   └── unnamed-chunk-9-1.png
    │   ├── exploring-the-phenotypic-data
    │   │   ├── .gitignore
    │   │   └── figure
    │   │   │   ├── families.png
    │   │   │   ├── gender.png
    │   │   │   ├── samples.png
    │   │   │   ├── superpop.png
    │   │   │   ├── ethnicity.png
    │   │   │   └── ethnicity and gender.png
    │   ├── reproducing-hardy-weinberg-equilibrium
    │   │   ├── .gitignore
    │   │   └── README.Rmd
    │   ├── reproducing-vcfstats
    │   │   └── vcfstats-output
    │   │   │   ├── stats.qual-tstv
    │   │   │   ├── stats.tstv
    │   │   │   ├── stats.legend
    │   │   │   ├── stats.shared
    │   │   │   └── stats.private
    │   ├── reproducing-allelic-frequencies
    │   │   ├── figure
    │   │   │   ├── maf.png
    │   │   │   ├── all variants.png
    │   │   │   ├── viz maf no X.png
    │   │   │   └── common variants by gender.png
    │   │   └── README.Rmd
    │   └── README.md
    ├── figure
    │   ├── dbSNP Variants-1.png
    │   ├── shared Variants-1.png
    │   ├── shared variants by pop-1.png
    │   ├── shared common variants by pop-1.png
    │   ├── shared rare variants by pop-1.png
    │   ├── shared rare variants by percent pop-1.png
    │   └── shared common variants by percent pop-1.png
    ├── sql
    │   ├── variant-counts-by-type.sql
    │   ├── phenotype_sql
    │   │   ├── num-samples.sql
    │   │   ├── family-sizes.sql
    │   │   ├── gender-ratio.sql
    │   │   ├── ethnicity-by-gender-ratio.sql
    │   │   ├── ethnicity-by-superpop-ratio.sql
    │   │   └── ethnicity-ratio.sql
    │   ├── reproducing-vcfstats
    │   │   ├── variant-count-brca1.sql
    │   │   ├── variant-counts-by-type-brca1.sql
    │   │   ├── snp-variant-counts-brca1.sql
    │   │   ├── sample-snp-counts-brca1.sql
    │   │   ├── sample-indel-counts-brca1.sql
    │   │   ├── shared-variant-counts-brca1.sql
    │   │   ├── indel-length-counts-brca1.sql
    │   │   ├── private-variant-counts-brca1.sql
    │   │   ├── variant-sample-counts-brca1.sql
    │   │   └── ti-tv-ratio-brca1.sql
    │   ├── variant-counts-by-type-and-chromosome.sql
    │   ├── snp-variant-counts.sql
    │   ├── understanding-alternate-alleles
    │   │   ├── chrom-pos-ref-dups.sql
    │   │   ├── minimal-unique-key.sql
    │   │   ├── three-chrom-pos-ref-dups.sql
    │   │   ├── count-chrom-pos-ref.sql
    │   │   ├── unique-key.sql
    │   │   ├── not-quite-unique-key.sql
    │   │   ├── sample-likelihood.sql
    │   │   ├── count-by-var-type-chrom-pos-ref-dups.sql
    │   │   ├── count-by-var-type-chrom-pos-ref-singles.sql
    │   │   └── sample-chrom-pos-ref-dups.sql
    │   ├── ratio-of-variants-by-type.sql
    │   ├── variant-level-data-for-brca1.sql
    │   ├── ratio-of-dbsnp-variants-by-chromosome.sql
    │   ├── indel-length-counts.sql
    │   ├── private-variant-counts.sql
    │   ├── shared-variant-counts.sql
    │   ├── sample-variant-counts-by-type-and-chromosome.sql
    │   ├── sample-level-data-for-brca1.sql
    │   ├── ti-tv-ratio.sql
    │   ├── heterozygous-homozygous-ratio.sql
    │   ├── minimum-allelic-frequency-by-ethnicity.sql
    │   ├── variant-hotspots.sql
    │   ├── sample-variant-hotspots.sql
    │   ├── minimum-allelic-frequency-by-ethnicity-no-sex-chromosomes.sql
    │   ├── allelic-frequency.sql
    │   ├── reproducing-allelic-frequencies
    │   │   ├── reproducing-allelic-frequency.sql
    │   │   └── reproducing-allelic-frequency-by-ethnicity.sql
    │   ├── allelic-frequency-by-gender.sql
    │   ├── allelic-frequency-by-ethnicity.sql
    │   ├── gender-het-hom-ratio.sql
    │   ├── shared-variant-counts-by-ethnicity.sql
    │   ├── gwas-pattern-two-proportion-z-test.sql
    │   ├── gwas-pattern-chi-squared-test.sql
    │   └── hardy-weinberg-equilibrium.sql
    └── provenance
    │   └── README.md
├── .gitignore
├── platinumGenomes
    ├── figure
    │   └── function-1.png
    ├── sql
    │   ├── sample-snps-by-exonic-function.sql
    │   ├── cohort-rare-pathenogenic-snps.sql
    │   └── sample-rare-pathenogenic-snps.sql
    └── README.Rmd
├── 1000genomes_phase3
    ├── figure
    │   ├── titv_metrics-1.png
    │   ├── titv_metrics-2.png
    │   ├── hethom_metrics-1.png
    │   ├── hethom_metrics-2.png
    │   ├── indel_metrics-1.png
    │   └── indel_metrics-2.png
    ├── README.Rmd
    └── sql
    │   └── qc-metrics.sql
├── annotations
    └── README.md
├── README.md
├── CONTRIBUTING.rst
└── sgdp
    └── provenance
        └── wrangle-simons-sample-attributes.R


/pgp/.gitignore:
--------------------------------------------------------------------------------
1 | .httr-oauth
2 | 


--------------------------------------------------------------------------------
/1000genomes/.gitignore:
--------------------------------------------------------------------------------
1 | .httr-oauth
2 | 


--------------------------------------------------------------------------------
/pgp/data-stories/schema-comparisons/.gitignore:
--------------------------------------------------------------------------------
1 | .httr-oauth
2 | 


--------------------------------------------------------------------------------
/pgp/data-stories/comparing-pgp-to-1000genomes/.gitignore:
--------------------------------------------------------------------------------
1 | .httr-oauth
2 | 


--------------------------------------------------------------------------------
/1000genomes/data-stories/exploring-the-variant-data/.gitignore:
--------------------------------------------------------------------------------
1 | .httr-oauth
2 | 


--------------------------------------------------------------------------------
/1000genomes/data-stories/exploring-the-phenotypic-data/.gitignore:
--------------------------------------------------------------------------------
1 | .httr-oauth
2 | 


--------------------------------------------------------------------------------
/pgp/data-stories/issues-with-the-variant-centric-approach/.gitignore:
--------------------------------------------------------------------------------
1 | .httr-oauth
2 | 


--------------------------------------------------------------------------------
/1000genomes/data-stories/reproducing-hardy-weinberg-equilibrium/.gitignore:
--------------------------------------------------------------------------------
1 | .httr-oauth
2 | 


--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
1 | cache
2 | *.Rproj
3 | .Rproj.user
4 | .Rhistory
5 | .RData
6 | *html
7 | .httr-oauth
8 | 


--------------------------------------------------------------------------------
/pgp/figure/gender-1.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/googlegenomics/bigquery-examples/master/pgp/figure/gender-1.png


--------------------------------------------------------------------------------
/1000genomes/data-stories/reproducing-vcfstats/vcfstats-output/stats.qual-tstv:
--------------------------------------------------------------------------------
1 | #Quality	Marginal count	Marginal Ts/Tv
2 | 


--------------------------------------------------------------------------------
/1000genomes/data-stories/reproducing-vcfstats/vcfstats-output/stats.tstv:
--------------------------------------------------------------------------------
1 | #Transitions	Transversions	ts/tv	Sample
2 | 615	228	2.70	all
3 | 


--------------------------------------------------------------------------------
/1000genomes/figure/dbSNP Variants-1.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/googlegenomics/bigquery-examples/master/1000genomes/figure/dbSNP Variants-1.png


--------------------------------------------------------------------------------
/platinumGenomes/figure/function-1.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/googlegenomics/bigquery-examples/master/platinumGenomes/figure/function-1.png


--------------------------------------------------------------------------------
/1000genomes/figure/shared Variants-1.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/googlegenomics/bigquery-examples/master/1000genomes/figure/shared Variants-1.png


--------------------------------------------------------------------------------
/1000genomes_phase3/figure/titv_metrics-1.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/googlegenomics/bigquery-examples/master/1000genomes_phase3/figure/titv_metrics-1.png


--------------------------------------------------------------------------------
/1000genomes_phase3/figure/titv_metrics-2.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/googlegenomics/bigquery-examples/master/1000genomes_phase3/figure/titv_metrics-2.png


--------------------------------------------------------------------------------
/1000genomes_phase3/figure/hethom_metrics-1.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/googlegenomics/bigquery-examples/master/1000genomes_phase3/figure/hethom_metrics-1.png


--------------------------------------------------------------------------------
/1000genomes_phase3/figure/hethom_metrics-2.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/googlegenomics/bigquery-examples/master/1000genomes_phase3/figure/hethom_metrics-2.png


--------------------------------------------------------------------------------
/1000genomes_phase3/figure/indel_metrics-1.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/googlegenomics/bigquery-examples/master/1000genomes_phase3/figure/indel_metrics-1.png


--------------------------------------------------------------------------------
/1000genomes_phase3/figure/indel_metrics-2.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/googlegenomics/bigquery-examples/master/1000genomes_phase3/figure/indel_metrics-2.png


--------------------------------------------------------------------------------
/1000genomes/figure/shared variants by pop-1.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/googlegenomics/bigquery-examples/master/1000genomes/figure/shared variants by pop-1.png


--------------------------------------------------------------------------------
/1000genomes/figure/shared common variants by pop-1.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/googlegenomics/bigquery-examples/master/1000genomes/figure/shared common variants by pop-1.png


--------------------------------------------------------------------------------
/1000genomes/figure/shared rare variants by pop-1.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/googlegenomics/bigquery-examples/master/1000genomes/figure/shared rare variants by pop-1.png


--------------------------------------------------------------------------------
/pgp/data-stories/schema-comparisons/figure/call_cnt.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/googlegenomics/bigquery-examples/master/pgp/data-stories/schema-comparisons/figure/call_cnt.png


--------------------------------------------------------------------------------
/pgp/data-stories/schema-comparisons/figure/variant_cnt.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/googlegenomics/bigquery-examples/master/pgp/data-stories/schema-comparisons/figure/variant_cnt.png


--------------------------------------------------------------------------------
/1000genomes/figure/shared rare variants by percent pop-1.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/googlegenomics/bigquery-examples/master/1000genomes/figure/shared rare variants by percent pop-1.png


--------------------------------------------------------------------------------
/1000genomes/figure/shared common variants by percent pop-1.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/googlegenomics/bigquery-examples/master/1000genomes/figure/shared common variants by percent pop-1.png


--------------------------------------------------------------------------------
/pgp/data-stories/schema-comparisons/figure/sample_call_cnt.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/googlegenomics/bigquery-examples/master/pgp/data-stories/schema-comparisons/figure/sample_call_cnt.png


--------------------------------------------------------------------------------
/pgp/data-stories/schema-comparisons/figure/sample_variant_cnt.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/googlegenomics/bigquery-examples/master/pgp/data-stories/schema-comparisons/figure/sample_variant_cnt.png


--------------------------------------------------------------------------------
/1000genomes/data-stories/reproducing-allelic-frequencies/figure/maf.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/googlegenomics/bigquery-examples/master/1000genomes/data-stories/reproducing-allelic-frequencies/figure/maf.png


--------------------------------------------------------------------------------
/1000genomes/data-stories/exploring-the-phenotypic-data/figure/families.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/googlegenomics/bigquery-examples/master/1000genomes/data-stories/exploring-the-phenotypic-data/figure/families.png


--------------------------------------------------------------------------------
/1000genomes/data-stories/exploring-the-phenotypic-data/figure/gender.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/googlegenomics/bigquery-examples/master/1000genomes/data-stories/exploring-the-phenotypic-data/figure/gender.png


--------------------------------------------------------------------------------
/1000genomes/data-stories/exploring-the-phenotypic-data/figure/samples.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/googlegenomics/bigquery-examples/master/1000genomes/data-stories/exploring-the-phenotypic-data/figure/samples.png


--------------------------------------------------------------------------------
/1000genomes/data-stories/exploring-the-phenotypic-data/figure/superpop.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/googlegenomics/bigquery-examples/master/1000genomes/data-stories/exploring-the-phenotypic-data/figure/superpop.png


--------------------------------------------------------------------------------
/pgp/data-stories/comparing-pgp-to-1000genomes/figure/variant counts-1.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/googlegenomics/bigquery-examples/master/pgp/data-stories/comparing-pgp-to-1000genomes/figure/variant counts-1.png


--------------------------------------------------------------------------------
/1000genomes/data-stories/exploring-the-phenotypic-data/figure/ethnicity.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/googlegenomics/bigquery-examples/master/1000genomes/data-stories/exploring-the-phenotypic-data/figure/ethnicity.png


--------------------------------------------------------------------------------
/pgp/data-stories/comparing-pgp-to-1000genomes/figure/genotype heatmap-1.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/googlegenomics/bigquery-examples/master/pgp/data-stories/comparing-pgp-to-1000genomes/figure/genotype heatmap-1.png


--------------------------------------------------------------------------------
/pgp/data-stories/comparing-pgp-to-1000genomes/figure/variant type counts-1.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/googlegenomics/bigquery-examples/master/pgp/data-stories/comparing-pgp-to-1000genomes/figure/variant type counts-1.png


--------------------------------------------------------------------------------
/1000genomes/data-stories/exploring-the-variant-data/figure/unnamed-chunk-11-1.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/googlegenomics/bigquery-examples/master/1000genomes/data-stories/exploring-the-variant-data/figure/unnamed-chunk-11-1.png


--------------------------------------------------------------------------------
/1000genomes/data-stories/exploring-the-variant-data/figure/unnamed-chunk-13-1.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/googlegenomics/bigquery-examples/master/1000genomes/data-stories/exploring-the-variant-data/figure/unnamed-chunk-13-1.png


--------------------------------------------------------------------------------
/1000genomes/data-stories/exploring-the-variant-data/figure/unnamed-chunk-15-1.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/googlegenomics/bigquery-examples/master/1000genomes/data-stories/exploring-the-variant-data/figure/unnamed-chunk-15-1.png


--------------------------------------------------------------------------------
/1000genomes/data-stories/exploring-the-variant-data/figure/unnamed-chunk-16-1.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/googlegenomics/bigquery-examples/master/1000genomes/data-stories/exploring-the-variant-data/figure/unnamed-chunk-16-1.png


--------------------------------------------------------------------------------
/1000genomes/data-stories/exploring-the-variant-data/figure/unnamed-chunk-3-1.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/googlegenomics/bigquery-examples/master/1000genomes/data-stories/exploring-the-variant-data/figure/unnamed-chunk-3-1.png


--------------------------------------------------------------------------------
/1000genomes/data-stories/exploring-the-variant-data/figure/unnamed-chunk-5-1.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/googlegenomics/bigquery-examples/master/1000genomes/data-stories/exploring-the-variant-data/figure/unnamed-chunk-5-1.png


--------------------------------------------------------------------------------
/1000genomes/data-stories/exploring-the-variant-data/figure/unnamed-chunk-7-1.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/googlegenomics/bigquery-examples/master/1000genomes/data-stories/exploring-the-variant-data/figure/unnamed-chunk-7-1.png


--------------------------------------------------------------------------------
/1000genomes/data-stories/exploring-the-variant-data/figure/unnamed-chunk-9-1.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/googlegenomics/bigquery-examples/master/1000genomes/data-stories/exploring-the-variant-data/figure/unnamed-chunk-9-1.png


--------------------------------------------------------------------------------
/1000genomes/data-stories/reproducing-allelic-frequencies/figure/all variants.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/googlegenomics/bigquery-examples/master/1000genomes/data-stories/reproducing-allelic-frequencies/figure/all variants.png


--------------------------------------------------------------------------------
/1000genomes/data-stories/reproducing-allelic-frequencies/figure/viz maf no X.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/googlegenomics/bigquery-examples/master/1000genomes/data-stories/reproducing-allelic-frequencies/figure/viz maf no X.png


--------------------------------------------------------------------------------
/pgp/data-stories/comparing-pgp-to-1000genomes/figure/pgp variant type counts-1.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/googlegenomics/bigquery-examples/master/pgp/data-stories/comparing-pgp-to-1000genomes/figure/pgp variant type counts-1.png


--------------------------------------------------------------------------------
/1000genomes/data-stories/exploring-the-phenotypic-data/figure/ethnicity and gender.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/googlegenomics/bigquery-examples/master/1000genomes/data-stories/exploring-the-phenotypic-data/figure/ethnicity and gender.png


--------------------------------------------------------------------------------
/1000genomes/data-stories/reproducing-allelic-frequencies/figure/common variants by gender.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/googlegenomics/bigquery-examples/master/1000genomes/data-stories/reproducing-allelic-frequencies/figure/common variants by gender.png


--------------------------------------------------------------------------------
/1000genomes/sql/variant-counts-by-type.sql:
--------------------------------------------------------------------------------
 1 | # Count the number of variants across the entirety of 1,000 Genomes by variant type (vt), sorted by vt; COUNT(vt) counts only rows where vt is non-NULL.
 2 | SELECT
 3 |   vt,
 4 |   COUNT(vt) as cnt,
 5 | FROM
 6 |   [genomics-public-data:1000_genomes.variants]
 7 | GROUP BY
 8 |   vt
 9 | ORDER BY
10 |   vt
11 | 


--------------------------------------------------------------------------------
/1000genomes/sql/phenotype_sql/num-samples.sql:
--------------------------------------------------------------------------------
1 | # Count all samples in the phenotypic data (all_samples) and how many of them have In_Phase1_Integrated_Variant_Set = TRUE (samples_in_variants_table).
2 | SELECT
3 |   COUNT(sample) AS all_samples,
4 |   SUM(IF(In_Phase1_Integrated_Variant_Set = TRUE, 1, 0)) AS samples_in_variants_table
5 | FROM
6 |   [genomics-public-data:1000_genomes.sample_info]
7 | 


--------------------------------------------------------------------------------
/1000genomes/sql/reproducing-vcfstats/variant-count-brca1.sql:
--------------------------------------------------------------------------------
1 | # Count the number of variants in BRCA1, selected here as reference_name '17' with start between 41196311 and 41277499 (BETWEEN is inclusive on both ends).
2 | SELECT
3 |   count(reference_name) as num_variants,
4 | FROM
5 |   [genomics-public-data:1000_genomes.variants]
6 | WHERE
7 |   reference_name = '17'
8 |   AND start BETWEEN 41196311
9 |   AND 41277499


--------------------------------------------------------------------------------
/1000genomes/sql/reproducing-vcfstats/variant-counts-by-type-brca1.sql:
--------------------------------------------------------------------------------
 1 | # Count the number of variants by type (vt, aliased as variant_type) in BRCA1, selected here as reference_name '17' with start between 41196311 and 41277499 inclusive.
 2 | SELECT
 3 |   vt AS variant_type,
 4 |   COUNT(vt) AS num_variants_of_type,
 5 | FROM
 6 |   [genomics-public-data:1000_genomes.variants]
 7 | WHERE
 8 |   reference_name = '17'
 9 |   AND start BETWEEN 41196311
10 |   AND 41277499
11 | GROUP BY
12 |   variant_type


--------------------------------------------------------------------------------
/pgp/sql/schema-comparisons/record-sample-counts.sql:
--------------------------------------------------------------------------------
 1 | # Confirm that we are correctly expanding reference-matching blocks into our variants by reporting the largest number of call.callset_name values found within any single record.
 2 | SELECT
 3 |   MAX(num_sample_ids) as max_samples_per_record,
 4 | FROM (
 5 |   SELECT
 6 |     COUNT(call.callset_name) WITHIN RECORD AS num_sample_ids,
 7 |   FROM
 8 |     [google.com:biggene:test.pgp_gvcf_variants_expanded2]
 9 |     )
10 | 


--------------------------------------------------------------------------------
/1000genomes/sql/variant-counts-by-type-and-chromosome.sql:
--------------------------------------------------------------------------------
 1 | # Count the number of variants across the entirety of 1,000 Genomes by chromosome
 2 | # (reference_name) and variant type (vt), sorted by reference_name and then vt.
 3 | SELECT
 4 |   reference_name,
 5 |   vt,
 6 |   COUNT(vt) AS cnt,
 7 | FROM
 8 |   [genomics-public-data:1000_genomes.variants]
 9 | GROUP BY
10 |   reference_name,
11 |   vt
12 | ORDER BY
13 |   reference_name,
14 |   vt
15 | 


--------------------------------------------------------------------------------
/1000genomes/sql/snp-variant-counts.sql:
--------------------------------------------------------------------------------
 1 | # Count SNPs (vt = 'SNP') across the dataset for each pair of reference_bases and alternate_bases (aliased as allele), sorted by that pair.
 2 | SELECT
 3 |   reference_bases,
 4 |   alternate_bases AS allele,
 5 |   COUNT(alternate_bases) AS num_snps
 6 | FROM
 7 |   [genomics-public-data:1000_genomes.variants]
 8 | WHERE
 9 |   vt ='SNP'
10 | GROUP BY
11 |   reference_bases,
12 |   allele
13 | ORDER BY
14 |   reference_bases,
15 |   allele
16 | 


--------------------------------------------------------------------------------
/pgp/sql/comparing-pgp-to-1000genomes/genotype-counts.sql:
--------------------------------------------------------------------------------
 1 | # Count how often each genotype occurs across all calls in the dataset, most frequent first; the genotype string is built by concatenating the call.genotype values within each call.
 2 | SELECT
 3 |   genotype,
 4 |   COUNT(genotype) AS cnt,
 5 | FROM (
 6 |   SELECT
 7 |     GROUP_CONCAT(STRING(call.genotype)) WITHIN call AS genotype,
 8 |   FROM
 9 |     [google.com:biggene:pgp_20150205.genome_calls])
10 | GROUP BY
11 |   genotype
12 | ORDER BY
13 |   cnt DESC
14 | 


--------------------------------------------------------------------------------
/1000genomes/data-stories/reproducing-vcfstats/vcfstats-output/stats.legend:
--------------------------------------------------------------------------------
 1 | 
 2 | count
 3 |     Number of positions with known genotype
 4 | 
 5 | nalt_X
 6 |     Number of monoallelic (X=0), biallelic (X=1), etc. sites
 7 |     
 8 | ref, ref_count
 9 |     Number of sites containing reference allele
10 | 
11 | shared
12 |     Number of sites having a non-reference allele in 0,1,2,etc samples
13 | 
14 | snp_count
15 |     Number of positions with SNPs
16 |     


--------------------------------------------------------------------------------
/1000genomes/sql/phenotype_sql/family-sizes.sql:
--------------------------------------------------------------------------------
 1 | # Compute the distribution of family sizes among samples with In_Phase1_Integrated_Variant_Set = TRUE: for each family size, the number of family_id values having that many such samples.
 2 | SELECT
 3 | num_family_members AS family_size,
 4 | COUNT(num_family_members) AS num_families_of_size
 5 | FROM (
 6 |   SELECT
 7 |   family_id,
 8 |   COUNT(family_id) AS num_family_members,
 9 |   FROM
10 |   [genomics-public-data:1000_genomes.sample_info]
11 |   WHERE
12 |   In_Phase1_Integrated_Variant_Set = TRUE
13 |   GROUP BY
14 |   family_id)
15 | GROUP BY
16 | family_size


--------------------------------------------------------------------------------
/1000genomes/sql/phenotype_sql/gender-ratio.sql:
--------------------------------------------------------------------------------
 1 | # Compute sample count by gender, and each gender's share of the total count (RATIO_TO_REPORT), restricted to samples with In_Phase1_Integrated_Variant_Set = TRUE.
 2 | SELECT
 3 |   gender,
 4 |   gender_count,
 5 |   RATIO_TO_REPORT(gender_count)
 6 | OVER
 7 |   (
 8 |   ORDER BY
 9 |     gender_count) AS gender_ratio
10 | FROM (
11 |   SELECT
12 |     gender,
13 |     COUNT(gender) AS gender_count,
14 |   FROM
15 |     [genomics-public-data:1000_genomes.sample_info]
16 |   WHERE
17 |     In_Phase1_Integrated_Variant_Set = TRUE
18 |   GROUP BY
19 |     gender)


--------------------------------------------------------------------------------
/pgp/sql/cgi_variants/klotho.sql:
--------------------------------------------------------------------------------
 1 | # Sample-level data for Klotho variant rs9536314, for use in the "amazing intelligence of PGP participants"
 2 | # data story. Selects chr13 rows with locusBegin <= 33628137 and locusEnd >= 33628138, ordered by sample_id.
 3 | SELECT
 4 |   sample_id,
 5 |   chromosome,
 6 |   locusBegin,
 7 |   locusEnd,
 8 |   reference,
 9 |   allele1Seq,
10 |   allele2Seq,
11 | FROM
12 |   [google.com:biggene:pgp.cgi_variants]
13 | WHERE
14 |   chromosome = "chr13"
15 |   AND locusBegin <= 33628137
16 |   AND locusEnd >= 33628138
17 | ORDER BY
18 |   sample_id
19 | 


--------------------------------------------------------------------------------
/pgp/sql/gender-count.sql:
--------------------------------------------------------------------------------
 1 | # Compute sample count by gender
 2 | SELECT
 3 |   Sex_Gender,
 4 |   COUNT(1) AS cnt
 5 | FROM
 6 |   (
 7 |   SELECT
 8 |     call.callset_name,
 9 |     Sex_Gender
10 |   FROM
11 |     FLATTEN([google.com:biggene:pgp.variants],
12 |       call) AS var
13 |   JOIN
14 |     [google.com:biggene:pgp.phenotypes] AS pheno
15 |   ON
16 |     pheno.Participant = var.call.callset_name
17 |   GROUP BY
18 |     call.callset_name,
19 |     Sex_Gender)
20 | GROUP BY
21 |   Sex_Gender


--------------------------------------------------------------------------------
/1000genomes/sql/reproducing-vcfstats/snp-variant-counts-brca1.sql:
--------------------------------------------------------------------------------
 1 | # Count SNPs by base pair transition across BRCA1.
 2 | SELECT
 3 |   reference_bases,
 4 |   alternate_bases AS allele,
 5 |   COUNT(alternate_bases) AS num_snps
 6 | FROM
 7 |   [genomics-public-data:1000_genomes.variants]
 8 | WHERE
 9 |   reference_name = '17'
10 |   AND start BETWEEN 41196311
11 |   AND 41277499
12 |   AND vt ='SNP'
13 | GROUP BY
14 |   reference_bases,
15 |   allele
16 | ORDER BY
17 |   reference_bases,
18 |   allele


--------------------------------------------------------------------------------
/1000genomes/sql/understanding-alternate-alleles/chrom-pos-ref-dups.sql:
--------------------------------------------------------------------------------
 1 | # Find positions on chromosome 17 where more than one variant record shares the same start and reference base
 2 | SELECT
 3 |   reference_name,
 4 |   start,
 5 |   reference_bases,
 6 |   COUNT(start) AS num_alternates
 7 | FROM
 8 |   [genomics-public-data:1000_genomes.variants]
 9 | WHERE
10 |   reference_name = '17'
11 | GROUP BY
12 |   reference_name,
13 |   start,
14 |   reference_bases
15 | HAVING
16 |   num_alternates > 1
17 | ORDER BY
18 |   reference_name,
19 |   start,
20 |   reference_bases


--------------------------------------------------------------------------------
/1000genomes/sql/reproducing-vcfstats/sample-snp-counts-brca1.sql:
--------------------------------------------------------------------------------
 1 | # Sample SNP counts for BRCA1.
 2 | SELECT
 3 |   COUNT(sample_id) AS variant_count,
 4 |   sample_id
 5 | FROM (
 6 |   SELECT
 7 |     reference_name,
 8 |     start,
 9 |     reference_bases,
10 |     call.call_set_name AS sample_id
11 |   FROM
12 |     [genomics-public-data:1000_genomes.variants]
13 |   WHERE
14 |     reference_name = '17'
15 |     AND start BETWEEN 41196311
16 |     AND 41277499
17 |     AND vt ='SNP'
18 |     AND (0 < call.genotype)
19 |     )
20 | GROUP BY
21 |   sample_id
22 | ORDER BY
23 |   sample_id
24 | 


--------------------------------------------------------------------------------
/1000genomes/sql/understanding-alternate-alleles/minimal-unique-key.sql:
--------------------------------------------------------------------------------
 1 | # This query demonstrates the minimal set of fields needed to  
 2 | # comprise a unique key for the rows in the table.
 3 | SELECT
 4 |   reference_name,
 5 |   start,
 6 |   alt,
 7 |   end,
 8 |   COUNT(1) AS cnt
 9 | FROM (
10 |   SELECT
11 |     reference_name,
12 |     start,
13 |     GROUP_CONCAT(alternate_bases) WITHIN RECORD AS alt,
14 |     end,
15 |   FROM
16 |     [genomics-public-data:1000_genomes.variants])
17 |   GROUP EACH BY
18 |   reference_name,
19 |   start,
20 |   alt,
21 |   end
22 | HAVING
23 |   cnt > 1


--------------------------------------------------------------------------------
/1000genomes/sql/understanding-alternate-alleles/three-chrom-pos-ref-dups.sql:
--------------------------------------------------------------------------------
 1 | # Get three particular start positions on chromosome 17 that have alternate variants.
 2 | SELECT
 3 |   reference_name,
 4 |   start,
 5 |   reference_bases,
 6 |   GROUP_CONCAT(alternate_bases) WITHIN RECORD AS alt,
 7 |   GROUP_CONCAT(names) WITHIN RECORD AS names,
 8 |   vt,
 9 | FROM
10 |   [genomics-public-data:1000_genomes.variants]
11 | WHERE
12 |   reference_name = '17'
13 |   AND (start = 48515942
14 |     OR start = 48570613
15 |     OR start = 48659342)
16 | ORDER BY
17 |   start,
18 |   reference_bases,
19 |   alt
20 | 


--------------------------------------------------------------------------------
/pgp/sql/issues-with-the-variant-centric-approach/klotho-summary.sql:
--------------------------------------------------------------------------------
 1 | # Sample counts for Klotho variant rs9536314 for use in the "amazing
 2 | # intelligence of PGP participants" data story. 
 3 | SELECT
 4 |   COUNT(sample_id) AS sample_counts,
 5 |   chromosome,
 6 |   reference,
 7 |   allele1Seq,
 8 |   allele2Seq,
 9 | FROM
10 |   [google.com:biggene:pgp.cgi_variants]
11 | WHERE
12 |   chromosome = "chr13"
13 |   AND locusBegin <= 33628137
14 |   AND locusEnd >= 33628138
15 | GROUP BY
16 |   chromosome,
17 |   reference,
18 |   allele1Seq,
19 |   allele2Seq
20 | ORDER BY
21 |   sample_counts DESC
22 | 


--------------------------------------------------------------------------------
/1000genomes/sql/understanding-alternate-alleles/count-chrom-pos-ref.sql:
--------------------------------------------------------------------------------
 1 | # Count number of alternate variants on chromosome 17 for the same start and
 2 | # reference base
 3 | SELECT
 4 |   num_alternates,
 5 |   COUNT(num_alternates) AS num_records
 6 | FROM (
 7 |   SELECT
 8 |     reference_name,
 9 |     start,
10 |     reference_bases,
11 |     COUNT(start) AS num_alternates,
12 |   FROM
13 |     [genomics-public-data:1000_genomes.variants]
14 |   WHERE
15 |     reference_name = '17'
16 |   GROUP BY
17 |     reference_name,
18 |     start,
19 |     reference_bases)
20 | GROUP BY
21 |   num_alternates
22 | 


--------------------------------------------------------------------------------
/1000genomes/sql/ratio-of-variants-by-type.sql:
--------------------------------------------------------------------------------
 1 | # Compute the ratios of variants by type for each chromosome.
 2 | SELECT
 3 |   reference_name,
 4 |   vt AS variant_type,
 5 |   RATIO_TO_REPORT(variant_count)
 6 | OVER
 7 |   (
 8 |   PARTITION BY
 9 |     reference_name
10 |   ORDER BY
11 |     variant_count DESC) ratio_of_variants_of_type_for_reference_name,
12 | FROM (
13 |   SELECT
14 |     reference_name,
15 |     vt,
16 |     COUNT(vt) AS variant_count
17 |   FROM
18 |     [genomics-public-data:1000_genomes.variants]
19 |   GROUP BY
20 |     reference_name,
21 |     vt
22 |   ORDER BY
23 |     reference_name,
24 |     vt)
25 | 


--------------------------------------------------------------------------------
/1000genomes/sql/phenotype_sql/ethnicity-by-gender-ratio.sql:
--------------------------------------------------------------------------------
 1 | # Ratios of ethnicities grouped by gender
 2 | SELECT
 3 |   population,
 4 |   gender,
 5 |   population_count,
 6 |   RATIO_TO_REPORT(population_count) OVER(
 7 |   PARTITION BY
 8 |     population
 9 |   ORDER BY
10 |     gender)
11 |   AS population_ratio
12 | from(
13 |   SELECT
14 |     gender,
15 |     population,
16 |     COUNT(population) AS population_count,
17 |   FROM
18 |     [genomics-public-data:1000_genomes.sample_info]
19 |   WHERE
20 |     In_Phase1_Integrated_Variant_Set = TRUE
21 |   GROUP BY
22 |     gender,
23 |     population)
24 | ORDER BY
25 |   population,
26 |   gender
27 | 


--------------------------------------------------------------------------------
/1000genomes/sql/variant-level-data-for-brca1.sql:
--------------------------------------------------------------------------------
 1 | #standardSQL
 2 | --
 3 | -- Retrieve variant-level information for BRCA1 variants.
 4 | --
 5 | SELECT
 6 |   reference_name,
 7 |   start,
 8 |   `end`,
 9 |   reference_bases,
10 |   ARRAY_TO_STRING(v.alternate_bases, ',') AS alts,
11 |   quality,
12 |   ARRAY_TO_STRING(v.filter, ',') AS filter,
13 |   ARRAY_TO_STRING(v.names, ',') AS names,
14 |   vt,
15 |   ARRAY_LENGTH(v.call) AS num_samples
16 | FROM
17 |   `genomics-public-data.1000_genomes.variants` v
18 | WHERE
19 |   reference_name IN ('17', 'chr17')
20 |   AND start BETWEEN 41196311 AND 41277499 # per GRCh37
21 | ORDER BY
22 |   start,
23 |   alts
24 | 


--------------------------------------------------------------------------------
/pgp/sql/comparing-pgp-to-1000genomes/sample-counts-minmax-by-chromosome.sql:
--------------------------------------------------------------------------------
 1 | # Summarize the minimum and maximum number of samples per variant by chromosome.
 2 | SELECT
 3 |   reference_name,
 4 |   MIN(sample_count) AS minimum_sample_count,
 5 |   MAX(sample_count) AS maximum_sample_count,
 6 | FROM (
 7 |   SELECT
 8 |     reference_name,
 9 |     COUNT(call.call_set_name) WITHIN RECORD AS sample_count
10 |   FROM
11 |     [google.com:biggene:pgp_20150205.genome_calls]
12 |   # The source data was Complete Genomics which includes non-variant segments.
13 |   OMIT RECORD IF EVERY(alternate_bases IS NULL))
14 | GROUP BY
15 |   reference_name
16 | ORDER BY
17 |   reference_name
18 | 


--------------------------------------------------------------------------------
/1000genomes/sql/phenotype_sql/ethnicity-by-superpop-ratio.sql:
--------------------------------------------------------------------------------
 1 | # Ratios of ethnicities grouped by super population
 2 | SELECT
 3 |   super_population,
 4 |   super_population_description,
 5 |   super_population_count,
 6 |   RATIO_TO_REPORT(super_population_count)
 7 | OVER
 8 |   (
 9 |   ORDER BY
10 |     super_population_count) AS super_population_ratio
11 | from(
12 |   SELECT
13 |     super_population,
14 |     super_population_description,
15 |     COUNT(population) AS super_population_count,
16 |   FROM
17 |     [genomics-public-data:1000_genomes.sample_info]
18 |   WHERE
19 |     In_Phase1_Integrated_Variant_Set = TRUE
20 |   GROUP BY
21 |     super_population,
22 |     super_population_description)


--------------------------------------------------------------------------------
/1000genomes/sql/reproducing-vcfstats/sample-indel-counts-brca1.sql:
--------------------------------------------------------------------------------
 1 | # Sample INDEL counts for BRCA1.
 2 | SELECT
 3 |   COUNT(sample_id) AS variant_count,
 4 |   sample_id,
 5 | FROM (
 6 |   SELECT
 7 |     call.call_set_name AS sample_id,
 8 |     NTH(1,
 9 |       call.genotype) WITHIN call AS first_allele,
10 |     NTH(2,
11 |       call.genotype) WITHIN call AS second_allele,
12 |   FROM
13 |     [genomics-public-data:1000_genomes.variants]
14 |   WHERE
15 |     reference_name = '17'
16 |     AND start BETWEEN 41196311
17 |     AND 41277499
18 |     AND vt ='INDEL'
19 |   HAVING
20 |     0 < first_allele
21 |     OR 0 < second_allele)
22 | GROUP BY
23 |   sample_id
24 | ORDER BY
25 |   sample_id
26 | 


--------------------------------------------------------------------------------
/1000genomes/sql/understanding-alternate-alleles/unique-key.sql:
--------------------------------------------------------------------------------
 1 | # This query demonstrates that an additional field, 'end', is needed to  
 2 | # comprise a unique key for the rows in the table.
 3 | SELECT
 4 |   reference_name,
 5 |   start,
 6 |   reference_bases,
 7 |   alt,
 8 |   vt,
 9 |   end,
10 |   COUNT(1) AS cnt
11 | FROM (
12 |   SELECT
13 |     reference_name,
14 |     start,
15 |     reference_bases,
16 |     GROUP_CONCAT(alternate_bases) WITHIN RECORD AS alt,
17 |     vt,
18 |     end,
19 |   FROM
20 |     [genomics-public-data:1000_genomes.variants])
21 |   GROUP EACH BY
22 |   reference_name,
23 |   start,
24 |   reference_bases,
25 |   alt,
26 |   vt,
27 |   end
28 | HAVING
29 |   cnt > 1


--------------------------------------------------------------------------------
/1000genomes/sql/understanding-alternate-alleles/not-quite-unique-key.sql:
--------------------------------------------------------------------------------
 1 | # This query demonstrates that some additional field is needed to
 2 | # comprise a unique key for the rows in the table.
 3 | SELECT
 4 |   reference_name,
 5 |   start,
 6 |   reference_bases,
 7 |   alt,
 8 |   vt,
 9 |   COUNT(1) AS cnt
10 | FROM (
11 |   SELECT
12 |     reference_name,
13 |     start,
14 |     reference_bases,
15 |     GROUP_CONCAT(alternate_bases) WITHIN RECORD AS alt,
16 |     vt,
17 |   FROM
18 |     [genomics-public-data:1000_genomes.variants])
19 |   GROUP EACH BY
20 |   reference_name,
21 |   start,
22 |   reference_bases,
23 |   alt,
24 |   vt
25 | HAVING
26 |   cnt > 1
27 | ORDER BY
28 |   reference_name
29 | 


--------------------------------------------------------------------------------
/1000genomes/sql/ratio-of-dbsnp-variants-by-chromosome.sql:
--------------------------------------------------------------------------------
 1 | #standardSQL
 2 | --
 3 | -- Get the proportion of variants (per chromosome) in the dataset
 4 | -- that have been reported in the dbSNP database (version 132).
 5 | --
 6 | WITH
 7 |   counts AS (
 8 |   SELECT
 9 |     reference_name,
10 |     COUNT(1) AS num_variants,
11 |     COUNTIF(ARRAY_LENGTH(names) > 0) AS num_dbsnp_variants
12 |   FROM
13 |     `genomics-public-data.1000_genomes.variants`
14 |   GROUP BY
15 |     reference_name )
16 |   --
17 |   -- Compute the ratio.
18 | SELECT
19 |   reference_name,
20 |   num_dbsnp_variants,
21 |   num_variants,
22 |   num_dbsnp_variants / num_variants AS frequency
23 | FROM
24 |   counts
25 | ORDER BY
26 |   num_variants DESC
27 | 


--------------------------------------------------------------------------------
/1000genomes/sql/indel-length-counts.sql:
--------------------------------------------------------------------------------
 1 | # Count the number of INDELs differing from the reference allele by particular lengths
 2 | SELECT
 3 |   length_difference,
 4 |   COUNT(length_difference) AS count_of_indels_with_length_difference,
 5 | FROM (
 6 |   SELECT
 7 |     reference_name,
 8 |     start,
 9 |     reference_bases,
10 |     LENGTH(reference_bases) AS ref_length,
11 |     alternate_bases AS allele,
12 |     LENGTH(alternate_bases) AS allele_length,
13 |     (LENGTH(alternate_bases) - LENGTH(reference_bases)) AS length_difference,
14 |     FROM
15 |       [genomics-public-data:1000_genomes.variants]
16 |     WHERE
17 |       vt ='INDEL'
18 |     )
19 | GROUP BY
20 |   length_difference
21 | ORDER BY
22 |   length_difference
23 | 


--------------------------------------------------------------------------------
/1000genomes/sql/private-variant-counts.sql:
--------------------------------------------------------------------------------
 1 | # Compute the number of variants for a particular sample that are shared by
 2 | # no other samples.
 3 | SELECT
 4 |   COUNT(sample_id) AS private_variants_count,
 5 |   sample_id
 6 | FROM
 7 |   (
 8 |   SELECT
 9 |     reference_name,
10 |     start,
11 |     reference_bases,
12 |     IF(0 < call.genotype,
13 |       call.call_set_name,
14 |       NULL) AS sample_id,
15 |     SUM(IF(0 < call.genotype,
16 |         1,
17 |         0)) WITHIN RECORD AS num_samples_with_variant
18 |   FROM
19 |     [genomics-public-data:1000_genomes.variants]
20 |   HAVING
21 |     num_samples_with_variant = 1
22 |     AND sample_id IS NOT NULL)
23 | GROUP EACH BY
24 |   sample_id
25 | ORDER BY
26 |   sample_id
27 | 


--------------------------------------------------------------------------------
/annotations/README.md:
--------------------------------------------------------------------------------
 1 | Annotations
 2 | ============
 3 | 
 4 | Tute Genomics has provided a table of annotations for hg19 SNPs.
 5 | 
 6 | * For example queries showing how these tables can be used with variant data,
 7 | see the [Platinum Genomes Annotation JOINs](../platinumGenomes) data story.
 8 | * Please see [Google Genomics Public Data](http://googlegenomics.readthedocs.io/en/latest/use_cases/discover_public_data/tute_genomics_public_data.html)
 9 | for more detail.
10 | 
11 | A handful of other annotation databases have been loaded to BigQuery for
12 | prototyping purposes.  See [provenance](./provenance) for details on the
13 | source of this data and how it may have been transformed prior to loading
14 | to BigQuery.
15 | 


--------------------------------------------------------------------------------
/pgp/sql/issues-with-the-variant-centric-approach/factor-v-leiden.sql:
--------------------------------------------------------------------------------
 1 | # Sample level data for rs6025 and hereditary thrombophilia trait  
 2 | # for use in the Factor V Leiden data story. 
 3 |  SELECT
 4 |   sample_id,
 5 |   chromosome,
 6 |   locusBegin,
 7 |   locusEnd,
 8 |   reference,
 9 |   allele1Seq,
10 |   allele2Seq,
11 |   zygosity,
12 |   has_Hereditary_thrombophilia_includes_Factor_V_Leiden_and_Prothrombin_G20210A AS has_Hereditary_thrombophilia
13 | FROM
14 |   [google.com:biggene:pgp.cgi_variants] AS var
15 | LEFT OUTER JOIN
16 |   [google.com:biggene:pgp.phenotypes] AS pheno
17 | ON
18 |   pheno.Participant = var.sample_id
19 |   WHERE
20 |   chromosome = 'chr1'
21 |   AND locusBegin <= 169519048
22 |   AND locusEnd >= 169519049
23 | ORDER BY
24 |   sample_id
25 | 


--------------------------------------------------------------------------------
/1000genomes/data-stories/README.md:
--------------------------------------------------------------------------------
 1 | Data Stories
 2 | ==========================
 3 | 
 4 | The following sections demonstrate some interactive exploration within the 1,000 Genomes data set.  
 5 | 
 6 |  * Getting Familiar with the Data
 7 |    * [Exploring the sample information data](./exploring-the-phenotypic-data)
 8 |    * [Exploring the variant data](./exploring-the-variant-data)
 9 |    * [Understanding Alternate Alleles in 1,000 Genomes](./understanding-alternate-alleles)
10 |  * Comparisons to Common Tools and Research Results
11 |    * [Reproducing the output of vcfstats](./reproducing-vcfstats)
12 |    * [Reproducing Allelic Frequencies](./reproducing-allelic-frequencies)
13 |    * [Reproducing the Hardy-Weinberg Equilibrium test](./reproducing-hardy-weinberg-equilibrium)
14 | 
15 | 


--------------------------------------------------------------------------------
/1000genomes/sql/reproducing-vcfstats/shared-variant-counts-brca1.sql:
--------------------------------------------------------------------------------
 1 | # Count the number of variants shared by none, shared by one sample, shared by
 2 | # two samples, etc... in BRCA1
 3 | SELECT
 4 |   num_samples_with_variant,
 5 |   COUNT(1) AS num_variants_shared_by_this_many_samples
 6 | FROM (
 7 |   SELECT
 8 |     reference_name,
 9 |     start,
10 |     END,
11 |     reference_bases,
12 |     GROUP_CONCAT(alternate_bases) WITHIN RECORD AS alt,
13 |     SUM(NOT EVERY(call.genotype <= 0)) WITHIN call AS num_samples_with_variant
14 |   FROM
15 |     [genomics-public-data:1000_genomes.variants]
16 |   WHERE
17 |     reference_name = '17'
18 |     AND start BETWEEN 41196311
19 |     AND 41277499
20 |     )
21 | GROUP BY
22 |   num_samples_with_variant
23 | ORDER BY
24 |   num_samples_with_variant


--------------------------------------------------------------------------------
/1000genomes/sql/phenotype_sql/ethnicity-ratio.sql:
--------------------------------------------------------------------------------
 1 | # Compute sample count and ratio by ethnicity
 2 | SELECT
 3 |   population,
 4 |   population_description,
 5 |   population_count,
 6 |   RATIO_TO_REPORT(population_count)
 7 | OVER
 8 |   (
 9 |   ORDER BY
10 |     population_count) AS population_ratio,
11 |   super_population,
12 |   super_population_description,
13 | from(
14 |   SELECT
15 |     population,
16 |     population_description,
17 |     super_population,
18 |     super_population_description,
19 |     COUNT(population) AS population_count,
20 |   FROM
21 |     [genomics-public-data:1000_genomes.sample_info]
22 |   WHERE
23 |     In_Phase1_Integrated_Variant_Set = TRUE
24 |   GROUP BY
25 |     population,
26 |     population_description,
27 |     super_population,
28 |     super_population_description)
29 | 


--------------------------------------------------------------------------------
/1000genomes/sql/shared-variant-counts.sql:
--------------------------------------------------------------------------------
 1 | #standardSQL
 2 | --
 3 | -- Count the number of variants shared by none, shared by one sample, two samples, etc...
 4 | --
 5 | SELECT
 6 |   num_samples_with_variant,
 7 |   COUNT(1) AS num_variants_shared_by_this_many_samples
 8 | FROM (
 9 |   SELECT
10 |     reference_name,
11 |     start,
12 |     `end`,
13 |     reference_bases,
14 |     alternate_bases[ORDINAL(1)] AS alt,  -- 1000 Genomes is biallelic.
15 |     (SELECT COUNTIF(EXISTS(SELECT gt
16 |                           FROM UNNEST(call.genotype) gt
17 |                           WHERE gt >= 1)) FROM v.call) AS num_samples_with_variant
18 |   FROM
19 |     `genomics-public-data.1000_genomes.variants` v
20 |   WHERE
21 |     reference_name NOT IN ("X", "Y", "MT"))
22 | GROUP BY
23 |   num_samples_with_variant
24 | ORDER BY
25 |   num_samples_with_variant
26 | 


--------------------------------------------------------------------------------
/1000genomes/sql/understanding-alternate-alleles/sample-likelihood.sql:
--------------------------------------------------------------------------------
 1 | # Get data sufficient to make a judgment upon this particular sample's call:
 2 | # variant-level fields (filters, avgpost, rsq, vt) together with the phase set,
 3 | # genotype, ds and genotype likelihoods of sample HG00100 at 17:48515942.
 4 | SELECT
 5 |   reference_name,
 6 |   start,
 7 |   reference_bases AS ref,
 8 |   GROUP_CONCAT(alternate_bases) WITHIN RECORD AS alt,
 9 |   GROUP_CONCAT(filter) WITHIN RECORD AS filters,
10 |   avgpost,
11 |   rsq,
12 |   vt,
13 |   call.call_set_name AS sample_id,
14 |   call.phaseset AS phaseset,
15 |   NTH(1, call.genotype) WITHIN call AS first_allele,
16 |   NTH(2, call.genotype) WITHIN call AS second_allele,
17 |   call.ds AS ds,
18 |   GROUP_CONCAT(STRING(call.genotype_likelihood)) WITHIN call AS likelihoods,
19 | FROM
20 |   [genomics-public-data:1000_genomes.variants]
21 | WHERE
22 |   reference_name = '17'
23 |   AND start = 48515942
24 | HAVING
25 |   sample_id = 'HG00100'
26 | ORDER BY
27 |   alt
28 | 


--------------------------------------------------------------------------------
/pgp/data-stories/README.md:
--------------------------------------------------------------------------------
 1 | Data Stories
 2 | ==========================
 3 | 
 4 | The following sections demonstrate some interactive exploration within the PGP dataset.
 5 | 
 6 |  * Getting Familiar with the Data
 7 |    * [Comparing PGP to 1000 Genomes](./comparing-pgp-to-1000genomes)
 8 |    * [Issues with the Variant-Centric Approach](./issues-with-the-variant-centric-approach)
 9 |    * [A Comparison of Schemas and Data Encodings](./schema-comparisons)
10 |    
11 |    
12 | Have other data stories you would like to see here?  Have any data stories you would like to *share*?  Have *corrections to the biology* covered in this material?  Have query *simplifications* or *speed improvements*?  Let us know by [filing an issue](https://github.com/googlegenomics/bigquery-examples/issues) or [contacting us directly](mailto:google-genomics-contact@googlegroups.com).
13 | 


--------------------------------------------------------------------------------
/1000genomes/sql/reproducing-vcfstats/indel-length-counts-brca1.sql:
--------------------------------------------------------------------------------
 1 | # Count the number of INDELs differing from the reference allele by particular 
 2 | # lengths for BRCA1.
 3 | SELECT
 4 |   length_difference,
 5 |   COUNT(length_difference) AS count_of_indels_with_length_difference,
 6 | FROM (
 7 |   SELECT
 8 |     reference_name,
 9 |     start,
10 |     reference_bases,
11 |     LENGTH(reference_bases) AS ref_length,
12 |     alternate_bases AS allele,
13 |     LENGTH(alternate_bases) AS allele_length,
14 |     (LENGTH(alternate_bases) - LENGTH(reference_bases)) AS length_difference,
15 |     FROM
16 |       [genomics-public-data:1000_genomes.variants]
17 |     WHERE
18 |       reference_name = '17'
19 |       AND start BETWEEN 41196311
20 |       AND 41277499
21 |       AND vt ='INDEL'
22 |     )
23 | GROUP BY
24 |   length_difference
25 | ORDER BY
26 |   length_difference


--------------------------------------------------------------------------------
/1000genomes/sql/reproducing-vcfstats/private-variant-counts-brca1.sql:
--------------------------------------------------------------------------------
 1 | # Compute the number of variants within BRCA1 for a particular sample that are shared by
 2 | # no other samples.
 3 | SELECT
 4 |   COUNT(sample_id) AS private_variants_count,
 5 |   sample_id
 6 | FROM
 7 |   (
 8 |   SELECT
 9 |     reference_name,
10 |     start,
11 |     reference_bases,
12 |     IF(0 < call.genotype,
13 |       call.call_set_name,
14 |       NULL) AS sample_id,
15 |     SUM(IF(0 < call.genotype,
16 |         1,
17 |         0)) WITHIN RECORD AS num_samples_with_variant
18 |   FROM
19 |     [genomics-public-data:1000_genomes.variants]
20 |   WHERE
21 |     reference_name = '17'
22 |     AND start BETWEEN 41196311
23 |     AND 41277499
24 |   HAVING
25 |     num_samples_with_variant = 1
26 |     AND sample_id IS NOT NULL)
27 | GROUP EACH BY
28 |   sample_id
29 | ORDER BY
30 |   sample_id
31 | 


--------------------------------------------------------------------------------
/1000genomes/sql/sample-variant-counts-by-type-and-chromosome.sql:
--------------------------------------------------------------------------------
 1 | # Count the number of variants for each sample across the entirety of the 1,000
 2 | # Genomes dataset by variant type and chromosome.
 3 | SELECT
 4 |   reference_name,
 5 |   vt,
 6 |   sample_id,
 7 |   COUNT(sample_id) AS variant_count,
 8 | FROM
 9 |   (
10 |   SELECT
11 |     reference_name,
12 |     vt,
13 |     call.call_set_name AS sample_id,
14 |     NTH(1,
15 |       call.genotype) WITHIN call AS first_allele,
16 |     NTH(2,
17 |       call.genotype) WITHIN call AS second_allele,
18 |   FROM
19 |     [genomics-public-data:1000_genomes.variants]
20 |   HAVING
21 |     first_allele > 0
22 |     OR (second_allele IS NOT NULL
23 |         AND second_allele > 0))
24 | GROUP BY
25 |   sample_id,
26 |   reference_name,
27 |   vt
28 | ORDER BY
29 |   reference_name,
30 |   vt,
31 |   variant_count,
32 |   sample_id
33 | 


--------------------------------------------------------------------------------
/pgp/sql/issues-with-the-variant-centric-approach/factor-v-leiden-summary.sql:
--------------------------------------------------------------------------------
 1 | # Summary data for rs6025 and hereditary thrombophilia trait  
 2 | # for use in the Factor V Leiden data story. 
 3 |  SELECT
 4 |   COUNT(sample_id) AS sample_counts,
 5 |   chromosome,
 6 |   reference,
 7 |   allele1Seq,
 8 |   allele2Seq,
 9 |   has_Hereditary_thrombophilia_includes_Factor_V_Leiden_and_Prothrombin_G20210A AS has_Hereditary_thrombophilia
10 | FROM
11 |   [google.com:biggene:pgp.cgi_variants] AS var
12 | LEFT OUTER JOIN
13 |   [google.com:biggene:pgp.phenotypes] AS pheno
14 | ON
15 |   pheno.Participant = var.sample_id
16 |   WHERE
17 |   chromosome = 'chr1'
18 |   AND locusBegin <= 169519048
19 |   AND locusEnd >= 169519049
20 | GROUP BY
21 |   chromosome,
22 |   reference,
23 |   allele1Seq,
24 |   allele2Seq,
25 |   has_Hereditary_thrombophilia
26 | ORDER BY
27 |   sample_counts DESC
28 | 


--------------------------------------------------------------------------------
/pgp/sql/gvcf_variants_expanded/klotho.sql:
--------------------------------------------------------------------------------
 1 | # Sample level data for Klotho variant rs9536314 for use in the "amazing
 2 | # intelligence of PGP participants" data story, specifically joining two 
 3 | # tables to compare the different encodings.
 4 | SELECT
 5 |   contig_name,
 6 |   start_pos,
 7 |   end_pos,
 8 |   END,
 9 |   ref,
10 |   alt,
11 |   sample_id,
12 |   genotype
13 | FROM
14 |   FLATTEN(
15 |   SELECT
16 |     contig_name,
17 |     start_pos,
18 |     end_pos,
19 |     END,
20 |     reference_bases AS ref,
21 |     GROUP_CONCAT(alternate_bases) WITHIN RECORD AS alt,
22 |     call.callset_name AS sample_id,
23 |     GROUP_CONCAT(STRING(call.genotype),
24 |       '/') WITHIN call AS genotype,
25 |   FROM
26 |     [google.com:biggene:test.pgp_gvcf_variants_expanded2]
27 |   WHERE
28 |     contig_name = '13'
29 |     AND start_pos == 33628138
30 |     , call)
31 | ORDER BY
32 |   sample_id
33 | 


--------------------------------------------------------------------------------
/1000genomes/sql/sample-level-data-for-brca1.sql:
--------------------------------------------------------------------------------
 1 | #standardSQL
 2 | --
 3 | -- Retrieve sample-level information for BRCA1 variants.
 4 | --
 5 | SELECT
 6 |   reference_name,
 7 |   start,
 8 |   `end`,
 9 |   reference_bases,
10 |   ARRAY_TO_STRING(v.alternate_bases, ',') AS alts,
11 |   quality,
12 |   ARRAY_TO_STRING(v.filter, ',') AS filters,
13 |   vt,
14 |   ARRAY_TO_STRING(v.names, ',') AS names,
15 |   call.call_set_name,
16 |   call.phaseset,
17 |   (SELECT STRING_AGG(CAST(gt AS STRING)) from UNNEST(call.genotype) gt) AS genotype,
18 |   call.ds,
19 |   (SELECT STRING_AGG(CAST(lh AS STRING)) from UNNEST(call.genotype_likelihood) lh) AS likelihoods
20 | FROM
21 |   `genomics-public-data.1000_genomes.variants` v, v.call call
22 | WHERE
23 |   reference_name IN ('17', 'chr17')
24 |   AND start BETWEEN 41196311 AND 41277499 # per GRCh37
25 |   AND call_set_name = 'HG00100'
26 | ORDER BY
27 |   start,
28 |   alts
29 | 


--------------------------------------------------------------------------------
/pgp/sql/schema-comparisons/missingness-klotho.sql:
--------------------------------------------------------------------------------
 1 | # Missingness rate for Klotho variant rs9536314 in the "amazing
 2 | # intelligence of PGP participants" data story.
 3 | SELECT
 4 |   COUNT(sample_id) AS num_samples_called_for_position,
 5 |   SUM(called_count) AS num_alleles_called_for_position,
 6 |   1 - (SUM(called_count)/(172*2)) AS missingness_rate
 7 | FROM (
 8 |   SELECT
 9 |     contig_name,
10 |     start_pos,
11 |     end_pos,
12 |     END,
13 |     reference_bases,
14 |     GROUP_CONCAT(alternate_bases) WITHIN RECORD AS alt,
15 |     call.callset_name AS sample_id,
16 |     GROUP_CONCAT(STRING(call.genotype),
17 |       '/') WITHIN call AS genotype,
18 |     SUM(call.genotype >= 0) WITHIN RECORD as called_count,
19 |   FROM
20 |     [google.com:biggene:pgp.gvcf_variants]
21 |   WHERE
22 |     contig_name = '13'
23 |     AND start_pos <= 33628138
24 |     AND (end_pos = 33628139
25 |       OR END >= 33628139)
26 |     )
27 | 


--------------------------------------------------------------------------------
/pgp/sql/gvcf_variants/klotho.sql:
--------------------------------------------------------------------------------
 1 | # Sample-level data for Klotho variant rs9536314, used by the "amazing
 2 | # intelligence of PGP participants" data story to compare encodings.
 3 | # NOTE(review): this query reads a single table; any join happens elsewhere.
 4 | SELECT
 5 |   contig_name,
 6 |   start_pos,
 7 |   end_pos,
 8 |   END,
 9 |   ref,
10 |   alt,
11 |   sample_id,
12 |   genotype
13 | FROM
14 |   FLATTEN(  # flatten on call: one output row per sample call
15 |   SELECT
16 |     contig_name,
17 |     start_pos,
18 |     end_pos,
19 |     END,
20 |     reference_bases AS ref,
21 |     GROUP_CONCAT(alternate_bases) WITHIN RECORD AS alt,
22 |     call.callset_name AS sample_id,
23 |     GROUP_CONCAT(STRING(call.genotype),
24 |       '/') WITHIN call AS genotype,  # allele values joined with '/'
25 |   FROM
26 |     [google.com:biggene:pgp.gvcf_variants]
27 |   WHERE
28 |     contig_name = '13'
29 |     AND start_pos <= 33628138
30 |     AND (end_pos >= 33628139  # record must reach 33628139 via end_pos or END
31 |       OR END >= 33628139)
32 |     ,
33 |     call)
34 | ORDER BY
35 |   sample_id
36 | 


--------------------------------------------------------------------------------
/1000genomes/sql/reproducing-vcfstats/variant-sample-counts-brca1.sql:
--------------------------------------------------------------------------------
 1 | # Count the number of samples that have each BRCA1 SNP.
 2 | SELECT
 3 |   reference_name,
 4 |   start,
 5 |   reference_bases,
 6 |   alt,  # grouping key; emitted so rows sharing start/reference are distinguishable
 7 |   SUM(first_allele > 0 OR second_allele > 0) AS num_samples_with_variant
 8 |   FROM(
 9 |     SELECT
10 |       reference_name,
11 |       start,
12 |       reference_bases,
13 |       GROUP_CONCAT(alternate_bases) WITHIN RECORD AS alt,
14 |       NTH(1,
15 |         call.genotype) WITHIN call AS first_allele,
16 |       NTH(2,
17 |         call.genotype) WITHIN call AS second_allele,
18 |     FROM
19 |       [genomics-public-data:1000_genomes.variants]
20 |     WHERE
21 |       reference_name = '17'
22 |       AND start BETWEEN 41196311
23 |       AND 41277499
24 |       AND vt ='SNP'
25 |       )
26 |   GROUP BY
27 |     reference_name,
28 |     start,
29 |     reference_bases,
30 |     alt
31 | ORDER BY
32 |   num_samples_with_variant,
33 |   start
34 | 


--------------------------------------------------------------------------------
/pgp/sql/comparing-pgp-to-1000genomes/variant-counts-by-chromosome.sql:
--------------------------------------------------------------------------------
 1 | # Per-chromosome variant record counts for 1,000 Genomes and PGP.
 2 | SELECT
 3 |   reference_name,
 4 |   cnt,
 5 |   dataset
 6 | FROM (
 7 |   SELECT
 8 |     # Drop the 'chr' prefix (and map chrM to MT) so that the names line up
 9 |     # with those used by 1,000 Genomes.
10 |     IF(reference_name = 'chrM', 'MT', SUBSTR(reference_name, 4)) AS reference_name,
11 |     COUNT(reference_name) AS cnt,
12 |     'PGP' AS dataset
13 |   FROM
14 |     [google.com:biggene:pgp_20150205.genome_calls]
15 |   # Complete Genomics source data carries non-variant segments; drop them.
16 |   OMIT RECORD IF EVERY(alternate_bases IS NULL)
17 |   GROUP BY
18 |     reference_name),
19 |   (
20 |   SELECT
21 |     reference_name,
22 |     COUNT(reference_name) AS cnt,
23 |     '1000Genomes' AS dataset
24 |   FROM
25 |     [genomics-public-data:1000_genomes.variants]
26 |   GROUP BY
27 |     reference_name)
28 | ORDER BY
29 |   reference_name,
30 |   dataset
31 | 


--------------------------------------------------------------------------------
/1000genomes/sql/understanding-alternate-alleles/count-by-var-type-chrom-pos-ref-dups.sql:
--------------------------------------------------------------------------------
 1 | # For chromosome 17 records whose (start, reference base) is shared by more
 2 | # than one record, count the records per variant type.
 3 | SELECT
 4 |   vt,
 5 |   COUNT(vt) AS num_variant_type
 6 | FROM
 7 |   [genomics-public-data:1000_genomes.variants] AS v
 8 | JOIN (  # sites (start + reference base) seen in more than one record
 9 |   SELECT
10 |     reference_name,
11 |     start,
12 |     reference_bases,
13 |     COUNT(start) AS num_alternates
14 |   FROM
15 |     [genomics-public-data:1000_genomes.variants]
16 |   WHERE
17 |     reference_name = '17'
18 |   GROUP EACH BY
19 |     reference_name,
20 |     start,
21 |     reference_bases
22 |   HAVING
23 |     num_alternates > 1) AS repeated_sites
24 | ON
25 |   v.start = repeated_sites.start
26 |   AND v.reference_bases = repeated_sites.reference_bases
27 |   AND v.reference_name = repeated_sites.reference_name
28 | WHERE
29 |   v.reference_name = '17'
30 | GROUP EACH BY
31 |   vt
32 | ORDER BY
33 |   vt
34 | 


--------------------------------------------------------------------------------
/pgp/sql/comparing-pgp-to-1000genomes/parsed-genotype-counts.sql:
--------------------------------------------------------------------------------
 1 | # Count sample genotypes per (first allele, second allele) pair and dataset.
 2 | SELECT
 3 |   first_allele,
 4 |   second_allele,
 5 |   dataset,
 6 |   # Convert integer to float to avoid numeric overflow in R for integers.
 7 |   FLOAT(COUNT(1)) AS cnt
 8 | FROM (
 9 |   SELECT
10 |     NTH(1, call.genotype) WITHIN call AS first_allele,  # first genotype value of each call
11 |     NTH(2, call.genotype) WITHIN call AS second_allele,  # second genotype value of each call
12 |     '1000Genomes' AS dataset
13 |   FROM
14 |     [genomics-public-data:1000_genomes.variants]
15 |   OMIT RECORD IF reference_name IN ('X', 'Y', 'MT')),  # skip records on X, Y and MT
16 |   (
17 |   SELECT
18 |     NTH(1, call.genotype) WITHIN call AS first_allele,
19 |     NTH(2, call.genotype) WITHIN call AS second_allele,
20 |     'PGP' AS dataset
21 |   FROM
22 |     [google.com:biggene:pgp_20150205.genome_calls]
23 |   OMIT RECORD IF reference_name IN ('chrX', 'chrY', 'chrM'))  # same exclusion, 'chr'-prefixed names
24 | GROUP BY
25 |   first_allele,
26 |   second_allele,
27 |   dataset
28 | 


--------------------------------------------------------------------------------
/1000genomes/sql/understanding-alternate-alleles/count-by-var-type-chrom-pos-ref-singles.sql:
--------------------------------------------------------------------------------
 1 | # For chromosome 17 records whose (start, reference base) appears in exactly
 2 | # one record, count the records per variant type.
 3 | SELECT
 4 |   vt,
 5 |   COUNT(vt) AS num_variant_type
 6 | FROM
 7 |   [genomics-public-data:1000_genomes.variants] AS v
 8 | JOIN EACH (  # sites (start + reference base) seen in exactly one record
 9 |   SELECT
10 |     reference_name,
11 |     start,
12 |     reference_bases,
13 |     COUNT(start) AS num_alternates
14 |   FROM
15 |     [genomics-public-data:1000_genomes.variants]
16 |   WHERE
17 |     reference_name = '17'
18 |   GROUP EACH BY
19 |     reference_name,
20 |     start,
21 |     reference_bases
22 |   HAVING
23 |     num_alternates = 1) AS unique_sites
24 | ON
25 |   v.start = unique_sites.start
26 |   AND v.reference_bases = unique_sites.reference_bases
27 |   AND v.reference_name = unique_sites.reference_name
28 | WHERE
29 |   v.reference_name = '17'
30 | GROUP EACH BY
31 |   vt
32 | ORDER BY
33 |   vt
34 | 


--------------------------------------------------------------------------------
/1000genomes/sql/ti-tv-ratio.sql:
--------------------------------------------------------------------------------
 1 | # Ti/Tv (transition/transversion) ratio across all 1,000 Genomes SNPs.
 2 | SELECT
 3 |   transitions,
 4 |   transversions,
 5 |   transitions/transversions AS titv
 6 | FROM (
 7 |   SELECT
 8 |     # Transitions: purine<->purine or pyrimidine<->pyrimidine.
 9 |     SUM(IF(mutation IN ('A->G', 'G->A', 'C->T', 'T->C'),
10 |         INTEGER(num_snps),
11 |         INTEGER(0))) AS transitions,
12 |     # Transversions: the remaining eight single-base substitutions.
13 |     SUM(IF(mutation IN ('A->C', 'C->A', 'G->T', 'T->G',
14 |           'A->T', 'T->A', 'C->G', 'G->C'),
15 |         INTEGER(num_snps),
16 |         INTEGER(0))) AS transversions
17 |   FROM (
18 |     # One row per distinct "ref->alt" substitution with its SNP count.
19 |     SELECT
20 |       CONCAT(reference_bases,
21 |         CONCAT(STRING('->'),
22 |           alternate_bases)) AS mutation,
23 |       COUNT(alternate_bases) AS num_snps
24 |     FROM
25 |       [genomics-public-data:1000_genomes.variants]
26 |     WHERE
27 |       vt = 'SNP'
28 |     GROUP BY
29 |       mutation
30 |     ORDER BY
31 |       mutation))
32 | 


--------------------------------------------------------------------------------
/pgp/sql/comparing-pgp-to-1000genomes/taking-a-closer-look-at-variant-types.sql:
--------------------------------------------------------------------------------
 1 | # Inner SELECT filters just the records in which we are interested.
 2 | # Outer SELECT performs our analysis, in this case just a count of the genotypes
 3 | # at a particular position in chromosome 3.
 4 | SELECT
 5 |   reference_name,
 6 |   start,
 7 |   reference_bases,
 8 |   alternate_bases,
 9 |   genotype,
10 |   COUNT(genotype) AS number_of_individuals,
11 | FROM (
12 |   SELECT
13 |     reference_name,
14 |     start,
15 |     reference_bases,
16 |     GROUP_CONCAT(alternate_bases) WITHIN RECORD AS alternate_bases,  # all alternates of the record as one string
17 |     call.callset_name,
18 |     GROUP_CONCAT(STRING(call.genotype)) WITHIN call AS genotype,  # one genotype string per call
19 |   FROM
20 |     [google.com:biggene:pgp_20150205.genome_calls]
21 |   WHERE
22 |     reference_name = 'chr3'
23 |     AND start = 65440409)
24 | GROUP BY
25 |   reference_name,
26 |   start,
27 |   reference_bases,
28 |   alternate_bases,
29 |   genotype
30 | ORDER BY
31 |   alternate_bases,
32 |   number_of_individuals DESC  # most frequent genotype first within each alternate
33 | 


--------------------------------------------------------------------------------
/1000genomes/sql/heterozygous-homozygous-ratio.sql:
--------------------------------------------------------------------------------
 1 | # Count the homozygous and heterozygous variants for each sample across the
 2 | # 1,000 Genomes dataset, skipping the Y and mitochondrial chromosomes.
 3 | SELECT
 4 |   sample_id,
 5 |   SUM(IF(0 = first_allele
 6 |       AND 0 = second_allele,
 7 |       1,
 8 |       0)) AS hom_RR_count,
 9 |   SUM(IF(first_allele = second_allele
10 |       AND first_allele > 0,
11 |       1,
12 |       0)) AS hom_AA_count,
13 |   SUM(IF((first_allele != second_allele
14 |         OR second_allele IS NULL)
15 |       AND (first_allele > 0
16 |         OR second_allele > 0),
17 |       1,
18 |       0)) AS het_RA_count
19 | FROM (
20 |   SELECT
21 |     reference_name,
22 |     call.call_set_name AS sample_id,
23 |     NTH(1,
24 |       call.genotype) WITHIN call AS first_allele,
25 |     NTH(2,
26 |       call.genotype) WITHIN call AS second_allele,
27 |   FROM
28 |     [genomics-public-data:1000_genomes.variants]
29 |   WHERE  # the mitochondrial contig is named 'MT' in this table, not 'M'
30 |     reference_name != 'Y' AND reference_name != 'MT'
31 |   )
32 | GROUP BY
33 |   sample_id
34 | ORDER BY
35 |   sample_id
36 | 


--------------------------------------------------------------------------------
/1000genomes/sql/minimum-allelic-frequency-by-ethnicity.sql:
--------------------------------------------------------------------------------
 1 | # Per-sample SNP counts bucketed by allele frequency (af), with sample traits.
 2 | SELECT
 3 |   samples.call.call_set_name AS sample_id,
 4 |   gender,
 5 |   population,
 6 |   super_population,
 7 |   COUNT(samples.call.call_set_name) AS num_variants_for_sample,
 8 |   SUM(samples.af >= 0.05) AS common_variant,  # af >= 5%
 9 |   SUM(samples.af < 0.05 AND samples.af > 0.005) AS middle_variant,  # 0.5% < af < 5%
10 |   SUM(samples.af <= 0.005 AND samples.af > 0.001) AS rare_variant,  # 0.1% < af <= 0.5%
11 |   SUM(samples.af <= 0.001) AS very_rare_variant,  # af <= 0.1%
12 | FROM
13 |   FLATTEN((
14 |     SELECT
15 |       af,
16 |       vt,
17 |       call.call_set_name,
18 |     FROM
19 |       [genomics-public-data:1000_genomes.variants]
20 |     WHERE
21 |       vt = 'SNP'
22 |     OMIT call IF EVERY(call.genotype <= 0)),  # drop calls in which no genotype value is > 0
23 |     call) AS samples  # flatten on call: one row per remaining sample call
24 | JOIN
25 |   [genomics-public-data:1000_genomes.sample_info] p
26 | ON
27 |   samples.call.call_set_name = p.sample
28 | GROUP BY
29 |   sample_id,
30 |   gender,
31 |   population,
32 |   super_population
33 | ORDER BY
34 |   sample_id
35 | 


--------------------------------------------------------------------------------
/1000genomes/sql/variant-hotspots.sql:
--------------------------------------------------------------------------------
 1 | # Summarize the variant counts by windows of 10,000 start positions in order
 2 | # to identify variant hotspots within a chromosome for all samples.
 3 | SELECT
 4 |   reference_name,
 5 |   window,
 6 |   window * 10000 AS window_start,
 7 |   ((window * 10000) + 9999) AS window_end,
 8 |   MIN(start) AS min_variant_start,
 9 |   MAX(start) AS max_variant_start,
10 |   COUNT(sample_id) AS num_variants_in_window,  # counts sample calls, not distinct records
11 | FROM (
12 |   SELECT
13 |     reference_name,
14 |     start,
15 |     INTEGER(FLOOR(start / 10000)) AS window,
16 |     call.call_set_name AS sample_id,
17 |     NTH(1,
18 |       call.genotype) WITHIN call AS first_allele,
19 |     NTH(2,
20 |       call.genotype) WITHIN call AS second_allele,
21 |   FROM
22 |     [genomics-public-data:1000_genomes.variants]
23 |   HAVING  # keep only calls with at least one non-reference allele
24 |     first_allele > 0
25 |       OR (second_allele IS NOT NULL
26 |             AND second_allele > 0))
27 | GROUP BY
28 |   reference_name,
29 |   window,
30 |   window_start,
31 |   window_end
32 | ORDER BY
33 |   num_variants_in_window DESC,
34 |   window
35 | 


--------------------------------------------------------------------------------
/1000genomes/sql/reproducing-vcfstats/ti-tv-ratio-brca1.sql:
--------------------------------------------------------------------------------
 1 | # Ti/Tv (transition/transversion) ratio for SNPs within BRCA1.
 2 | SELECT
 3 |   transitions,
 4 |   transversions,
 5 |   transitions/transversions AS titv
 6 | FROM (
 7 |   SELECT
 8 |     # Transitions: purine<->purine or pyrimidine<->pyrimidine.
 9 |     SUM(IF(mutation IN ('A->G', 'G->A', 'C->T', 'T->C'),
10 |         INTEGER(num_snps),
11 |         INTEGER(0))) AS transitions,
12 |     # Transversions: the remaining eight single-base substitutions.
13 |     SUM(IF(mutation IN ('A->C', 'C->A', 'G->T', 'T->G',
14 |           'A->T', 'T->A', 'C->G', 'G->C'),
15 |         INTEGER(num_snps),
16 |         INTEGER(0))) AS transversions
17 |   FROM (
18 |     # One row per distinct "ref->alt" substitution with its SNP count.
19 |     SELECT
20 |       CONCAT(reference_bases,
21 |         CONCAT(STRING('->'),
22 |           alternate_bases)) AS mutation,
23 |       COUNT(alternate_bases) AS num_snps
24 |     FROM
25 |       [genomics-public-data:1000_genomes.variants]
26 |     WHERE
27 |       vt = 'SNP'
28 |       AND reference_name = '17'
29 |       AND start BETWEEN 41196311 AND 41277499
30 |     GROUP BY
31 |       mutation
32 |     ORDER BY
33 |       mutation))


--------------------------------------------------------------------------------
/platinumGenomes/sql/sample-snps-by-exonic-function.sql:
--------------------------------------------------------------------------------
 1 | #standardSQL
 2 |   -- Count variants by functional impact (ExonicFunc) for each sample in
 3 |   -- Platinum Genomes, using the Tute hg19 annotation table.
 4 |   -- NOTE(review): nothing here restricts the variants to SNPs -- confirm.
 5 | WITH
 6 |   sample_variants AS (
 7 |   SELECT
 8 |     REGEXP_EXTRACT(reference_name, r'chr(.+)') AS chr,
 9 |     start AS start,  -- NOTE(review): joined to annots.Start unchanged; confirm both share a coordinate base
10 |     reference_bases,
11 |     alt,
12 |     call.call_set_name
13 |   FROM
14 |     `genomics-public-data.platinum_genomes.variants` v,
15 |     v.call call,
16 |     v.alternate_bases alt WITH OFFSET alt_offset
17 |   WHERE
18 |     -- Require that at least one genotype matches this alternate.
19 |     EXISTS (SELECT gt FROM UNNEST(call.genotype) gt WHERE gt = alt_offset+1)
20 |     )
21 |   -- Join each sample's alternates to the annotations on chromosome, start,
22 |   -- reference bases and alternate bases, then count rows per ExonicFunc.
23 | SELECT
24 |   call_set_name,
25 |   ExonicFunc,
26 |   COUNT(ExonicFunc) AS variant_count
27 | FROM
28 |   `silver-wall-555.TuteTable.hg19` AS annots
29 | JOIN sample_variants AS vars
30 | ON
31 |   vars.chr = annots.Chr
32 |   AND vars.start = annots.Start
33 |   AND vars.reference_bases = annots.Ref
34 |   AND vars.alt = annots.Alt
35 | WHERE
36 |   ExonicFunc IS NOT NULL
37 | GROUP BY
38 |   call_set_name,
39 |   ExonicFunc
40 | ORDER BY
41 |   call_set_name,
42 |   ExonicFunc
43 | 


--------------------------------------------------------------------------------
/1000genomes/sql/sample-variant-hotspots.sql:
--------------------------------------------------------------------------------
 1 | # Summarize the variant counts for one sample by windows of 10,000 start
 2 | # positions in order to identify variant hotspots within a chromosome.
 3 | SELECT
 4 |   reference_name,
 5 |   window,
 6 |   window * 10000 AS window_start,
 7 |   ((window * 10000) + 9999) AS window_end,
 8 |   MIN(start) AS min_variant_start,
 9 |   MAX(start) AS max_variant_start,
10 |   sample_id,
11 |   COUNT(sample_id) AS num_variants_in_window,
12 | FROM (
13 |   SELECT
14 |     reference_name,
15 |     start,
16 |     INTEGER(FLOOR(start / 10000)) AS window,  # window index = start / 10000, rounded down
17 |     call.call_set_name AS sample_id,
18 |     NTH(1,
19 |       call.genotype) WITHIN call AS first_allele,
20 |     NTH(2,
21 |       call.genotype) WITHIN call AS second_allele,
22 |   FROM
23 |     [genomics-public-data:1000_genomes.variants]
24 |   WHERE
25 |     call.call_set_name = 'HG00096'
26 |   HAVING  # keep only calls with at least one non-reference allele
27 |     first_allele > 0
28 |     OR (second_allele IS NOT NULL
29 |         AND second_allele > 0))
30 | GROUP BY
31 |   reference_name,
32 |   window,
33 |   window_start,
34 |   window_end,
35 |   sample_id
36 | ORDER BY
37 |   num_variants_in_window DESC,
38 |   window
39 | 


--------------------------------------------------------------------------------
/1000genomes/sql/minimum-allelic-frequency-by-ethnicity-no-sex-chromosomes.sql:
--------------------------------------------------------------------------------
 1 | # Per-sample SNP counts bucketed by allele frequency (af), with sample traits,
 2 | # excluding the sex chromosomes (X and Y).
 3 | SELECT
 4 |   samples.call.call_set_name AS sample_id,
 5 |   gender,
 6 |   population,
 7 |   super_population,
 8 |   COUNT(samples.call.call_set_name) AS num_variants_for_sample,
 9 |   SUM(samples.af >= 0.05) AS common_variant,  # af >= 5%
10 |   SUM(samples.af < 0.05 AND samples.af > 0.005) AS middle_variant,  # 0.5% < af < 5%
11 |   SUM(samples.af <= 0.005 AND samples.af > 0.001) AS rare_variant,  # 0.1% < af <= 0.5%
12 |   SUM(samples.af <= 0.001) AS very_rare_variant,  # af <= 0.1%
13 | FROM
14 |   FLATTEN((
15 |     SELECT
16 |       af,
17 |       vt,
18 |       call.call_set_name,
19 |     FROM
20 |       [genomics-public-data:1000_genomes.variants]
21 |     WHERE
22 |       vt = 'SNP'
23 |       AND reference_name != 'X'
24 |       AND reference_name != 'Y'
25 |     OMIT call IF EVERY(call.genotype <= 0)),  # drop calls in which no genotype value is > 0
26 |     call) AS samples  # flatten on call: one row per remaining sample call
27 | JOIN
28 |   [genomics-public-data:1000_genomes.sample_info] p
29 | ON
30 |   samples.call.call_set_name = p.sample
31 | GROUP BY
32 |   sample_id,
33 |   gender,
34 |   population,
35 |   super_population
36 | ORDER BY
37 |   sample_id
38 | 


--------------------------------------------------------------------------------
/pgp/sql/gvcf_variants/allelic-frequency-comparison.sql:
--------------------------------------------------------------------------------
 1 | # PGP vs. 1,000 Genomes allelic frequency comparison for BRCA1 variants.
 2 | SELECT
 3 |   contig_name,
 4 |   pgp.reference_bases AS reference_bases,
 5 |   start_pos,
 6 |   allele,
 7 |   pgp_freq,
 8 |   af,
 9 |   eur_af,
10 |   afr_af,
11 |   asn_af,
12 |   amr_af
13 | FROM (
14 |     FLATTEN((
15 |       SELECT
16 |         reference_name,
17 |         start,
18 |         reference_bases,
19 |         alternate_bases,
20 |         AF,
21 |         AFR_AF,
22 |         AMR_AF,
23 |         ASN_AF,
24 |         EUR_AF
25 |       FROM
26 |         [genomics-public-data:1000_genomes.variants]),
27 |       alternate_bases)) AS kg  # flatten: one row per alternate allele
28 | JOIN
29 |   EACH (
30 |   SELECT
31 |     contig_name,
32 |     reference_bases,
33 |     start_pos,
34 |     allele,
35 |     freq AS pgp_freq
36 |   FROM
37 |     [google.com:biggene:pgp_analysis_results.gvcf_variants_allelic_frequency]
38 |     ) AS pgp
39 | ON
40 |   pgp.contig_name = kg.reference_name
41 |   AND pgp.start_pos = kg.start  # NOTE(review): confirm start_pos and start use the same coordinate base
42 |   AND pgp.reference_bases = kg.reference_bases
43 |   AND pgp.allele = kg.alternate_bases
44 | WHERE
45 |   kg.reference_name = '17'
46 |   AND kg.start BETWEEN 41196312  # NOTE(review): other BRCA1 queries on this table use 41196311..41277499; confirm
47 |   AND 41277500
48 | 


--------------------------------------------------------------------------------
/pgp/sql/cgi_variants/allelic-frequency-comparison.sql:
--------------------------------------------------------------------------------
 1 | # PGP vs. 1,000 Genomes allelic frequency comparison for BRCA1 variants.
 2 | SELECT
 3 |   chromosome,
 4 |   reference,
 5 |   locusBegin,
 6 |   locusEnd,
 7 |   allele,
 8 |   pgp_freq,
 9 |   af,
10 |   eur_af,
11 |   afr_af,
12 |   asn_af,
13 |   amr_af
14 | FROM (
15 |     FLATTEN((
16 |       SELECT
17 |         reference_name,
18 |         start,
19 |         reference_bases,
20 |         alternate_bases,
21 |         AF,
22 |         AFR_AF,
23 |         AMR_AF,
24 |         ASN_AF,
25 |         EUR_AF
26 |       FROM
27 |         [genomics-public-data:1000_genomes.variants]),
28 |       alternate_bases)) AS kg  # flatten: one row per alternate allele
29 | JOIN
30 |   EACH (
31 |   SELECT
32 |     chromosome,
33 |     REGEXP_EXTRACT(chromosome,
34 |       r'chr(\d+)') AS contig,  # digits only: non-numeric chromosome names yield NULL
35 |     reference,
36 |     locusBegin + 1 AS position,  # NOTE(review): +1 shift before joining to kg.start; confirm coordinate bases
37 |     locusBegin,
38 |     locusEnd,
39 |     allele,
40 |     freq AS pgp_freq
41 |   FROM
42 |     [google.com:biggene:pgp_analysis_results.cgi_variants_allelic_frequency]
43 |     ) AS pgp
44 | ON
45 |   pgp.contig = kg.reference_name
46 |   AND pgp.position = kg.start
47 |   AND pgp.reference = kg.reference_bases
48 |   AND pgp.allele = kg.alternate_bases
49 | WHERE
50 |   kg.reference_name = '17'
51 |   AND kg.start BETWEEN 41196312  # NOTE(review): other BRCA1 queries on this table use 41196311..41277499; confirm
52 |   AND 41277500
53 | 


--------------------------------------------------------------------------------
/pgp/sql/comparing-pgp-to-1000genomes/variant-counts-by-type-and-chromosome.sql:
--------------------------------------------------------------------------------
 1 | # Count the number of variants by variant type and chromosome.
 2 | SELECT
 3 |   reference_name,
 4 |   vt,
 5 |   cnt,
 6 |   dataset
 7 | FROM (
 8 |   SELECT
 9 |     # Normalize the reference_name to match that found in 1,000 Genomes.
10 |     IF(reference_name = 'chrM', 'MT', SUBSTR(reference_name, 4)) AS reference_name,
11 |     IF(ref_len = 1 AND alt_len = 1, "SNP", "INDEL") AS vt,
12 |     COUNT(reference_name) AS cnt,
13 |     'PGP' AS dataset
14 |   FROM (
15 |     SELECT
16 |       reference_name,
17 |       # SNP only when the reference and the longest alternate are one base each.
18 |       LENGTH(reference_bases) AS ref_len,
19 |       MAX(LENGTH(alternate_bases)) WITHIN RECORD AS alt_len,
20 |     FROM
21 |       [google.com:biggene:pgp_20150205.genome_calls]
22 |     # The source data was Complete Genomics which includes non-variant segments.
23 |     OMIT RECORD IF EVERY(alternate_bases IS NULL)
24 |       )
25 |   GROUP BY
26 |     reference_name,
27 |     vt
28 |     ),
29 |   (
30 |   SELECT
31 |     reference_name,
32 |     IF(vt IS NULL, "not specified", vt) AS vt,
33 |     COUNT(reference_name) AS cnt,
34 |     '1000Genomes' AS dataset
35 |   FROM
36 |     [genomics-public-data:1000_genomes.variants]
37 |   GROUP BY
38 |     reference_name,
39 |     vt
40 |     )
41 | ORDER BY
42 |   reference_name,
43 |   dataset,
44 |   vt
45 | 


--------------------------------------------------------------------------------
/platinumGenomes/sql/cohort-rare-pathenogenic-snps.sql:
--------------------------------------------------------------------------------
 1 | #standardSQL
 2 |   --
 3 |   -- Return all SNPs from the Platinum Genomes cohort that are:
 4 |   --   annotated as 'pathogenic' in ClinVar
 5 |   --   with observed population frequency of at most 1%
 6 |   --
 7 | WITH
 8 |   cohort_variants AS (
 9 |   SELECT
10 |     REGEXP_EXTRACT(reference_name, r'chr(.+)') AS chr,
11 |     start AS start,  -- NOTE(review): joined to annots.Start unchanged; confirm both share a coordinate base
12 |     reference_bases,
13 |     alt
14 |   FROM
15 |     `genomics-public-data.platinum_genomes.variants` v,
16 |     v.alternate_bases alt WITH OFFSET alt_offset
17 |   WHERE
18 |     -- Require that at least one sample in the cohort has this variant.
19 |     EXISTS(SELECT gt FROM UNNEST(v.call) call, UNNEST(call.genotype) gt WHERE gt = alt_offset+1)
20 |     )
21 |   -- Join the cohort's alternates to the annotation table and keep the rare,
22 |   -- pathogenic ones.
23 | SELECT
24 |   annots.Chr,
25 |   annots.Start,
26 |   Ref,
27 |   annots.Alt,
28 |   Func,
29 |   Gene,
30 |   PopFreqMax,
31 |   ExonicFunc,
32 |   ClinVar_SIG,
33 |   ClinVar_DIS
34 | FROM
35 |   `silver-wall-555.TuteTable.hg19` AS annots
36 | JOIN
37 |   cohort_variants AS vars
38 | ON
39 |   vars.chr = annots.Chr
40 |   AND vars.start = annots.Start
41 |   AND vars.reference_bases = annots.Ref
42 |   AND vars.alt = annots.Alt
43 | WHERE
44 |   PopFreqMax <= 0.01
45 |   AND ClinVar_SIG LIKE '%pathogenic%'
46 |   AND NOT ClinVar_SIG LIKE '%non-pathogenic%'
47 | ORDER BY
48 |   Chr,
49 |   Start,
50 |   Ref,
51 |   Alt
52 | 


--------------------------------------------------------------------------------
/1000genomes/sql/understanding-alternate-alleles/sample-chrom-pos-ref-dups.sql:
--------------------------------------------------------------------------------
 1 | # Get sample alleles (as bases) for two samples at one chromosome 17 start.
 2 | # TODO(deflaux): update this to a user-defined function to generalize
 3 | # across more than two alternates.  For more info, see
 4 | # https://www.youtube.com/watch?v=GrD7ymUPt3M#t=1377
 5 | SELECT
 6 |   reference_name,
 7 |   start,
 8 |   alt,
 9 |   reference_bases,
10 |   sample_id,
11 |   CASE  # map allele value 0/1/2 to bases; any other value yields NULL
12 |   WHEN 0 = first_allele THEN reference_bases
13 |   WHEN 1 = first_allele THEN alt1
14 |   WHEN 2 = first_allele THEN alt2 END AS first_allele,
15 |   CASE
16 |   WHEN 0 = second_allele THEN reference_bases
17 |   WHEN 1 = second_allele THEN alt1
18 |   WHEN 2 = second_allele THEN alt2 END AS second_allele,
19 | FROM(
20 |   SELECT
21 |     reference_name,
22 |     start,
23 |     GROUP_CONCAT(alternate_bases) WITHIN RECORD AS alt,
24 |     reference_bases,
25 |     call.call_set_name AS sample_id,
26 |     NTH(1,
27 |       alternate_bases) WITHIN RECORD AS alt1,  # first alternate of the record
28 |     NTH(2,
29 |       alternate_bases) WITHIN RECORD AS alt2,  # second alternate of the record, if any
30 |     NTH(1, call.genotype) WITHIN call AS first_allele,
31 |     NTH(2, call.genotype) WITHIN call AS second_allele,
32 |   FROM
33 |     [genomics-public-data:1000_genomes.variants]
34 |   WHERE
35 |     reference_name = '17'
36 |     AND start = 48515942
37 |   HAVING
38 |     sample_id = 'HG00100' OR sample_id = 'HG00101')
39 | ORDER BY
40 |   alt,
41 |   sample_id
42 | 


--------------------------------------------------------------------------------
/platinumGenomes/sql/sample-rare-pathenogenic-snps.sql:
--------------------------------------------------------------------------------
 1 | #standardSQL
 2 |   --
 3 |   -- Return SNPs for sample NA12878 that are:
 4 |   --   annotated as 'pathogenic' in ClinVar
 5 |   --   with observed population frequency of at most 1%
 6 |   --
 7 | WITH
 8 |   sample_variants AS (
 9 |   SELECT
10 |     REGEXP_EXTRACT(reference_name, r'chr(.+)') AS chr,
11 |     start AS start,  -- NOTE(review): joined to annots.Start unchanged; confirm both share a coordinate base
12 |     reference_bases,
13 |     alt,
14 |     call.call_set_name
15 |   FROM
16 |     `genomics-public-data.platinum_genomes.variants` v,
17 |     v.call call,
18 |     v.alternate_bases alt WITH OFFSET alt_offset
19 |   WHERE
20 |     call_set_name = 'NA12878'
21 |     -- Require that at least one genotype matches this alternate.
22 |     AND EXISTS (SELECT gt FROM UNNEST(call.genotype) gt WHERE gt = alt_offset+1) )
23 |   -- Join the sample's alternates to the annotation table and keep the rare,
24 |   -- pathogenic ones.
25 | SELECT
26 |   call_set_name,
27 |   annots.Chr,
28 |   annots.Start,
29 |   Ref,
30 |   annots.Alt,
31 |   Func,
32 |   Gene,
33 |   PopFreqMax,
34 |   ExonicFunc,
35 |   ClinVar_SIG,
36 |   ClinVar_DIS
37 | FROM
38 |   `silver-wall-555.TuteTable.hg19` AS annots
39 | JOIN
40 |   sample_variants AS vars
41 | ON
42 |   vars.chr = annots.Chr
43 |   AND vars.start = annots.Start
44 |   AND vars.reference_bases = annots.Ref
45 |   AND vars.alt = annots.Alt
46 | WHERE
47 |   PopFreqMax <= 0.01
48 |   AND ClinVar_SIG LIKE '%pathogenic%'
49 |   AND NOT ClinVar_SIG LIKE '%non-pathogenic%'
50 | ORDER BY
51 |   Chr,
52 |   Start,
53 |   Ref,
54 |   Alt,
55 |   call_set_name
56 | 


--------------------------------------------------------------------------------
/pgp/sql/schema-comparisons/call-counts.sql:
--------------------------------------------------------------------------------
 1 | # Call counts for the PGP data encoded four different ways.
 2 | SELECT
 3 |   chromosome,
 4 |   num_records,
 5 |   num_variants,
 6 |   dataset
 7 | FROM
 8 |   (
 9 |   SELECT
10 |     SUBSTR(chromosome, 4) AS chromosome,
11 |     COUNT(1) AS num_records,
12 |     # NULL reference also counts as a variant, as in cgi_variants/allele-count.sql.
13 |     SUM(reference != '=' OR reference IS NULL) AS num_variants,
14 |     'cgi_variants' AS dataset
15 |   FROM
16 |     [google.com:biggene:pgp.cgi_variants]
17 |   # Skip the genomes we were unable to convert to VCF/gVCF
18 |   OMIT RECORD IF
19 |     sample_id = 'huEDF7DA' OR sample_id = 'hu34D5B9'
20 |   GROUP BY
21 |     chromosome),
22 |   (
23 |   SELECT
24 |     contig_name AS chromosome,
25 |     COUNT(1) AS num_records,
26 |     SUM(reference_bases != 'N') AS num_variants,
27 |     'variants' AS dataset
28 |   FROM
29 |     [google.com:biggene:pgp.variants]
30 |   GROUP BY
31 |     chromosome),
32 |   (
33 |   SELECT
34 |     contig_name AS chromosome,
35 |     COUNT(1) AS num_records,
36 |     SUM(reference_bases != 'N') AS num_variants,
37 |     'gvcf_variants' AS dataset
38 |   FROM
39 |     [google.com:biggene:pgp.gvcf_variants]
40 |   GROUP BY
41 |     chromosome),
42 |   (
43 |   SELECT
44 |     contig_name AS chromosome,
45 |     COUNT(1) AS num_records,
46 |     SUM(reference_bases != 'N') AS num_variants,
47 |     'gvcf_variants_expanded' AS dataset
48 |   FROM
49 |     [google.com:biggene:test.pgp_gvcf_variants_expanded2]
50 |   GROUP BY
51 |     chromosome)
52 | ORDER BY
53 |   chromosome,
54 |   dataset
55 | 


--------------------------------------------------------------------------------
/pgp/sql/cgi_variants/allele-count.sql:
--------------------------------------------------------------------------------
 1 | # Count the occurrence of each variant allele across all participants in the
 2 | # dataset.  This returns a large result so be sure to materialize it into a
 3 | # table for subsequent use.
 4 | SELECT
 5 |   chromosome,
 6 |   reference,
 7 |   # This 'bin' can be used in subsequent interval JOINs
 8 |   INTEGER(FLOOR(locusBegin / 5000)) AS bin,
 9 |   locusBegin,
10 |   locusEnd,
11 |   allele,
12 |   SUM(cnt) AS alternate_allele_count,  # allele1Seq and allele2Seq counts combined
13 | FROM (
14 |   SELECT
15 |     chromosome,
16 |     reference,
17 |     locusBegin,
18 |     locusEnd,
19 |     allele1Seq AS allele,
20 |     COUNT(1) AS cnt
21 |   FROM
22 |     [google.com:biggene:pgp.cgi_variants]
23 |   WHERE
24 |     (reference != '=' OR reference IS NULL)  # keep NULL references; drop only '='
25 |     AND allele1Seq != '?'
26 |     AND (reference != allele1Seq OR reference IS NULL)  # allele must differ from the reference
27 |   GROUP EACH BY
28 |     chromosome,
29 |     reference,
30 |     locusBegin,
31 |     locusEnd,
32 |     allele),
33 |   (
34 |   SELECT
35 |     chromosome,
36 |     reference,
37 |     locusBegin,
38 |     locusEnd,
39 |     allele2Seq AS allele,
40 |     COUNT(1) AS cnt
41 |   FROM
42 |     [google.com:biggene:pgp.cgi_variants]
43 |   WHERE
44 |     (reference != '=' OR reference IS NULL)
45 |     AND allele2Seq != '?'
46 |     AND (reference != allele2Seq OR reference IS NULL)
47 |   GROUP EACH BY
48 |     chromosome,
49 |     reference,
50 |     locusBegin,
51 |     locusEnd,
52 |     allele)
53 | GROUP EACH BY
54 |   chromosome,
55 |   reference,
56 |   bin,
57 |   locusBegin,
58 |   locusEnd,
59 |   allele
60 | 


--------------------------------------------------------------------------------
/1000genomes/sql/allelic-frequency.sql:
--------------------------------------------------------------------------------
 1 | # The following query computes the allelic frequency for BRCA1 variants in the
 2 | # 1,000 Genomes dataset.
 3 | SELECT
 4 |   reference_name,
 5 |   start,
 6 |   reference_bases,
 7 |   alternate_bases,
 8 |   alt,
 9 |   SUM(ref_count)+SUM(alt_count) AS num_sample_alleles,
10 |   SUM(ref_count) AS ref_cnt,
11 |   SUM(alt_count) AS alt_cnt,
12 |   SUM(ref_count)/(SUM(ref_count)+SUM(alt_count)) AS ref_freq,
13 |   SUM(alt_count)/(SUM(ref_count)+SUM(alt_count)) AS alt_freq,
14 | FROM (
15 |   SELECT
16 |     reference_name,
17 |     start,
18 |     reference_bases,
19 |     alternate_bases,
20 |     alt,
21 |     SUM(INTEGER(0 = call.genotype)) WITHIN RECORD AS ref_count,  # genotype values equal to 0
22 |     SUM(INTEGER(alt = call.genotype)) WITHIN RECORD AS alt_count  # genotype values equal to this alternate's index
23 |   FROM
24 |     FLATTEN(
25 |       FLATTEN((
26 |         SELECT
27 |           reference_name,
28 |           start,
29 |           reference_bases,
30 |           alternate_bases,
31 |           POSITION(alternate_bases) AS alt,  # index of the alternate within the record
32 |           call.call_set_name,
33 |           call.genotype,
34 |         FROM
35 |           [genomics-public-data:1000_genomes.variants]
36 |         WHERE
37 |           reference_name = '17'
38 |           AND start BETWEEN 41196311
39 |           AND 41277499
40 |           AND vt='SNP'
41 |           ),
42 |         call),
43 |       alt))
44 | GROUP BY
45 |   reference_name,
46 |   start,
47 |   reference_bases,
48 |   alternate_bases,
49 |   alt
50 | ORDER BY
51 |   reference_name,
52 |   start,
53 |   reference_bases,
54 |   alt,
55 |   alternate_bases
56 | 


--------------------------------------------------------------------------------
/pgp/sql/cgi_variants/ti-tv-ratio.sql:
--------------------------------------------------------------------------------
 1 | # Compute the Ti/Tv (transition/transversion) ratio for each participant in the PGP dataset.
 2 | SELECT
 3 |   sample_id,
 4 |   transitions,
 5 |   transversions,
 6 |   transitions/transversions AS titv
 7 | FROM (
 8 |   SELECT
 9 |     sample_id,
10 |     SUM(IF(mutation1 IN ('A->G',  # each of the two alleles contributes 0 or 1 to the tally
11 |           'G->A',
12 |           'C->T',
13 |           'T->C'),
14 |         1,
15 |         0) + IF(mutation2 IN ('A->G',
16 |           'G->A',
17 |           'C->T',
18 |           'T->C'),
19 |         1,
20 |         0)) AS transitions,
21 |     SUM(IF(mutation1 IN ('A->C',  # the remaining eight single-base substitutions
22 |           'C->A',
23 |           'G->T',
24 |           'T->G',
25 |           'A->T',
26 |           'T->A',
27 |           'C->G',
28 |           'G->C'),
29 |         1,
30 |         0) + IF(mutation2 IN ('A->C',
31 |           'C->A',
32 |           'G->T',
33 |           'T->G',
34 |           'A->T',
35 |           'T->A',
36 |           'C->G',
37 |           'G->C'),
38 |         1,
39 |         0)) AS transversions,
40 |   FROM (
41 |     SELECT
42 |       sample_id,
43 |       CONCAT(reference,  # builds a 'ref->allele' label, e.g. 'A->G', for allele 1
44 |         CONCAT(STRING('->'),
45 |           allele1Seq)) AS mutation1,
46 |       CONCAT(reference,  # same label for allele 2
47 |         CONCAT(STRING('->'),
48 |           allele2Seq)) AS mutation2,
49 |     FROM
50 |       [google.com:biggene:pgp.cgi_variants]
51 |     WHERE
52 |       # WHERE varType = 'snp' is not correct since a row with both an indel
53 |       # and a snp will be varType 'complex'
54 |       reference != '='
55 |       AND LENGTH(reference) = 1  # keep rows whose reference is a single base
56 |       )
57 |   GROUP BY
58 |     sample_id)
59 | ORDER BY
60 |   titv DESC
61 | 


--------------------------------------------------------------------------------
/pgp/sql/schema-comparisons/klotho-gvcf-expanded.sql:
--------------------------------------------------------------------------------
 1 | # Sample level data for Klotho variant rs9536314 for use in the "amazing
 2 | # intelligence of PGP participants" data story, specifically joining the CGI
 3 | # table to the expanded gVCF table to compare the different encodings.
 4 | SELECT
 5 |   cgi.sample_id,
 6 |   chromosome,
 7 |   locusBegin,
 8 |   locusEnd,
 9 |   reference,
10 |   allele1Seq,
11 |   allele2Seq,
12 |   contig_name,
13 |   start_pos,
14 |   end_pos,
15 |   END,
16 |   ref,
17 |   alt,
18 |   gvcf.sample_id,
19 |   genotype
20 | FROM
21 |   [google.com:biggene:pgp.cgi_variants] AS cgi
22 |   left OUTER JOIN  # keeps CGI rows even when no gVCF row exists for the sample
23 |   (
24 |   SELECT
25 |     contig_name,
26 |     start_pos,
27 |     end_pos,
28 |     END,
29 |     ref,
30 |     alt,
31 |     sample_id,
32 |     genotype
33 |   FROM
34 |     FLATTEN(
35 |     SELECT
36 |       contig_name,
37 |       start_pos,
38 |       end_pos,
39 |       END,
40 |       reference_bases AS ref,
41 |       GROUP_CONCAT(alternate_bases) WITHIN RECORD AS alt,  # all alternates of the record as one string
42 |       call.callset_name AS sample_id,
43 |       GROUP_CONCAT(STRING(call.genotype),  # per-call genotype rendered like '0/1'
44 |         '/') WITHIN call AS genotype,
45 |     FROM
46 |       [google.com:biggene:test.pgp_gvcf_variants_expanded2]
47 |     WHERE
48 |       contig_name = '13'
49 |       AND start_pos == 33628138  # exact-position match on the expanded table
50 |       ,
51 |       call)) AS gvcf
52 | ON
53 |   cgi.sample_id = gvcf.sample_id  # joined on sample only; the locus is restricted separately on each side
54 | WHERE
55 |   chromosome = "chr13"
56 |   AND locusBegin <= 33628137  # CGI rows whose locus range covers the position
57 |   AND locusEnd >= 33628138
58 | # Skip the genomes we were unable to convert to VCF/gVCF
59 | OMIT RECORD IF 
60 |   cgi.sample_id = 'huEDF7DA' OR cgi.sample_id = 'hu34D5B9'
61 | ORDER BY
62 |   cgi.sample_id
63 | 


--------------------------------------------------------------------------------
/pgp/sql/schema-comparisons/klotho-gvcf.sql:
--------------------------------------------------------------------------------
 1 | # Sample level data for Klotho variant rs9536314 for use in the "amazing
 2 | # intelligence of PGP participants" data story, specifically joining the CGI
 3 | # table to the (unexpanded) gVCF table to compare the different encodings.
 4 | SELECT
 5 |   cgi.sample_id,
 6 |   chromosome,
 7 |   locusBegin,
 8 |   locusEnd,
 9 |   reference,
10 |   allele1Seq,
11 |   allele2Seq,
12 |   contig_name,
13 |   start_pos,
14 |   end_pos,
15 |   END,
16 |   ref,
17 |   alt,
18 |   gvcf.sample_id,
19 |   genotype
20 | FROM
21 |   [google.com:biggene:pgp.cgi_variants] AS cgi
22 |   left OUTER JOIN  # keeps CGI rows even when no gVCF row exists for the sample
23 |   (
24 |   SELECT
25 |     contig_name,
26 |     start_pos,
27 |     end_pos,
28 |     END,
29 |     ref,
30 |     alt,
31 |     sample_id,
32 |     genotype
33 |   FROM
34 |     FLATTEN(
35 |     SELECT
36 |       contig_name,
37 |       start_pos,
38 |       end_pos,
39 |       END,
40 |       reference_bases AS ref,
41 |       GROUP_CONCAT(alternate_bases) WITHIN RECORD AS alt,  # all alternates of the record as one string
42 |       call.callset_name AS sample_id,
43 |       GROUP_CONCAT(STRING(call.genotype),  # per-call genotype rendered like '0/1'
44 |         '/') WITHIN call AS genotype,
45 |     FROM
46 |       [google.com:biggene:pgp.gvcf_variants]
47 |     WHERE
48 |       contig_name = '13'
49 |       AND start_pos <= 33628138  # interval match: record starts at or before the position ...
50 |       AND (end_pos >= 33628139  # ... and either end_pos or END extends past it
51 |         OR END >= 33628139)
52 |       ,
53 |       call)) AS gvcf
54 | ON
55 |   cgi.sample_id = gvcf.sample_id  # joined on sample only; the locus is restricted separately on each side
56 | WHERE
57 |   chromosome = "chr13"
58 |   AND locusBegin <= 33628137  # CGI rows whose locus range covers the position
59 |   AND locusEnd >= 33628138
60 |   # Skip the genomes we were unable to convert to VCF/gVCF
61 | OMIT RECORD IF 
62 |   cgi.sample_id = 'huEDF7DA' OR cgi.sample_id = 'hu34D5B9'
63 | ORDER BY
64 |   cgi.sample_id
65 | 


--------------------------------------------------------------------------------
/pgp/sql/gvcf_variants/allelic-frequency.py:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/env python
 2 | 
 3 | # Copyright 2014 Google Inc. All Rights Reserved.
 4 | #
 5 | # Licensed under the Apache License, Version 2.0 (the "License");
 6 | # you may not use this file except in compliance with the License.
 7 | # You may obtain a copy of the License at
 8 | #
 9 | #     http://www.apache.org/licenses/LICENSE-2.0
10 | #
11 | # Unless required by applicable law or agreed to in writing, software
12 | # distributed under the License is distributed on an "AS IS" BASIS,
13 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14 | # See the License for the specific language governing permissions and
15 | # limitations under the License.
16 | 
17 | """Run allelic frequency upon one chromosome at a time, appending to the result table."""
18 | 
19 | import string, subprocess
20 | 
21 | chromosomes = range(1,23)  # 1..22; must be a list for .extend() below (Python 2 range)
22 | chromosomes.extend(['X', 'Y', 'M'])
23 | 
24 | with open ("./allelic-frequency-chr1.sql", "r") as myfile:
25 |   query=myfile.read().replace('"', '\\"')  # escape double quotes: the query is wrapped in "..." on the command line
26 | 
27 | for chrom in chromosomes:
28 |   q = string.replace(query, "WHERE contig_name = '1'", "WHERE contig_name = '%s'" % chrom)  # plain text substitution; a no-op if the SQL file lacks this exact text
29 |   cmd = [
30 |         'bq', 
31 |         '--project_id', 'google.com:biggene',
32 |         '--nosync',
33 |         'query',
34 |         '--allow_large_results',
35 |         '--append_table',  # every chromosome's result goes into the same destination table
36 |         '--destination_table', 'pgp_analysis_results.gvcf_variants_allelic_frequency',
37 |         '--batch', '"' + q + '"']
38 |   print " ".join(cmd)
39 |   print subprocess.check_output(" ".join(cmd),  # run through the shell as one string; raises CalledProcessError on non-zero exit
40 |                                 stderr=subprocess.STDOUT,
41 |                                 shell=True)
42 | 
43 | 


--------------------------------------------------------------------------------
/pgp/sql/cgi_variants/allelic-frequency.py:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/env python
 2 | 
 3 | # Copyright 2014 Google Inc. All Rights Reserved.
 4 | #
 5 | # Licensed under the Apache License, Version 2.0 (the "License");
 6 | # you may not use this file except in compliance with the License.
 7 | # You may obtain a copy of the License at
 8 | #
 9 | #     http://www.apache.org/licenses/LICENSE-2.0
10 | #
11 | # Unless required by applicable law or agreed to in writing, software
12 | # distributed under the License is distributed on an "AS IS" BASIS,
13 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14 | # See the License for the specific language governing permissions and
15 | # limitations under the License.
16 | 
17 | """Run allelic frequency upon one chromosome at a time, appending to the result table."""
18 | 
19 | import string, subprocess
20 | 
21 | chromosomes = range(1,23)  # 1..22; must be a list for .extend() below (Python 2 range)
22 | chromosomes.extend(['X', 'Y', 'M'])
23 | 
24 | with open ("./allelic-frequency-chr1.sql", "r") as myfile:
25 |   query=myfile.read().replace('"', '\\"')  # escape double quotes: the query is wrapped in "..." on the command line
26 | 
27 | for chrom in chromosomes:
28 |   q = string.replace(query, "WHERE chromosome = 'chr1'", "WHERE chromosome = 'chr%s'" % chrom)  # plain text substitution; a no-op if the SQL file lacks this exact text
29 |   cmd = [
30 |         'bq', 
31 |         '--project_id', 'google.com:biggene',
32 |         '--nosync',
33 |         'query',
34 |         '--allow_large_results',
35 |         '--append_table',  # every chromosome's result goes into the same destination table
36 |         '--destination_table', 'pgp_analysis_results.cgi_variants_allelic_frequency',
37 |         '--batch', '"' + q + '"']
38 |   print " ".join(cmd)
39 |   print subprocess.check_output(" ".join(cmd),  # run through the shell as one string; raises CalledProcessError on non-zero exit
40 |                                 stderr=subprocess.STDOUT,
41 |                                 shell=True)
42 | 
43 | 


--------------------------------------------------------------------------------
/1000genomes/data-stories/reproducing-vcfstats/vcfstats-output/stats.shared:
--------------------------------------------------------------------------------
  1 | #Shared SNPs	Frequency
  2 | 0	10
  3 | 1	243
  4 | 2	103
  5 | 3	45
  6 | 4	40
  7 | 5	27
  8 | 6	21
  9 | 7	19
 10 | 8	24
 11 | 9	10
 12 | 10	14
 13 | 11	17
 14 | 12	4
 15 | 13	3
 16 | 14	8
 17 | 15	5
 18 | 16	2
 19 | 17	5
 20 | 18	4
 21 | 19	4
 22 | 20	3
 23 | 21	2
 24 | 22	4
 25 | 23	1
 26 | 24	8
 27 | 25	11
 28 | 26	5
 29 | 27	4
 30 | 28	3
 31 | 29	4
 32 | 30	2
 33 | 31	7
 34 | 32	7
 35 | 33	5
 36 | 34	3
 37 | 35	1
 38 | 39	1
 39 | 42	7
 40 | 43	2
 41 | 44	1
 42 | 45	1
 43 | 46	1
 44 | 47	1
 45 | 48	1
 46 | 49	1
 47 | 50	2
 48 | 51	1
 49 | 56	1
 50 | 57	2
 51 | 60	1
 52 | 61	2
 53 | 62	2
 54 | 73	4
 55 | 76	1
 56 | 80	3
 57 | 81	4
 58 | 82	1
 59 | 94	1
 60 | 95	1
 61 | 105	1
 62 | 107	1
 63 | 121	1
 64 | 124	1
 65 | 125	1
 66 | 141	1
 67 | 150	1
 68 | 166	1
 69 | 168	2
 70 | 169	1
 71 | 170	1
 72 | 174	1
 73 | 176	1
 74 | 195	1
 75 | 197	1
 76 | 198	2
 77 | 200	2
 78 | 207	1
 79 | 218	1
 80 | 222	1
 81 | 252	1
 82 | 259	1
 83 | 268	1
 84 | 269	1
 85 | 314	1
 86 | 409	1
 87 | 415	1
 88 | 427	1
 89 | 431	1
 90 | 464	1
 91 | 481	1
 92 | 492	1
 93 | 498	1
 94 | 522	6
 95 | 523	1
 96 | 527	1
 97 | 528	1
 98 | 532	2
 99 | 534	1
100 | 535	2
101 | 536	6
102 | 537	5
103 | 538	7
104 | 539	3
105 | 540	2
106 | 541	1
107 | 549	1
108 | 555	1
109 | 557	1
110 | 561	1
111 | 562	7
112 | 563	5
113 | 566	2
114 | 569	1
115 | 570	2
116 | 573	1
117 | 574	1
118 | 576	1
119 | 577	2
120 | 578	13
121 | 579	8
122 | 580	4
123 | 581	2
124 | 582	6
125 | 584	1
126 | 611	1
127 | 613	1
128 | 685	1
129 | 698	1
130 | 703	1
131 | 704	1
132 | 718	1
133 | 720	4
134 | 721	1
135 | 728	2
136 | 738	2
137 | 740	1
138 | 741	1
139 | 742	1
140 | 743	2
141 | 745	1
142 | 783	1
143 | 1088	1
144 | 1091	1
145 | 


--------------------------------------------------------------------------------
/1000genomes/sql/reproducing-allelic-frequencies/reproducing-allelic-frequency.sql:
--------------------------------------------------------------------------------
 1 | # Computes the allelic frequency for BRCA1 SNPs in the 1,000 Genomes dataset and
 2 | # returns it next to the pre-computed value (field af) stored in the dataset.
 3 | SELECT
 4 |   reference_name,
 5 |   start,
 6 |   reference_bases,
 7 |   alternate_bases,
 8 |   SUM(ref_count)+SUM(alt_count) AS num_sample_alleles,  # ref alleles plus this alt only
 9 |   SUM(ref_count) AS ref_cnt,
10 |   SUM(alt_count) AS alt_cnt,
11 |   SUM(ref_count)/(SUM(ref_count)+SUM(alt_count)) AS ref_freq,
12 |   SUM(alt_count)/(SUM(ref_count)+SUM(alt_count)) AS alt_freq,
13 |   alt_freq_from_1KG
14 | FROM (
15 |   SELECT
16 |     reference_name,
17 |     start,
18 |     reference_bases,
19 |     alternate_bases,
20 |     alt,
21 |     SUM(INTEGER(0 = call.genotype)) WITHIN RECORD AS ref_count,  # genotype value 0 is tallied as reference
22 |     SUM(INTEGER(alt = call.genotype)) WITHIN RECORD AS alt_count,  # genotype equal to this alt's index
23 |     alt_freq_from_1KG
24 |   FROM
25 |     FLATTEN(
26 |       FLATTEN((
27 |         SELECT
28 |           reference_name,
29 |           start,
30 |           reference_bases,
31 |           alternate_bases,
32 |           POSITION(alternate_bases) AS alt,  # index of this alternate within the record's alternate_bases
33 |           af AS alt_freq_from_1KG,  # value carried through unchanged for comparison
34 |           call.call_set_name,
35 |           call.genotype,
36 |         FROM
37 |           [genomics-public-data:1000_genomes.variants]
38 |         WHERE
39 |           reference_name = '17'  # region filter used for BRCA1 in this example
40 |           AND start BETWEEN 41196311
41 |           AND 41277499
42 |           AND vt='SNP'  # SNPs only
43 |           ),
44 |         call),
45 |       alt))
46 | GROUP BY
47 |   reference_name,
48 |   start,
49 |   reference_bases,
50 |   alternate_bases,
51 |   alt,  # grouped on but not returned by the outer SELECT
52 |   alt_freq_from_1KG
53 | ORDER BY
54 |   reference_name,
55 |   start,
56 |   reference_bases,
57 |   alt,
58 |   alternate_bases
59 | 


--------------------------------------------------------------------------------
/1000genomes/sql/allelic-frequency-by-gender.sql:
--------------------------------------------------------------------------------
 1 | # Computes the allelic frequency for BRCA1 SNPs in the 1,000 Genomes dataset,
 2 | # further classified by gender from the phenotypic data (sample_info table).
 3 | SELECT
 4 |   reference_name,
 5 |   start,
 6 |   gender,
 7 |   reference_bases,
 8 |   alternate_bases,  # comma required: without it 'alt' is parsed as an alias of alternate_bases
 9 |   alt,
10 |   SUM(ref_count)+SUM(alt_count) AS num_sample_alleles,  # ref alleles plus this alt only
11 |   SUM(ref_count) AS ref_cnt,
12 |   SUM(alt_count) AS alt_cnt,
13 |   SUM(ref_count)/(SUM(ref_count)+SUM(alt_count)) AS ref_freq,
14 |   SUM(alt_count)/(SUM(ref_count)+SUM(alt_count)) AS alt_freq,
15 | FROM (
16 |   SELECT
17 |     reference_name,
18 |     start,
19 |     gender,
20 |     reference_bases,
21 |     alternate_bases,
22 |     alt,
23 |     SUM(INTEGER(0 = call.genotype)) WITHIN RECORD AS ref_count,  # genotype value 0 is tallied as reference
24 |     SUM(INTEGER(alt = call.genotype)) WITHIN RECORD AS alt_count  # genotype equal to this alt's index
25 |   FROM
26 |     FLATTEN(FLATTEN((
27 |         SELECT
28 |           reference_name,
29 |           start,
30 |           reference_bases,
31 |           alternate_bases,
32 |           POSITION(alternate_bases) AS alt,  # index of this alternate within the record's alternate_bases
33 |           call.call_set_name,
34 |           call.genotype,
35 |         FROM
36 |           [genomics-public-data:1000_genomes.variants]
37 |         WHERE
38 |           reference_name = '17'  # region filter used for BRCA1 in this example
39 |           AND start BETWEEN 41196311
40 |           AND 41277499
41 |           AND vt='SNP'  # SNPs only
42 |           ),
43 |         call),
44 |       alt) AS g
45 |   JOIN
46 |     [genomics-public-data:1000_genomes.sample_info] p
47 |   ON
48 |     g.call.call_set_name = p.sample)  # attach each call's phenotype row by sample name
49 | GROUP BY
50 |   reference_name,
51 |   start,
52 |   gender,
53 |   reference_bases,
54 |   alternate_bases,
55 |   alt
56 | ORDER BY
57 |   reference_name,
58 |   start,
59 |   gender,
60 |   reference_bases,
61 |   alt,
62 |   alternate_bases
63 | 


--------------------------------------------------------------------------------
/1000genomes/sql/allelic-frequency-by-ethnicity.sql:
--------------------------------------------------------------------------------
 1 | # Computes the allelic frequency for BRCA1 SNPs in the 1,000 Genomes dataset,
 2 | # further classified by population from the phenotypic data (sample_info table).
 3 | SELECT
 4 |   reference_name,
 5 |   start,
 6 |   population,
 7 |   reference_bases,
 8 |   alternate_bases,  # comma required: without it 'alt' is parsed as an alias of alternate_bases
 9 |   alt,
10 |   SUM(ref_count)+SUM(alt_count) AS num_sample_alleles,  # ref alleles plus this alt only
11 |   SUM(ref_count) AS ref_cnt,
12 |   SUM(alt_count) AS alt_cnt,
13 |   SUM(ref_count)/(SUM(ref_count)+SUM(alt_count)) AS ref_freq,
14 |   SUM(alt_count)/(SUM(ref_count)+SUM(alt_count)) AS alt_freq,
15 | FROM (
16 |   SELECT
17 |     reference_name,
18 |     start,
19 |     population,
20 |     reference_bases,
21 |     alternate_bases,
22 |     alt,
23 |     SUM(INTEGER(0 = call.genotype)) WITHIN RECORD AS ref_count,  # genotype value 0 is tallied as reference
24 |     SUM(INTEGER(alt = call.genotype)) WITHIN RECORD AS alt_count  # genotype equal to this alt's index
25 |   FROM
26 |     FLATTEN(FLATTEN((
27 |         SELECT
28 |           reference_name,
29 |           start,
30 |           reference_bases,
31 |           alternate_bases,
32 |           POSITION(alternate_bases) AS alt,  # index of this alternate within the record's alternate_bases
33 |           call.call_set_name,
34 |           call.genotype,
35 |         FROM
36 |           [genomics-public-data:1000_genomes.variants]
37 |         WHERE
38 |           reference_name = '17'  # region filter used for BRCA1 in this example
39 |           AND start BETWEEN 41196311
40 |           AND 41277499
41 |           AND vt='SNP'  # SNPs only
42 |           ),
43 |         call),
44 |       alt) AS g
45 |   JOIN
46 |     [genomics-public-data:1000_genomes.sample_info] p
47 |   ON
48 |     g.call.call_set_name = p.sample)  # attach each call's phenotype row by sample name
49 | GROUP BY
50 |   reference_name,
51 |   start,
52 |   population,
53 |   reference_bases,
54 |   alternate_bases,
55 |   alt
56 | ORDER BY
57 |   reference_name,
58 |   start,
59 |   population,
60 |   reference_bases,
61 |   alt,
62 |   alternate_bases
63 | 


--------------------------------------------------------------------------------
/pgp/provenance/gvcf-expand-mapper.py:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/env python
 2 | 
 3 | # Copyright 2014 Google Inc. All Rights Reserved.
 4 | #
 5 | # Licensed under the Apache License, Version 2.0 (the "License");
 6 | # you may not use this file except in compliance with the License.
 7 | # You may obtain a copy of the License at
 8 | #
 9 | #     http://www.apache.org/licenses/LICENSE-2.0
10 | #
11 | # Unless required by applicable law or agreed to in writing, software
12 | # distributed under the License is distributed on an "AS IS" BASIS,
13 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14 | # See the License for the specific language governing permissions and
15 | # limitations under the License.
16 | 
17 | """A mapper for expansion of gVCF data.
18 | """
19 | 
20 | import json
21 | import sys
22 | 
23 | from gvcf_expander import GvcfExpander
24 | 
25 | 
26 | def main():
27 |   """Reads JSON records line by line, maps each with GvcfExpander, and emits the resulting pairs."""
28 | 
29 |   # Basic parsing of command line arguments to allow a filename
30 |   # to be passed when running this code in the debugger.
31 |   file_handle = sys.stdin  # default input; replaced below when a path is given as argv[1]
32 |   if 2 <= len(sys.argv):
33 |     file_handle = open(sys.argv[1], "r")  # NOTE(review): never explicitly closed
34 | 
35 |   expander = GvcfExpander()
36 | 
37 |   line = file_handle.readline()
38 |   while line:  # readline() returns '' at end of input, which ends the loop
39 |     line = line.strip()
40 |     if not line:  # skip blank lines
41 |       line = file_handle.readline()
42 |       continue
43 | 
44 |     fields = json.loads(line)  # each non-blank input line is parsed as one JSON document
45 | 
46 |     pairs = expander.map(fields=fields)
47 |     for pair in pairs:
48 |       emit(pair.k, pair.v)  # one output line per pair returned by the expander
49 | 
50 |     line = file_handle.readline()
51 | 
52 | 
53 | def emit(key, fields):
54 |   """Emits a key/value pair to stdout.
55 | 
56 |   Args:
57 |     key: (string) written as-is before the tab separator
58 |     fields: (dictionary) serialized with json.dumps after the tab separator
59 | 
60 |   Returns: n/a
61 | 
62 |   Side Effects:
63 |     a line of the form "<key>TAB<JSON>" is written to stdout
64 |   """
65 |   print "%s\t%s" % (key, json.dumps(fields))
66 | 
67 | 
68 | if __name__ == "__main__":
69 |   main()
70 | 


--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
 1 | bigquery-examples
 2 | =================
 3 | 
 4 | The data stories and queries in this repository demonstrate working with genomic data via [Google BigQuery](https://cloud.google.com/bigquery/).  All examples are built upon public datasets.
 5 | 
 6 | Have other data stories you would like to see here?  Have any data stories you would like to *share*?  Have *corrections to the biology* covered in this material?  Have query *simplifications* or *speed improvements*?  Let us know by [filing an issue](https://github.com/googlegenomics/bigquery-examples/issues) or [contacting us directly](mailto:google-genomics-contact@googlegroups.com).
 7 | 
 8 | Getting Started
 9 | -----------------
10 | 
11 | If you are new to BigQuery, start here instead: [Analyze Variants Using BigQuery](https://cloud.google.com/genomics/v1/analyze-variants).
12 | 
13 | Otherwise, navigate through the tree of content in this repository.  You will find queries, RMarkdown, rendered analyses, and provenance details.
14 | 
15 | Loading your own Variant Data into BigQuery
16 | -------------------------------------------
17 | 
18 | After trying these queries on public data, you can [load your own variant data into BigQuery](https://cloud.google.com/genomics/v1/load-variants).
19 | 
20 | For other types of data, such as variant annotations, see [Preparing Data for BigQuery](https://cloud.google.com/bigquery/preparing-data-for-bigquery) and also [BigQuery in Practice : Loading Data Sets That are Terabytes and Beyond](https://cloud.google.com/developers/articles/bigquery-in-practice) for more detail.
21 | 
22 | The mailing list
23 | ----------------
24 | 
25 | The [Google Genomics Discuss mailing list](https://groups.google.com/forum/#!forum/google-genomics-discuss) is a good
26 | way to sync up with other people who use googlegenomics including the core developers. You can subscribe
27 | by sending an email to ``google-genomics-discuss+subscribe@googlegroups.com`` or just post using
28 | the [web forum page](https://groups.google.com/forum/#!forum/google-genomics-discuss).
29 | 


--------------------------------------------------------------------------------
/pgp/sql/gvcf_variants/allele-count.sql:
--------------------------------------------------------------------------------
 1 | # Count the occurrence of each variant allele across all participants in the
 2 | # dataset.  This returns a large result so be sure to materialize it into a
 3 | # table for subsequent use.
 4 | #
 5 | # Note that the new BigQuery feature of user-defined javascript
 6 | # functions is in limited preview.  For more info, see
 7 | # https://www.youtube.com/watch?v=GrD7ymUPt3M#t=1377
 8 | SELECT
 9 |   contig_name,
10 |   start_pos,
11 |   # This 'bin' can be used in subsequent interval JOINs
12 |   INTEGER(FLOOR(start_pos / 5000)) AS bin,
13 |   reference_bases,
14 |   alternate_bases,
15 |   SUM(alternate_allele_count) AS alternate_allele_count,
16 | FROM (
17 |   SELECT contig_name, start_pos, reference_bases, alternate_bases, alternate_allele_count
18 |   FROM js(
19 |     [google.com:biggene:pgp.gvcf_variants],
20 |     contig_name, start_pos, reference_bases, alternate_bases, call.genotype,
21 |       "[{name: 'contig_name', type: 'string'},
22 |         {name: 'start_pos', type: 'integer'},
23 |         {name: 'reference_bases', type: 'string'},
24 |         {name: 'alternate_bases', type: 'string'},
25 |         {name: 'alternate_allele_count', type: 'integer'}]",
26 |       "function(r, emit) {
27 |          for(var a in r.alternate_bases) {
28 |            var alt_gt = parseInt(a, 10) + 1;  // for..in keys are strings; convert before adding
29 |            var alt_count = 0;
30 |            for(var c in r.call) {
31 |              for(var g in r.call[c].genotype) {
32 |                if(alt_gt == r.call[c].genotype[g]) {
33 |                  alt_count++;
34 |                }
35 |              }
36 |            }
37 |            // Emit one record per alt
38 |            emit({
39 |              contig_name: r.contig_name,
40 |              start_pos: r.start_pos,
41 |              reference_bases: r.reference_bases,
42 |              alternate_bases: r.alternate_bases[a],
43 |              alternate_allele_count: alt_count
44 |            });
45 |          }
46 |        }"))
47 | GROUP EACH BY
48 |   contig_name,
49 |   start_pos,
50 |   bin,
51 |   reference_bases,
52 |   alternate_bases
53 | 


--------------------------------------------------------------------------------
/pgp/sql/schema-comparisons/sample-call-counts.sql:
--------------------------------------------------------------------------------
 1 | # Per-sample record counts and variant-allele counts for the PGP data encoded
 2 | # several different ways.  NOTE: table pgp.variants was left out of this example
 3 | # since it's more trouble than it's worth to parse the GT field into its components.
 4 | SELECT
 5 |   sample_id,
 6 |   num_records,
 7 |   num_variant_alleles,
 8 |   dataset
 9 | FROM
10 |   (
11 |   SELECT
12 |     sample_id,
13 |     COUNT(sample_id) AS num_records,
14 |     INTEGER(SUM(allele1_is_variant + allele2_is_variant)) AS num_variant_alleles,  # adds the two boolean flags per row
15 |     'cgi_variants' AS dataset
16 |   FROM (
17 |     SELECT
18 |       sample_id,
19 |       allele1Seq != reference  # flag: allele differs from reference and is neither '=' nor '?'
20 |       AND allele1Seq != '='
21 |       AND allele1Seq != '?' AS allele1_is_variant,
22 |       allele2Seq != reference  # same test for the second allele
23 |       AND allele2Seq != '='
24 |       AND allele2Seq != '?' AS allele2_is_variant,
25 |     FROM
26 |       [google.com:biggene:pgp.cgi_variants]
27 |       # Skip the genomes we were unable to convert to VCF/gVCF
28 |     OMIT
29 |       RECORD IF
30 |       sample_id = 'huEDF7DA'
31 |       OR sample_id = 'hu34D5B9')
32 |   GROUP BY
33 |     sample_id),
34 |   (
35 |   SELECT
36 |     sample_id,
37 |     COUNT(sample_id) AS num_records,
38 |     INTEGER(SUM(num_variant_alleles)) AS num_variant_alleles,
39 |     'gvcf_variants' AS dataset
40 |   FROM (
41 |     SELECT
42 |       call.callset_name AS sample_id,
43 |       SUM(call.genotype > 0) WITHIN call AS num_variant_alleles,  # genotype values above 0 per call
44 |     FROM
45 |       [google.com:biggene:pgp.gvcf_variants])
46 |   GROUP BY
47 |     sample_id),
48 |   (
49 |   SELECT
50 |     sample_id,
51 |     COUNT(sample_id) AS num_records,
52 |     INTEGER(SUM(num_variant_alleles)) AS num_variant_alleles,
53 |     'gvcf_variants_expanded' AS dataset
54 |   FROM
55 |     (
56 |     SELECT
57 |       call.callset_name AS sample_id,
58 |       SUM(call.genotype > 0) WITHIN call AS num_variant_alleles,  # genotype values above 0 per call
59 |     FROM
60 |       [google.com:biggene:test.pgp_gvcf_variants_expanded2])
61 |   GROUP BY
62 |     sample_id)
63 | ORDER BY
64 |   sample_id,
65 |   dataset
66 | 


--------------------------------------------------------------------------------
/1000genomes/sql/gender-het-hom-ratio.sql:
--------------------------------------------------------------------------------
 1 | # The following query uses the homozygous and heterozygous variant counts within
 2 | # chromosome X to help determine whether the gender phenotype values are correct
 3 | # for the samples.
 4 | SELECT
 5 |   sample_id,
 6 |   gender,
 7 |   reference_name,
 8 |   (hom_AA_count + het_RA_count + hom_RR_count) AS all_callable_sites,
 9 |   hom_AA_count,
10 |   het_RA_count,
11 |   hom_RR_count,
12 |   (hom_AA_count + het_RA_count) AS all_snvs,
13 |   ROUND((het_RA_count/(hom_AA_count + het_RA_count))*1000)/1000 AS perct_het_alt_in_snvs,  # fraction (not x100), rounded to 3 decimals
14 |   ROUND((hom_AA_count/(hom_AA_count + het_RA_count))*1000)/1000 AS perct_hom_alt_in_snvs  # fraction (not x100), rounded to 3 decimals
15 | FROM
16 |   (
17 |   SELECT
18 |     reference_name,
19 |     sample_id,
20 |     SUM(IF(0 = first_allele  # both alleles equal 0
21 |         AND 0 = second_allele,
22 |         1,
23 |         0)) AS hom_RR_count,
24 |     SUM(IF(first_allele = second_allele  # both alleles equal and above 0
25 |         AND first_allele > 0,
26 |         1,
27 |         0)) AS hom_AA_count,
28 |     SUM(IF((first_allele != second_allele OR second_allele IS NULL)  # alleles differ, or only one allele is present
29 |         AND (first_allele > 0
30 |           OR second_allele > 0),
31 |         1,
32 |         0)) AS het_RA_count
33 |   FROM (
34 |     SELECT
35 |       reference_name,
36 |       call.call_set_name AS sample_id,
37 |       NTH(1,  # first genotype value of the call
38 |         call.genotype) WITHIN call AS first_allele,
39 |       NTH(2,  # second genotype value of the call, if any
40 |         call.genotype) WITHIN call AS second_allele,
41 |     FROM
42 |       [genomics-public-data:1000_genomes.variants]
43 |     WHERE
44 |       reference_name = 'X'
45 |       AND vt = 'SNP'
46 |       AND start NOT BETWEEN 59999  # two excluded ranges; presumably the pseudoautosomal regions -- TODO confirm
47 |       AND 2699519
48 |       AND start NOT BETWEEN 154931042
49 |       AND 155260559)
50 |   GROUP BY
51 |     sample_id,
52 |     reference_name
53 |     ) AS g
54 | JOIN
55 |   [genomics-public-data:1000_genomes.sample_info] p
56 | ON
57 |   g.sample_id = p.sample  # attach the phenotype row (gender) by sample name
58 | GROUP BY
59 |   sample_id,
60 |   gender,
61 |   reference_name,
62 |   all_callable_sites,
63 |   hom_AA_count,
64 |   het_RA_count,
65 |   hom_RR_count,
66 |   all_snvs,
67 |   perct_het_alt_in_snvs,
68 |   perct_hom_alt_in_snvs
69 | ORDER BY
70 |   perct_het_alt_in_snvs DESC,
71 |   sample_id
72 | 
73 | 


--------------------------------------------------------------------------------
/pgp/provenance/gvcf-expand-reducer.py:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/env python
 2 | 
 3 | # Copyright 2014 Google Inc. All Rights Reserved.
 4 | #
 5 | # Licensed under the Apache License, Version 2.0 (the "License");
 6 | # you may not use this file except in compliance with the License.
 7 | # You may obtain a copy of the License at
 8 | #
 9 | #     http://www.apache.org/licenses/LICENSE-2.0
10 | #
11 | # Unless required by applicable law or agreed to in writing, software
12 | # distributed under the License is distributed on an "AS IS" BASIS,
13 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14 | # See the License for the specific language governing permissions and
15 | # limitations under the License.
16 | 
17 | """A reducer for expansion of gVCF data.
18 | """
19 | 
20 | import json
21 | import os
22 | import sys
23 | 
24 | from gvcf_expander import GvcfExpander
25 | from gvcf_expander import Pair
26 | 
27 | FILTER_ENV_KEY = "FILTER_REF_MATCHES"
28 | 
29 | 
30 | def main():
31 |   """Reads key/value lines, reduces each with GvcfExpander, and emits the results plus any finalize() output."""
32 | 
33 |   # Basic parsing of command line arguments to allow a filename
34 |   # to be passed when running this code in the debugger.
35 |   file_handle = sys.stdin  # default input; replaced below when a path is given as argv[1]
36 |   if 2 <= len(sys.argv):
37 |     file_handle = open(sys.argv[1], "r")  # NOTE(review): never explicitly closed
38 | 
39 |   if FILTER_ENV_KEY in os.environ:  # only the variable's presence matters; its value is not read
40 |     expander = GvcfExpander(filter_ref_matches=True)
41 |   else:
42 |     expander = GvcfExpander()
43 | 
44 |   line = file_handle.readline()
45 |   while line:  # readline() returns '' at end of input, which ends the loop
46 |     line = line.strip()
47 |     if not line:  # skip blank lines
48 |       line = file_handle.readline()
49 |       continue
50 | 
51 |     (key, value) = line.split("\t")  # raises ValueError unless the line has exactly one tab
52 |     fields = json.loads(value)
53 |     results = expander.reduce(pair=Pair(key, fields))
54 | 
55 |     for result in results:
56 |       emit(result)
57 | 
58 |     line = file_handle.readline()
59 | 
60 |   results = expander.finalize()  # called once after all input has been consumed
61 | 
62 |   for result in results:
63 |     emit(result)
64 | 
65 | 
66 | def emit(fields):
67 |   """Emits a reduced value to stdout.
68 | 
69 |   Args:
70 |     fields: (dict) serialized with json.dumps
71 | 
72 |   Returns: n/a
73 | 
74 |   Side Effects:
75 |     one line containing the JSON-encoded value is written to stdout
76 |   """
77 |   print "%s" % (json.dumps(fields))
78 | 
79 | 
80 | if __name__ == "__main__":
81 |   main()
82 | 


--------------------------------------------------------------------------------
/1000genomes/sql/reproducing-allelic-frequencies/reproducing-allelic-frequency-by-ethnicity.sql:
--------------------------------------------------------------------------------
 1 | # The following query computes the allelic frequency for BRCA1 variants in the
 2 | # 1,000 Genomes dataset further classified by ethnicity from the phenotypic data
 3 | # and also includes the pre-computed value from the dataset.
 4 | SELECT
 5 |   reference_name,
 6 |   start,
 7 |   super_population,
 8 |   reference_bases,
 9 |   alternate_bases,
10 |   SUM(ref_count)+SUM(alt_count) AS num_sample_alleles,  # reference alleles plus alleles of this alternate only
11 |   SUM(ref_count) AS sample_allele_ref_cnt,
12 |   SUM(alt_count) AS sample_allele_alt_cnt,
13 |   SUM(ref_count)/(SUM(ref_count)+SUM(alt_count)) AS ref_freq,
14 |   SUM(alt_count)/(SUM(ref_count)+SUM(alt_count)) AS alt_freq,
15 |   alt_freq_from_1KG
16 | FROM (
17 |   SELECT
18 |     reference_name,
19 |     start,
20 |     super_population,
21 |     reference_bases,
22 |     alternate_bases,
23 |     alt,
24 |     SUM(INTEGER(0 = call.genotype)) WITHIN RECORD AS ref_count,  # genotype values equal to 0
25 |     SUM(INTEGER(alt = call.genotype)) WITHIN RECORD AS alt_count,  # genotype values equal to this alternate's position
26 |     CASE
27 |     WHEN super_population =  'EAS'
28 |     THEN  asn_af
29 |     WHEN super_population=  'EUR'
30 |     THEN eur_af
31 |     WHEN super_population = 'AFR'
32 |     THEN afr_af
33 |     WHEN super_population = 'AMR'
34 |     THEN amr_af
35 |     END AS alt_freq_from_1KG  # the dataset's own *_af column for the sample's super population
36 |   FROM
37 |     FLATTEN(FLATTEN((
38 |         SELECT
39 |           reference_name,
40 |           start,
41 |           reference_bases,
42 |           alternate_bases,
43 |           POSITION(alternate_bases) AS alt,  # position of this alternate; compared with call.genotype in the enclosing SELECT
44 |           call.call_set_name,
45 |           call.genotype,
46 |           afr_af,
47 |           amr_af,
48 |           asn_af,
49 |           eur_af,
50 |         FROM
51 |           [genomics-public-data:1000_genomes.variants]
52 |         WHERE
53 |           reference_name = '17'
54 |           AND start BETWEEN 41196311
55 |           AND 41277499
56 |           AND vt='SNP'
57 |           ),
58 |         call),
59 |       alt) AS g
60 |   JOIN
61 |     [genomics-public-data:1000_genomes.sample_info] p
62 |   ON
63 |     g.call.call_set_name = p.sample)  # attaches super_population to each call
64 | GROUP BY
65 |   reference_name,
66 |   start,
67 |   super_population,
68 |   reference_bases,
69 |   alternate_bases,
70 |   alt_freq_from_1KG
71 | ORDER BY
72 |   reference_name,
73 |   start,
74 |   super_population
75 | 


--------------------------------------------------------------------------------
/pgp/sql/gvcf_variants/ti-tv-ratio.sql:
--------------------------------------------------------------------------------
 1 | # Compute the Ti/Tv ratio for each participant in the PGP dataset.  A user-defined
 2 | # function is used here since it's difficult in SQL to join the genotype array in
 3 | # each call with alternate_bases at the variant level.
 4 | #
 5 | # Note that the new BigQuery feature of user-defined javascript
 6 | # functions is in limited preview.  For more info, see
 7 | # https://www.youtube.com/watch?v=GrD7ymUPt3M#t=1377
 8 | SELECT
 9 |   sample_id,
10 |   transitions,
11 |   transversions,
12 |   transitions/transversions AS titv  # one ratio per sample_id
13 | FROM (
14 |   SELECT
15 |     sample_id,
16 |     SUM(IF(mutation IN ('A->G',
17 |           'G->A',
18 |           'C->T',
19 |           'T->C'),
20 |         1,
21 |         0)) AS transitions,  # A<->G and C<->T substitutions
22 |     SUM(IF(mutation IN ('A->C',
23 |           'C->A',
24 |           'G->T',
25 |           'T->G',
26 |           'A->T',
27 |           'T->A',
28 |           'C->G',
29 |           'G->C'),
30 |         1,
31 |         0)) AS transversions,  # the remaining eight single-base substitutions
32 |   FROM (
33 |     SELECT sample_id, mutation
34 |       FROM js(
35 |       [google.com:biggene:pgp.gvcf_variants],
36 |       reference_bases, alternate_bases, call.callset_name, call.genotype,  # the fields the function reads from r
37 |         "[{name: 'sample_id', type: 'string'},
38 |           {name: 'mutation', type: 'string'}]",
39 |         "function(r, emit) {
40 |            var hasSNP = false;
41 |            var isSNP = [false];
42 |            for(var i in r.alternate_bases) {
43 |               if(1 == r.alternate_bases[i].length) {
44 |                 isSNP[isSNP.length] = true;
45 |                 hasSNP = true;
46 |               }
47 |               else {
48 |                 isSNP[isSNP.length] = false;
49 |               }
50 |            }
51 |            if (hasSNP && 1 == r.reference_bases.length) { 
52 |              for(var i in r.call) {
53 |                for(var j in r.call[i].genotype) {
54 |                  if(0 < r.call[i].genotype[j] && isSNP[r.call[i].genotype[j]]) {
55 |                    emit({
56 |                     sample_id: r.call[i].callset_name,
57 |                     mutation: r.reference_bases + '->' + r.alternate_bases[r.call[i].genotype[j] - 1] 
58 |                    });
59 |                  }
60 |                }
61 |              }
62 |            }
63 |          }"))
64 |   GROUP BY
65 |     sample_id)
66 | ORDER BY
67 |   titv DESC
68 | 


--------------------------------------------------------------------------------
/pgp/sql/gvcf_variants_expanded/ti-tv-ratio.sql:
--------------------------------------------------------------------------------
 1 | # Compute the Ti/Tv ratio for each participant in the PGP dataset.  A user-defined
 2 | # function is used here since it's difficult in SQL to join the genotype array in
 3 | # each call with alternate_bases at the variant level.
 4 | #
 5 | # Note that the new BigQuery feature of user-defined javascript
 6 | # functions is in limited preview.  For more info, see
 7 | # https://www.youtube.com/watch?v=GrD7ymUPt3M#t=1377
 8 | SELECT
 9 |   sample_id,
10 |   transitions,
11 |   transversions,
12 |   transitions/transversions AS titv  # one ratio per sample_id
13 | FROM (
14 |   SELECT
15 |     sample_id,
16 |     SUM(IF(mutation IN ('A->G',
17 |           'G->A',
18 |           'C->T',
19 |           'T->C'),
20 |         1,
21 |         0)) AS transitions,  # A<->G and C<->T substitutions
22 |     SUM(IF(mutation IN ('A->C',
23 |           'C->A',
24 |           'G->T',
25 |           'T->G',
26 |           'A->T',
27 |           'T->A',
28 |           'C->G',
29 |           'G->C'),
30 |         1,
31 |         0)) AS transversions,  # the remaining eight single-base substitutions
32 |   FROM (
33 |     SELECT sample_id, mutation
34 |       FROM js(
35 |       [google.com:biggene:test.pgp_gvcf_variants_expanded],
36 |       reference_bases, alternate_bases, call.callset_name, call.genotype,  # the fields the function reads from r
37 |         "[{name: 'sample_id', type: 'string'},
38 |           {name: 'mutation', type: 'string'}]",
39 |         "function(r, emit) {
40 |            var hasSNP = false;
41 |            var isSNP = [false];
42 |            for(var i in r.alternate_bases) {
43 |               if(1 == r.alternate_bases[i].length) {
44 |                 isSNP[isSNP.length] = true;
45 |                 hasSNP = true;
46 |               }
47 |               else {
48 |                 isSNP[isSNP.length] = false;
49 |               }
50 |            }
51 |            if (hasSNP && 1 == r.reference_bases.length) { 
52 |              for(var i in r.call) {
53 |                for(var j in r.call[i].genotype) {
54 |                  if(0 < r.call[i].genotype[j] && isSNP[r.call[i].genotype[j]]) {
55 |                    emit({
56 |                     sample_id: r.call[i].callset_name,
57 |                     mutation: r.reference_bases + '->' + r.alternate_bases[r.call[i].genotype[j] - 1] 
58 |                    });
59 |                  }
60 |                }
61 |              }
62 |            }
63 |          }")
64 |         )
65 |   GROUP BY
66 |     sample_id)
67 | ORDER BY
68 |   titv DESC
69 | 


--------------------------------------------------------------------------------
/pgp/sql/gvcf_variants/allelic-frequency-brca1-no-udf.sql:
--------------------------------------------------------------------------------
 1 | # The following query computes the allelic frequency for BRCA1 variants in the 
 2 | # PGP dataset _without_ using a user-defined function.
 3 | #
 4 | # Since without UDFs we cannot count the other reference calls, just assume
 5 | # the total number of alleles is the number of samples times 2 (thereby losing
 6 | # the distinction between reference calls and no-calls, unfortunately).
 7 | SELECT
 8 |   contig_name,
 9 |   start_pos,
10 |   reference_bases,
11 |   alt AS allele,
12 |   (174 * 2) AS num_alleles_called,  # hard-coded: 174 samples times 2 alleles each
13 |   ROUND(alt_allele_count / (174 * 2),
14 |     4) AS freq,
15 | FROM (
16 |   SELECT
17 |     contig_name,
18 |     start_pos,
19 |     reference_bases,
20 |     alt,
21 |     SUM(ref_allele_count) AS ref_allele_count,
22 |     SUM(alt_allele_count) AS alt_allele_count,
23 |     SUM(other_alt_allele_count) AS other_alt_allele_count,
24 |   FROM (
25 |     SELECT
26 |       contig_name,
27 |       start_pos,
28 |       reference_bases,
29 |       NTH(1,
30 |         alternate_bases) WITHIN RECORD AS alt,  # first alternate allele of the record
31 |       SUM(IF(0 = call.genotype,
32 |           1,
33 |           0)) WITHIN RECORD AS ref_allele_count,
34 |       SUM(IF(1 = call.genotype,
35 |           1,
36 |           0)) WITHIN RECORD AS alt_allele_count,  # genotype value 1 pairs with the first alternate
37 |       SUM(IF(0 != call.genotype
38 |           AND 1 != call.genotype,
39 |           1,
40 |           0)) WITHIN RECORD AS other_alt_allele_count,  # every other genotype value, whatever it is
41 |     FROM
42 |       [google.com:biggene:pgp.gvcf_variants]
43 |     WHERE
44 |       reference_bases != 'N'
45 |       AND contig_name = '17'
46 |       AND start_pos BETWEEN 41196312
47 |       AND 41277500
48 |       ),
49 |     (
50 |     SELECT
51 |       contig_name,
52 |       start_pos,
53 |       reference_bases,
54 |       NTH(2,
55 |         alternate_bases) WITHIN RECORD AS alt,  # second alternate allele of the record
56 |       SUM(IF(0 = call.genotype,
57 |           1,
58 |           0)) WITHIN RECORD AS ref_allele_count,
59 |       SUM(IF(2 = call.genotype,
60 |           1,
61 |           0)) WITHIN RECORD AS alt_allele_count,  # genotype value 2 pairs with the second alternate
62 |       SUM(IF(0 != call.genotype
63 |           AND 2 != call.genotype,
64 |           1,
65 |           0)) WITHIN RECORD AS other_alt_allele_count,  # every other genotype value, whatever it is
66 |     FROM
67 |       [google.com:biggene:pgp.gvcf_variants]
68 |     WHERE
69 |       reference_bases != 'N'
70 |       AND contig_name = '17'
71 |       AND start_pos BETWEEN 41196312
72 |       AND 41277500
73 |       )
74 |   WHERE
75 |     alt IS NOT NULL  # presumably drops records that lack a first/second alternate -- TODO confirm
76 |   GROUP BY
77 |     contig_name,
78 |     start_pos,
79 |     reference_bases,
80 |     alt
81 |     )


--------------------------------------------------------------------------------
/1000genomes/sql/shared-variant-counts-by-ethnicity.sql:
--------------------------------------------------------------------------------
 1 | #standardSQL
 2 | --
 3 | -- We'd like to see how the members of each super population share variation.
 4 | --
 5 | -- Let's generate a table where the records indicate:
 6 | --
 7 | -- For the variants that appear in a given super-population:
 8 | --  how many variants are singletons (not shared)?
 9 | --  how many variants are shared by exactly 2 individuals?
10 | --  how many variants are shared by exactly 3 individuals?
11 | --  etc ...
12 | --  how many variants are shared by all members of the super population?
13 | --
14 | -- The variants and counts are further partitioned by whether the variant is common or rare.
15 | --
16 | WITH
17 |   population_counts AS (
18 |   SELECT
19 |     super_population,
20 |     COUNT(population) AS super_population_count  -- sample_info rows per super population that pass the WHERE below
21 |   FROM
22 |     `genomics-public-data.1000_genomes.sample_info`
23 |   WHERE
24 |     In_Phase1_Integrated_Variant_Set = TRUE
25 |   GROUP BY
26 |     super_population),
27 |   -- One row per call that carries at least one non-reference allele, excluding X, Y and MT.
28 |   autosome_calls AS (
29 |   SELECT
30 |     reference_name,
31 |     start,
32 |     `end`,
33 |     reference_bases,
34 |     alternate_bases[ORDINAL(1)] AS alt,  -- 1000 Genomes is biallelic.
35 |     vt,
36 |     af IS NOT NULL
37 |     AND af >= 0.05 AS is_common_variant,  -- common means af of at least 0.05
38 |     call.call_set_name,
39 |     super_population
40 |   FROM
41 |     `genomics-public-data.1000_genomes.variants` AS v, v.call AS call
42 |   JOIN
43 |     `genomics-public-data.1000_genomes.sample_info` AS p
44 |   ON
45 |     call.call_set_name = p.sample
46 |   WHERE
47 |     reference_name NOT IN ("X", "Y", "MT")
48 |     AND EXISTS (SELECT gt FROM UNNEST(call.genotype) gt WHERE gt > 0)),
49 |   -- Per variant and super population: the number of calls kept above.
50 |   super_population_autosome_variants AS (
51 |   SELECT
52 |     reference_name,
53 |     start,
54 |     `end`,
55 |     reference_bases,
56 |     alt,
57 |     vt,
58 |     super_population,
59 |     is_common_variant,
60 |     COUNT(call_set_name) AS num_samples
61 |   FROM
62 |     autosome_calls
63 |   GROUP BY
64 |     reference_name,
65 |     start,
66 |     `end`,
67 |     reference_bases,
68 |     alt,
69 |     vt,
70 |     super_population,
71 |     is_common_variant )
72 |   --
73 |   -- For each super population and sharing level (num_samples), count the variants at that level.
74 | SELECT
75 |   p.super_population AS super_population,
76 |   super_population_count,
77 |   is_common_variant,
78 |   num_samples,
79 |   num_samples / super_population_count AS percent_samples,  -- a plain ratio; it is not multiplied by 100
80 |   COUNT(1) AS num_variants_shared_by_this_many_samples
81 | FROM
82 |   super_population_autosome_variants AS v
83 | JOIN population_counts AS p
84 | ON
85 |   v.super_population = p.super_population
86 | GROUP BY
87 |   super_population,
88 |   super_population_count,
89 |   is_common_variant,
90 |   num_samples,
91 |   percent_samples
92 | ORDER BY
93 |   num_samples,
94 |   super_population,
95 |   is_common_variant
96 | 


--------------------------------------------------------------------------------
/pgp/sql/schema-comparisons/missingness-brca1.sql:
--------------------------------------------------------------------------------
 1 | # Missingness rate for variants within BRCA1.
 2 | SELECT
 3 |   vars.contig_name AS contig_name,
 4 |   vars.start_pos AS start_pos,
 5 |   reference_bases,
 6 |   variant_called_count,
 7 |   SUM(refs.called_count) AS reference_called_count,
 8 |   variant_called_count + SUM(refs.called_count) AS num_alleles_called_for_position,
 9 |   1 - ((variant_called_count + SUM(refs.called_count))/(172*2)) AS missingness_rate  # 172*2 is hard-coded; presumably samples times 2 alleles -- TODO confirm
10 | FROM (
11 |   # JOIN our variant sample counts with the corresponding reference-matching blocks
12 |   SELECT
13 |     vars.contig_name,
14 |     vars.start_pos,
15 |     refs.start_pos,
16 |     vars.end_pos,
17 |     refs.END,
18 |     reference_bases,
19 |     variant_called_count,
20 |     refs.called_count
21 |   FROM (
22 |     # Constrain the left hand side of the JOIN to reference-matching blocks
23 |     SELECT
24 |       contig_name,
25 |       start_pos,
26 |       END,
27 |       IF(alternate_bases IS NULL,
28 |         FALSE,
29 |         TRUE) AS is_variant_call,
30 |       SUM(call.genotype >= 0) WITHIN RECORD AS called_count,  # negative genotype values are not counted
31 |     FROM
32 |       [google.com:biggene:pgp.gvcf_variants]
33 |     WHERE
34 |       contig_name = '17'
35 |     HAVING
36 |       is_variant_call = FALSE) AS refs
37 |   JOIN (
38 |     # Constrain the right hand side of the JOIN to variants
39 |     # Group our variant sample counts together since a single SNP may be in more than
40 |     # one row due to 1/2 genotypes
41 |     SELECT
42 |       contig_name,
43 |       start_pos,
44 |       end_pos,
45 |       reference_bases,
46 |       SUM(called_count) AS variant_called_count,
47 |     FROM (
48 |         # Limit the query to SNPs on chromosome 17 within BRCA1
49 |       SELECT
50 |         contig_name,
51 |         start_pos,
52 |         end_pos,
53 |         reference_bases,
54 |         LENGTH(reference_bases) AS ref_len,
55 |         MIN(LENGTH(alternate_bases)) WITHIN RECORD AS alt_len,
56 |         IF(alternate_bases IS NULL,
57 |           FALSE,
58 |           TRUE) AS is_variant_call,
59 |         SUM(call.genotype >= 0) WITHIN RECORD AS called_count,  # negative genotype values are not counted
60 |       FROM
61 |         [google.com:biggene:pgp.gvcf_variants]
62 |       WHERE
63 |         contig_name = '17'
64 |         AND start_pos BETWEEN 41196312
65 |         AND 41277500
66 |       HAVING
67 |         ref_len = 1
68 |         AND alt_len = 1
69 |         AND is_variant_call)
70 |     GROUP BY
71 |       contig_name,
72 |       start_pos,
73 |       end_pos,
74 |       reference_bases) AS vars
75 |   # The JOIN criteria is complicated since we are trying to see if a SNP overlaps an interval
76 |   ON
77 |     vars.contig_name = refs.contig_name
78 |   WHERE
79 |     refs.start_pos <= vars.start_pos
80 |     AND refs.END >= vars.end_pos
81 |     )
82 | GROUP BY
83 |   contig_name,
84 |   start_pos,
85 |   reference_bases,
86 |   variant_called_count
87 | 


--------------------------------------------------------------------------------
/pgp/sql/gvcf_variants_expanded/allelic-frequency.sql:
--------------------------------------------------------------------------------
 1 | # This is busted.  It over counts ref calls due to the GROUP BY operation.  It
 2 | # would work if we grouped all of the same variant into the same row prior to
 3 | # loading to BigQuery because then we would not need the GROUP BY operation.
 4 | #
 5 | # Note that the new BigQuery feature of user-defined javascript
 6 | # functions is in limited preview.  For more info, see
 7 | # https://www.youtube.com/watch?v=GrD7ymUPt3M#t=1377
 8 | SELECT
 9 |   contig_name,
10 |   start_pos,
11 |   reference_bases,
12 |   alternate_bases,
13 |   ref_count + alt_count + other_count AS num_sample_alleles,  # all called alleles; no-calls are excluded by the function
14 |   alt_count/(ref_count + alt_count + other_count) AS alt_freq,  # share of called alleles matching this alternate
15 | FROM (
16 |   SELECT
17 |     contig_name,
18 |     start_pos,
19 |     reference_bases,
20 |     alternate_bases,
21 |     SUM(alt_count) AS alt_count,
22 |     SUM(ref_count) AS ref_count,
23 |     SUM(other_count) AS other_count,
24 |   FROM (
25 |     SELECT contig_name, start_pos, reference_bases, alternate_bases, alt_count, ref_count, other_count
26 |       FROM js(
27 |       [google.com:biggene:test.pgp_gvcf_variants_expanded],
28 |       contig_name, start_pos, reference_bases, alternate_bases, call.genotype,
29 |         "[{name: 'contig_name', type: 'string'},
30 |           {name: 'start_pos', type: 'integer'},
31 |           {name: 'reference_bases', type: 'string'},
32 |           {name: 'alternate_bases', type: 'string'},
33 |           {name: 'alt_count', type: 'integer'},
34 |           {name: 'ref_count', type: 'integer'},
35 |           {name: 'other_count', type: 'integer'}]",
36 |         "function(r, emit) {
37 |            for(var a in r.alternate_bases) {
38 |              var alt_gt = parseInt(a, 10) + 1;  // for-in keys are strings, so a + 1 would concatenate
39 |              var ref_count = 0;
40 |              var alt_count = 0;
41 |              var other_count = 0;
42 |              for(var c in r.call) {
43 |                for(var g in r.call[c].genotype) {
44 |                  if(0 > r.call[c].genotype[g]) {
45 |                    // Don't count no-calls
46 |                    continue;
47 |                  } else if (0 == r.call[c].genotype[g]) {
48 |                    ref_count++;
49 |                  } else if (alt_gt == r.call[c].genotype[g]) {
50 |                    alt_count++;
51 |                  } else {
52 |                    other_count++;
53 |                  }
54 |                }
55 |              }
56 |              // Emit one record per alt
57 |              emit({
58 |                contig_name: r.contig_name,
59 |                start_pos: r.start_pos,
60 |                reference_bases: r.reference_bases,
61 |                alternate_bases: r.alternate_bases[a],
62 |                alt_count: alt_count,
63 |                ref_count: ref_count,
64 |                other_count: other_count
65 |              });
66 |            }
67 |          }"))
68 |   GROUP EACH BY
69 |     contig_name,
70 |     start_pos,
71 |     reference_bases,
72 |     alternate_bases)
73 | 


--------------------------------------------------------------------------------
/pgp/README.md:
--------------------------------------------------------------------------------
 1 | <!-- R Markdown Documentation, DO NOT EDIT THE PLAIN MARKDOWN VERSION OF THIS FILE -->
 2 | 
 3 | <!-- Copyright 2014 Google Inc. All rights reserved. -->
 4 | 
 5 | <!-- Licensed under the Apache License, Version 2.0 (the "License"); -->
 6 | <!-- you may not use this file except in compliance with the License. -->
 7 | <!-- You may obtain a copy of the License at -->
 8 | 
 9 | <!--     http://www.apache.org/licenses/LICENSE-2.0 -->
10 | 
11 | <!-- Unless required by applicable law or agreed to in writing, software -->
12 | <!-- distributed under the License is distributed on an "AS IS" BASIS, -->
13 | <!-- WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -->
14 | <!-- See the License for the specific language governing permissions and -->
15 | <!-- limitations under the License. -->
16 | 
17 | Personal Genome Project
18 | =================
19 | 
20 | ### Additional Resources
21 | * [Schema](https://bigquery.cloud.google.com/table/google.com:biggene:pgp.variants?pli=1)
22 | * [Provenance](./provenance)
23 | * [Data Stories](./data-stories) such as
24 |  * [Comparing PGP to 1000 Genomes](./data-stories/comparing-pgp-to-1000genomes)
25 |  * [Issues with the Variant-Centric Approach](./data-stories/issues-with-the-variant-centric-approach)
26 | 
27 | 
28 | 
29 | 
30 | **See [PGP Public data](http://googlegenomics.readthedocs.org/en/latest/use_cases/discover_public_data/pgp_public_data.html#bigquery-pgp-tables) for provenance details of the most recent import of the PGP data which has the up-to-date schema.**  The other tables you see here comprise a variety of schema experiments.  Some of the column names for common data may differ from those of your own variants data exported to BigQuery.
31 | 
32 | Here is an initial query joining the variant data with the phenotypic data.  See the [phenotypes schema](https://bigquery.cloud.google.com/table/google.com:biggene:pgp.phenotypes?pli=1) for more detail.
33 | 
34 | 
35 | ```
36 | # Compute sample count by gender
37 | SELECT
38 |   Sex_Gender,
39 |   COUNT(1) AS cnt
40 | FROM
41 |   (
42 |   SELECT
43 |     call.callset_name,
44 |     Sex_Gender
45 |   FROM
46 |     FLATTEN([google.com:biggene:pgp.variants],
47 |       call) AS var
48 |   JOIN
49 |     [google.com:biggene:pgp.phenotypes] AS pheno
50 |   ON
51 |     pheno.Participant = var.call.callset_name
52 |   GROUP BY
53 |     call.callset_name,
54 |     Sex_Gender)
55 | GROUP BY
56 |   Sex_Gender
57 | ```
58 | 
59 | <!-- html table generated in R 3.1.2 by xtable 1.7-4 package -->
60 | <!-- Tue Apr 14 07:55:07 2015 -->
61 | <table border=1>
62 | <tr> <th> Sex_Gender </th> <th> cnt </th>  </tr>
63 |   <tr> <td> Female </td> <td align="right">  53 </td> </tr>
64 |   <tr> <td> Male </td> <td align="right"> 112 </td> </tr>
65 |   <tr> <td>  </td> <td align="right">   6 </td> </tr>
66 |    </table>
67 | 
68 | <img src="figure/gender-1.png" title="plot of chunk gender" alt="plot of chunk gender" style="display: block; margin: auto;" />
69 | 


--------------------------------------------------------------------------------
/pgp/provenance/cgi-header-mapper.py:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/env python
 2 | 
 3 | # Copyright 2014 Google Inc. All Rights Reserved.
 4 | #
 5 | # Licensed under the Apache License, Version 2.0 (the "License");
 6 | # you may not use this file except in compliance with the License.
 7 | # You may obtain a copy of the License at
 8 | #
 9 | #     http://www.apache.org/licenses/LICENSE-2.0
10 | #
11 | # Unless required by applicable law or agreed to in writing, software
12 | # distributed under the License is distributed on an "AS IS" BASIS,
13 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14 | # See the License for the specific language governing permissions and
15 | # limitations under the License.
16 | 
17 | """Count header values found within CGI files.
18 | 
19 | Assumptions:
20 | - one sample per input file
21 | 
22 | This script can be run standalone:
23 |    cat masterVarBeta-GS000010426-ASM.tsv | ./cgi-header-mapper.py
24 | 
25 | Or via the debugger:
26 |    python -mpdb ./cgi-header-mapper.py masterVarBeta-GS000010426-ASM.tsv
27 | 
28 | It can also be run as a Hadoop Streaming job:
29 |   hadoop jar /path/to/your/hadoop-streaming-*.jar -input inputpath \
30 |   -mapper cgi-header-mapper.py -file cgi-header-mapper.py \
31 |   -reducer aggregate -output outputpath
32 | 
33 | See also https://cloud.google.com/hadoop/
34 | """
35 | 
36 | import os
37 | import re
38 | import sys
39 | 
40 | # Constants
41 | INPUT_FILE_KEY = "map_input_file"
42 | DUPLICATE_GENOME = "gs://pgp-harvard-data-public/hu34D5B9/GS000012763-DID/GS000010327-ASM/GS01173-DNA_C07/ASM/masterVarBeta-GS000010327-ASM.tsv.bz2"
43 | 
44 | 
45 | def generate_long_count_token(value):
46 |   """Builds a 'LongValueSum:<value><TAB>1' record for the Hadoop Aggregate package.
47 | 
48 |   For more detail, see
49 |   http://hadoop.apache.org/docs/r1.2.1/streaming.html#Hadoop+Aggregate+Package
50 | 
51 |   Args:
52 |     value: (string) the key being counted; it is placed before the tab separator
53 | 
54 |   Returns:
55 |     (string) 'LongValueSum:' followed by value, a tab, and the count '1'
56 |   """
57 |   return "LongValueSum:" + value + "\t" + "1"
58 | 
59 | 
60 | def main():
61 |   """Entry point to the script."""
62 | 
63 |   # Basic parsing of command line arguments to allow a filename
64 |   # to be passed when running this code in the debugger.
65 |   file_handle = sys.stdin
66 |   if 2 <= len(sys.argv):
67 |     path = sys.argv[1]
68 |     file_handle = open(path, "r")
69 |   else:
70 |     path = os.environ[INPUT_FILE_KEY]
71 |     print >> sys.stderr, path
72 |     print >> sys.stderr, str(os.environ)
73 | 
74 |   line = file_handle.readline()
75 |   while line:
76 |     line = line.rstrip("\n")
77 | 
78 |     if DUPLICATE_GENOME == path:
79 |       # hu34D5B9 was sequenced twice, skip the older genome
80 |       pass
81 |     elif not line:
82 |       # This is a blank line, skip it
83 |       pass
84 |     elif "#" == line[0]:
85 |       # This is a header line, count it
86 |       print generate_long_count_token(re.sub("\t", " ", line))
87 | 
88 |     line = file_handle.readline()
89 | 
90 | if __name__ == "__main__":
91 |   main()
92 | 


--------------------------------------------------------------------------------
/pgp/README.Rmd:
--------------------------------------------------------------------------------
 1 | <!-- R Markdown Documentation, DO NOT EDIT THE PLAIN MARKDOWN VERSION OF THIS FILE -->
 2 | 
 3 | <!-- Copyright 2014 Google Inc. All rights reserved. -->
 4 | 
 5 | <!-- Licensed under the Apache License, Version 2.0 (the "License"); -->
 6 | <!-- you may not use this file except in compliance with the License. -->
 7 | <!-- You may obtain a copy of the License at -->
 8 | 
 9 | <!--     http://www.apache.org/licenses/LICENSE-2.0 -->
10 | 
11 | <!-- Unless required by applicable law or agreed to in writing, software -->
12 | <!-- distributed under the License is distributed on an "AS IS" BASIS, -->
13 | <!-- WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -->
14 | <!-- See the License for the specific language governing permissions and -->
15 | <!-- limitations under the License. -->
16 | 
17 | Personal Genome Project
18 | =================
19 | 
20 | ### Additional Resources
21 | * [Schema](https://bigquery.cloud.google.com/table/google.com:biggene:pgp.variants?pli=1)
22 | * [Provenance](./provenance)
23 | * [Data Stories](./data-stories) such as
24 |  * [Comparing PGP to 1000 Genomes](./data-stories/comparing-pgp-to-1000genomes)
25 |  * [Issues with the Variant-Centric Approach](./data-stories/issues-with-the-variant-centric-approach)
26 | 
27 | 
28 | ```{r init, echo=FALSE, message=FALSE, warning=FALSE, comment=NA}
29 | require(bigrquery)
30 | require(ggplot2)
31 | require(dplyr)
32 | require(xtable)
33 | require(testthat)
34 | project <- "google.com:biggene" # put your projectID here
35 | DisplayAndDispatchQuery <- function(queryUri) {  # queryUri: path to a file containing the SQL text
36 |   querySql <- readChar(queryUri, nchars=1e6)  # reads at most 1e6 characters from the file
37 |   cat(querySql)  # echo the SQL so it appears in the rendered document
38 |   query_exec(querySql, project)  # runs the query under `project`; its result is the function's value
39 | }
40 | ```
41 | 
42 | **See [PGP Public data](http://googlegenomics.readthedocs.org/en/latest/use_cases/discover_public_data/pgp_public_data.html#bigquery-pgp-tables) for provenance details of the most recent import of the PGP data which has the up-to-date schema.**  The other tables you see here comprise a variety of schema experiments.  Some of the column names for common data may differ from those of your own variants data exported to BigQuery.
43 | 
44 | Here is an initial query joining the variant data with the phenotypic data.  See the [phenotypes schema](https://bigquery.cloud.google.com/table/google.com:biggene:pgp.phenotypes?pli=1) for more detail.
45 | 
46 | ```{r echo=FALSE, message=FALSE, warning=FALSE, error=FALSE, comment=NA}
47 | result <- DisplayAndDispatchQuery("./sql/gender-count.sql")
48 | ```
49 | 
50 | ```{r echo=FALSE, message=FALSE, warning=FALSE, comment=NA, results="asis"}
51 | print(xtable(head(result)), type="html", include.rownames=F)
52 | ```
53 | 
54 | ```{r gender, echo=FALSE, message=FALSE, warning=FALSE, comment=NA, fig.align="center", fig.width=6, fig.height=4}
55 | result$Sex_Gender[is.na(result$Sex_Gender)] <- "Unknown"
56 | ggplot(result, aes(x="", y=cnt, fill=Sex_Gender)) +
57 |   geom_bar(width=1, stat="identity") +
58 |   coord_polar("y", start=pi / 3) +
59 |   xlab("") + ylab("Gender Count")
60 | ```
61 | 


--------------------------------------------------------------------------------
/1000genomes/data-stories/reproducing-vcfstats/vcfstats-output/stats.private:
--------------------------------------------------------------------------------
  1 | #Private SNPs	Sample
  2 | 1	HG00106
  3 | 1	HG00109
  4 | 1	HG00143
  5 | 3	HG00152
  6 | 1	HG00160
  7 | 1	HG00186
  8 | 1	HG00231
  9 | 1	HG00234
 10 | 2	HG00235
 11 | 1	HG00236
 12 | 2	HG00237
 13 | 1	HG00244
 14 | 1	HG00246
 15 | 1	HG00247
 16 | 1	HG00249
 17 | 2	HG00329
 18 | 1	HG00342
 19 | 1	HG00355
 20 | 1	HG00367
 21 | 2	HG00384
 22 | 1	HG00422
 23 | 1	HG00442
 24 | 1	HG00452
 25 | 1	HG00475
 26 | 1	HG00534
 27 | 1	HG00556
 28 | 1	HG00560
 29 | 2	HG00611
 30 | 1	HG00641
 31 | 1	HG00654
 32 | 1	HG00656
 33 | 1	HG00671
 34 | 1	HG00693
 35 | 1	HG00699
 36 | 1	HG00701
 37 | 1	HG00707
 38 | 1	HG00708
 39 | 3	HG00737
 40 | 1	HG00740
 41 | 1	HG01048
 42 | 1	HG01060
 43 | 1	HG01069
 44 | 2	HG01108
 45 | 2	HG01124
 46 | 1	HG01148
 47 | 1	HG01149
 48 | 1	HG01171
 49 | 1	HG01191
 50 | 1	HG01272
 51 | 1	HG01356
 52 | 1	HG01375
 53 | 1	HG01378
 54 | 3	HG01390
 55 | 1	HG01456
 56 | 1	HG01462
 57 | 1	HG01465
 58 | 1	HG01488
 59 | 1	HG01489
 60 | 1	HG01491
 61 | 1	HG01495
 62 | 8	HG01551
 63 | 1	HG01624
 64 | 1	NA07051
 65 | 1	NA12342
 66 | 2	NA12383
 67 | 2	NA12400
 68 | 1	NA18507
 69 | 2	NA18510
 70 | 1	NA18519
 71 | 1	NA18523
 72 | 1	NA18527
 73 | 1	NA18528
 74 | 1	NA18532
 75 | 1	NA18534
 76 | 2	NA18535
 77 | 1	NA18536
 78 | 1	NA18539
 79 | 1	NA18548
 80 | 1	NA18557
 81 | 2	NA18562
 82 | 2	NA18573
 83 | 1	NA18596
 84 | 1	NA18605
 85 | 1	NA18606
 86 | 3	NA18616
 87 | 1	NA18622
 88 | 1	NA18628
 89 | 1	NA18631
 90 | 1	NA18634
 91 | 1	NA18638
 92 | 1	NA18641
 93 | 1	NA18856
 94 | 3	NA18868
 95 | 1	NA18907
 96 | 1	NA18924
 97 | 2	NA18939
 98 | 3	NA18956
 99 | 1	NA18957
100 | 2	NA18962
101 | 1	NA18976
102 | 1	NA18990
103 | 1	NA18992
104 | 1	NA18995
105 | 1	NA19002
106 | 1	NA19005
107 | 1	NA19020
108 | 1	NA19046
109 | 1	NA19056
110 | 1	NA19059
111 | 2	NA19063
112 | 1	NA19068
113 | 1	NA19074
114 | 2	NA19077
115 | 1	NA19084
116 | 2	NA19087
117 | 1	NA19093
118 | 14	NA19096
119 | 1	NA19099
120 | 1	NA19131
121 | 1	NA19147
122 | 1	NA19149
123 | 1	NA19150
124 | 1	NA19197
125 | 1	NA19236
126 | 1	NA19248
127 | 1	NA19316
128 | 1	NA19318
129 | 1	NA19319
130 | 1	NA19324
131 | 1	NA19332
132 | 2	NA19346
133 | 1	NA19351
134 | 1	NA19360
135 | 1	NA19372
136 | 1	NA19395
137 | 1	NA19398
138 | 1	NA19401
139 | 1	NA19437
140 | 1	NA19439
141 | 1	NA19440
142 | 1	NA19457
143 | 1	NA19463
144 | 1	NA19467
145 | 1	NA19474
146 | 1	NA19661
147 | 1	NA19701
148 | 2	NA19704
149 | 1	NA19716
150 | 1	NA19717
151 | 1	NA19719
152 | 1	NA19734
153 | 1	NA19740
154 | 1	NA19749
155 | 1	NA19752
156 | 1	NA19755
157 | 1	NA19758
158 | 1	NA19761
159 | 1	NA19762
160 | 1	NA19774
161 | 2	NA19780
162 | 1	NA19782
163 | 1	NA19819
164 | 1	NA19901
165 | 1	NA19904
166 | 1	NA19921
167 | 1	NA20294
168 | 1	NA20296
169 | 2	NA20322
170 | 1	NA20342
171 | 1	NA20344
172 | 1	NA20351
173 | 1	NA20505
174 | 1	NA20506
175 | 2	NA20521
176 | 1	NA20581
177 | 1	NA20582
178 | 1	NA20589
179 | 1	NA20756
180 | 1	NA20760
181 | 1	NA20768
182 | 1	NA20792
183 | 1	NA20796
184 | 1	NA20803
185 | 1	NA20805
186 | 1	NA20809
187 | 2	NA20819
188 | 1	NA20826
189 | 


--------------------------------------------------------------------------------
/CONTRIBUTING.rst:
--------------------------------------------------------------------------------
 1 | How to contribute
 2 | ===================================
 3 | 
 4 | First of all, thank you for contributing!
 5 | 
 6 | The mailing list
 7 | ----------------
 8 | 
 9 | For general questions or if you are having trouble getting started, try the 
10 | `Google Genomics Discuss mailing list <https://groups.google.com/forum/#!forum/google-genomics-discuss>`_. 
11 | It's a good way to sync up with other people who use googlegenomics including the core developers. You can subscribe
12 | by sending an email to ``google-genomics-discuss+subscribe@googlegroups.com`` or just post using
13 | the `web forum page <https://groups.google.com/forum/#!forum/google-genomics-discuss>`_.
14 | 
15 | 
16 | Submitting issues
17 | -----------------
18 | 
19 | If you are encountering a bug in the code or have a feature request in mind - file away! 
20 | 
21 | 
22 | Submitting a pull request
23 | -------------------------
24 | 
25 | If you are ready to contribute code, GitHub provides a nice `overview on how to create a pull request
26 | <https://help.github.com/articles/creating-a-pull-request>`_.
27 | 
28 | Some general rules to follow:
29 | 
30 | * Do your work in `a fork <https://help.github.com/articles/fork-a-repo>`_ of this repo.
31 | * Create a branch for each update that you're working on. 
32 |   These branches are often called "feature" or "topic" branches. Any changes
33 |   that you push to your feature branch will automatically be shown in the pull request.
34 | * Keep your pull requests as small as possible. Large pull requests are hard to review. 
35 |   Try to break up your changes into self-contained and incremental pull requests.
36 | * The first line of commit messages should be a short (<80 character) summary, 
37 |   followed by an empty line and then any details that you want to share about the commit.
38 | * Please try to follow the existing syntax style
39 | 
40 | When you submit or change your pull request, the Travis build system will automatically run tests. 
41 | If your pull request fails to pass tests, review the test log, make changes and
42 | then push them to your feature branch to be tested again.
43 | 
44 | 
45 | Contributor License Agreements
46 | ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
47 | 
48 | All pull requests are welcome. Before we can submit them though, there is a legal hurdle we have to jump. 
49 | You'll need to fill out either the individual or corporate Contributor License Agreement
50 | (CLA).
51 | 
52 | * If you are an individual writing original source code and you're sure you
53 |   own the intellectual property, then you'll need to sign an `individual CLA
54 |   <https://developers.google.com/open-source/cla/individual>`_.
55 | * If you work for a company that wants to allow you to contribute your work,
56 |   then you'll need to sign a `corporate CLA
57 |   <https://developers.google.com/open-source/cla/corporate>`_.
58 | 
59 | Follow either of the two links above to access the appropriate CLA and
60 | instructions for how to sign and return it. Once we receive it, we'll be able to
61 | accept your pull requests.
62 | 


--------------------------------------------------------------------------------
/pgp/data-stories/schema-comparisons/schema-comparison-observations.csv:
--------------------------------------------------------------------------------
 1 | tables,table_size,description,code,runtime,data_processed,notes
 2 | cgi_variants,433GB,sample-level data for a particular variant,klotho.sql,4.1s elapsed,117 GB processed,
 3 | gvcf_variants,235GB,sample-level data for a particular variant,klotho.sql,6.9s elapsed,76.8 GB processed,
 4 | gvcf_variants_expanded,506GB,sample-level data for a particular variant,klotho.sql,7.7s elapsed,196 GB processed,
 5 | cgi_variants,433GB,per sample Ti/Tv ratio,ti-tv-ratio.sql,3.8s elapsed,53.7 GB processed,
 6 | gvcf_variants,235GB,per sample Ti/Tv ratio,ti-tv-ratio.sql,29.4s elapsed,59.8 GB processed,
 7 | gvcf_variants_expanded,506GB,per sample Ti/Tv ratio,ti-tv-ratio.sql,83.5s elapsed,185 GB processed,
 8 | cgi_variants,433GB,allelic frequency on a small region of the genome,allelic-frequency-brca1.sql,50.3s elapsed,117 GB processed,
 9 | gvcf_variants,235GB,allelic frequency on a small region of the genome,allelic-frequency-brca1.sql,15.7s elapsed,53.9 GB processed,
10 | gvcf_variants_expanded,506GB,allelic frequency on a small region of the genome,NA,NA,NA,"the pattern is correct but the result will be wrong until records for ""the same"" variant are merged together"
11 | cgi_variants,433GB,"allele counts for the full dataset, as step one out of two to compute allelic frequency for the full dataset",allele-count.sql,"70.2s elapsed
12 | ","88.8 GB processed
13 | 
14 | ",result materialized to table google.com:biggene:pgp_analysis_results.cgi_variants_allele_counts
15 | gvcf_variants,235GB,"allele counts for the full dataset, as step one out of two to compute allelic frequency for the full dataset",allele-count.sql,51.6s elapsed,44.1 GB processed,result materialized to table google.com:biggene:pgp_analysis_results.gvcf_variants_allele_counts
16 | gvcf_variants_expanded,506GB,"allele counts for the full dataset, as step one out of two to compute allelic frequency for the full dataset",NA,NA,NA,"not necessary, the data encoding allows us to do allelic frequency in a single step"
17 | cgi_variants,433GB,allelic frequency as step two of two,allelic-frequency-chr1.sql,118.4s elapsed,90.4 GB processed,results for all chromosomes materialized to table google.com:biggene:pgp_analysis_results.cgi_variants_allelic_frequency
18 | gvcf_variants,235GB,allelic frequency as step two of two,allelic-frequency-chr1.sql,96.4s elapsed,55.2 GB processed,results for all chromosomes materialized to table google.com:biggene:pgp_analysis_results.gvcf_variants_allelic_frequency
19 | gvcf_variants_expanded,506GB,allelic frequency,allelic-frequency.sql,318.7s elapsed,121 GB processed,"the pattern is correct but the result will be wrong until records for ""the same"" variant are merged together"
20 | cgi_variants,433GB,"allelic frequency compared to 1,000 genomes",allelic-frequency-comparison.sql,20.7s elapsed,2.93 GB processed,
21 | gvcf_variants,235GB,"allelic frequency compared to 1,000 genomes",allelic-frequency-comparison.sql,12.5s elapsed,2.72 GB processed,
22 | gvcf_variants_expanded,506GB,"allelic frequency compared to 1,000 genomes",NA,NA,NA,"the pattern is correct but the result will be wrong until records for ""the same"" variant are merged together"


--------------------------------------------------------------------------------
/1000genomes/provenance/README.md:
--------------------------------------------------------------------------------
 1 | Provenance
 2 | ========================================================
 3 | 
 4 | Source Variant Data
 5 | ------------------------------
 6 | 
 7 | ### variants table
 8 | 
 9 | See [Google Genomics Public Data](http://googlegenomics.readthedocs.io/en/latest/use_cases/discover_public_data/1000_genomes.html) for provenance details for this data.
10 | 
11 | Source Sample Information
12 | --------------------------------
13 | [Ethnicity, gender, and family relationship](http://www.1000genomes.org/faq/can-i-get-phenotype-gender-and-family-relationship-information-samples) information is available for the 1,000 Genomes dataset.  Super population groupings are described in the [FAQ](http://www.1000genomes.org/category/frequently-asked-questions/population).
14 | 
15 | Note: information for sample NA12236 is present in the pedigree table but not sample_info table.  Also sample NA12236 is not a member of the samples within table variants1kg.
16 | 
17 | ### sample_info table
18 | 
19 | Description: 
20 | * http://ftp.1000genomes.ebi.ac.uk/vol1/ftp/technical/working/20130606_sample_info/README_20130606_sample_info
21 | * http://ftp.1000genomes.ebi.ac.uk/vol1/ftp/README.populations
22 | * [BigQuery table](https://bigquery.cloud.google.com/table/genomics-public-data:1000_genomes.sample_info?pli=1)
23 | 
24 | Source: 
25 | * http://ftp.1000genomes.ebi.ac.uk/vol1/ftp/technical/working/20130606_sample_info/20130606_sample_info.txt 
26 | * http://ftp.1000genomes.ebi.ac.uk/vol1/ftp/20131219.populations.tsv
27 | * http://ftp.1000genomes.ebi.ac.uk/vol1/ftp/20131219.superpopulations.tsv
28 | 
29 | Status: 
30 | * complete, see script [sample-info-prep.R](./sample-info-prep.R) to see how the data was cleaned and transformed prior to the upload to BigQuery
31 | 
32 | To load the script output via the [bq command line tool](https://cloud.google.com/bigquery/bq-command-line-tool#creatingtablefromfile), run:
33 | ```
34 | bq load --project_id <YOUR-PROJECT_ID> --source_format=CSV \
35 | --skip_leading_rows=1 <YOUR_DATASET.YOUR_TABLE> \
36 | gs://genomics-public-data/1000-genomes/other/sample_info/sample_info.csv \
37 | gs://genomics-public-data/1000-genomes/other/sample_info/sample_info.schema
38 | ```
39 | 
40 | ### pedigree table
41 | 
42 | Description: 
43 | * http://ftp.1000genomes.ebi.ac.uk/vol1/ftp/technical/working/20130606_sample_info/README_20130606_sample_info
44 | * [BigQuery table](https://bigquery.cloud.google.com/table/genomics-public-data:1000_genomes.pedigree?pli=1)
45 | 
46 | Source:  
47 | * http://ftp.1000genomes.ebi.ac.uk/vol1/ftp/technical/working/20130606_sample_info/20130606_g1k.ped
48 | 
49 | Status: 
50 | * complete, no cleaning or transformation needed
51 | 
52 | To load the source file via the [bq command line tool](https://cloud.google.com/bigquery/bq-command-line-tool#creatingtablefromfile), download it to your local system and run:
53 | ```
54 | bq load --project_id <YOUR-PROJECT_ID> --source_format=CSV \
55 | --field_delimiter=tab --skip_leading_rows=1 <YOUR_DATASET.YOUR_TABLE> \
56 | ./20130606_g1k.ped \
57 | Family_ID:STRING,Individual_ID:STRING,Paternal_ID:STRING,Maternal_ID:STRING,Gender:INTEGER,Phenotype:INTEGER,Population:STRING,Relationship:STRING,Siblings:STRING,Second_Order:STRING,Third_Order:STRING,Other_Comments:STRING
58 | ```
59 | 


--------------------------------------------------------------------------------
/pgp/sql/cgi_variants/allelic-frequency-chr1.sql:
--------------------------------------------------------------------------------
 1 | # Compute allelic frequency for chromosome 1 by counting the number of called
 2 | # alleles (reference-calls and variant-calls, but leave out no-calls) that 
 3 | # overlap each variant allele for which we previously counted its occurrence
 4 | # in this dataset.  This returns a large result which should be materialized to 
 5 | # a table.
 6 | #
 7 | # Note that the new BigQuery feature of user-defined JavaScript
 8 | # functions is in limited preview.  For more info, see
 9 | # https://www.youtube.com/watch?v=GrD7ymUPt3M#t=1377
10 | SELECT
11 |   vars.chromosome AS chromosome,
12 |   vars.reference AS reference,
13 |   vars.locusBegin AS locusBegin,
14 |   vars.locusEnd AS locusEnd,
15 |   vars.allele AS allele,
16 |   alternate_allele_count,
17 |   num_alleles_called,
18 |   ROUND(alternate_allele_count / num_alleles_called,
19 |     4) AS freq,
20 | FROM (
21 |   SELECT
22 |     vars.chromosome,
23 |     vars.reference,
24 |     vars.locusBegin,
25 |     vars.locusEnd,
26 |     vars.allele,
27 |     alternate_allele_count,
28 |     SUM(num_alleles_called) AS num_alleles_called,
29 |   FROM (
30 |     # The left hand side of our JOIN is all the calls, including
31 |     # reference calls (alleles equal to '?' are not counted as called)
32 |     SELECT
33 |         SUM(num_alleles_called) AS num_alleles_called,
34 |         chromosome,
35 |         reference,
36 |         bin,
37 |         locusBegin,
38 |         locusEnd
39 |       # This user-defined function emits one row per 5000-wide bin a call spans,
40 |       # so the JOIN below can match on bin and avoid a huge cross product
41 |       FROM js(
42 |       (SELECT chromosome, reference, locusBegin, locusEnd, allele1Seq, allele2Seq,
43 |        FROM [google.com:biggene:pgp.cgi_variants]
44 |        WHERE chromosome = 'chr1'),
45 |       chromosome, reference, locusBegin, locusEnd, allele1Seq, allele2Seq,
46 |       "[{name: 'num_alleles_called', type: 'integer'},
47 |         {name: 'chromosome', type: 'string'},
48 |         {name: 'reference', type: 'string'},
49 |         {name: 'bin', type: 'integer'},
50 |         {name: 'locusBegin', type: 'integer'},
51 |         {name: 'locusEnd', type: 'integer'}]",
52 |        "function(r, emit) {
53 |             var num_alleles_called = 0;
54 |             if('?' != r.allele1Seq) { num_alleles_called++; }
55 |             if('?' != r.allele2Seq) { num_alleles_called++; }
56 |             var binSize = 5000
57 |             var startBin = Math.floor(r.locusBegin / binSize);
58 |             var endBin = Math.floor(r.locusEnd / binSize);
59 |             for(var bin = startBin; bin <= endBin; bin++) {
60 |               emit({
61 |                 num_alleles_called: num_alleles_called,
62 |                 chromosome: r.chromosome,
63 |                 reference: r.reference,
64 |                 bin: bin,
65 |                 locusBegin: r.locusBegin,
66 |                 locusEnd: r.locusEnd,
67 |               });
68 |             }
69 |         }")
70 |         GROUP EACH BY
71 |         chromosome,
72 |         reference,
73 |         bin,
74 |         locusBegin,
75 |         locusEnd
76 |         ) AS all
77 |   JOIN
78 |     EACH 
79 |     # The right hand side of our JOIN is the table of counts of alternate allele
80 |     # values at a particular locus
81 |     [google.com:biggene:pgp_analysis_results.cgi_variants_allele_counts] AS vars
82 |   ON
83 |     vars.chromosome = all.chromosome
84 |     AND vars.bin = all.bin
85 |   WHERE
86 |     # Further constrain the JOIN to calls that overlapped the first base pair
87 |     # of this variant
88 |     all.locusBegin <= vars.locusBegin
89 |     AND all.locusEnd >= vars.locusBegin+1
90 |   GROUP EACH BY
91 |     vars.chromosome,
92 |     vars.reference,
93 |     vars.locusBegin,
94 |     vars.locusEnd,
95 |     vars.allele,
96 |     alternate_allele_count
97 |     )
98 | 


--------------------------------------------------------------------------------
/pgp/provenance/cgi-mapper.py:
--------------------------------------------------------------------------------
  1 | #!/usr/bin/env python
  2 | 
  3 | # Copyright 2014 Google Inc. All Rights Reserved.
  4 | #
  5 | # Licensed under the Apache License, Version 2.0 (the "License");
  6 | # you may not use this file except in compliance with the License.
  7 | # You may obtain a copy of the License at
  8 | #
  9 | #     http://www.apache.org/licenses/LICENSE-2.0
 10 | #
 11 | # Unless required by applicable law or agreed to in writing, software
 12 | # distributed under the License is distributed on an "AS IS" BASIS,
 13 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 14 | # See the License for the specific language governing permissions and
 15 | # limitations under the License.
 16 | 
 17 | """Add sample id as column to CGI data.
 18 | 
 19 | Assumptions:
 20 | - one sample per input file
 21 | 
 22 | This script can be run standalone:
 23 |    cat masterVarBeta-GS000010426-ASM.tsv | ./cgi-mapper.py
 24 | 
 25 | Or via the debugger:
 26 |    python -mpdb ./cgi-mapper.py masterVarBeta-GS000010426-ASM.tsv
 27 | 
 28 | To have the sample id correctly parsed when input is from stdin, set the 
 29 | environment variable that Hadoop would set:
 30 |    export map_input_file=./hu34D5B9/masterVarBeta-GS000015891-ASM.tsv.bz2
 31 |    bzcat ./hu34D5B9/masterVarBeta-GS000015891-ASM.tsv.bz2 | ./cgi-mapper.py
 32 | 
 33 | To have the sample id correctly parsed when input is from a file, ensure that it
 34 | is in the file path:
 35 |    python -mpdb ./cgi-mapper.py hu34D5B9/masterVarBeta-GS000015891-ASM.tsv
 36 | 
 37 | It can also be run as a mapper-only Hadoop Streaming job:
 38 |   hadoop jar /path/to/your/hadoop-streaming-*.jar -input inputpath \
 39 |   -mapper cgi-mapper.py -file cgi-mapper.py --numReduceTasks 0 \
 40 |   -output outputpath
 41 | See also https://cloud.google.com/hadoop/
 42 | 
 43 | TODO(deflaux):
 44 |  - field relativeCoverageDiploid contains some values that are 'N', consider
 45 |    converting those values to null
 46 |  - consider converting zero-based positions to one-based positions if we
 47 |    find that most annotations are one-based
 48 | 
 49 | """
 50 | 
 51 | import os
 52 | import re
 53 | import sys
 54 | 
 55 | # Constants used by main() below
 56 | INPUT_FILE_KEY = "map_input_file"  # environment variable holding the input path when reading stdin
 57 | SAMPLE_ID_PATTERN = "/(hu[A-F0-9]{6})/"  # path component holding the sample id: "hu" + 6 chars of A-F/0-9
 58 | DUPLICATE_GENOME = "gs://pgp-harvard-data-public/hu34D5B9/GS000012763-DID/GS000010327-ASM/GS01173-DNA_C07/ASM/masterVarBeta-GS000010327-ASM.tsv.bz2"  # input path for which main() emits no rows
 59 | 
 60 | 
 61 | def main():
 62 |   """Entry point to the script."""
 63 | 
 64 |   sample_id = None
 65 |   sample_id_re = re.compile(SAMPLE_ID_PATTERN)
 66 | 
 67 |   # Basic parsing of command line arguments to allow a filename
 68 |   # to be passed when running this code in the debugger.
 69 |   path = None
 70 |   file_handle = sys.stdin
 71 |   if 2 <= len(sys.argv):
 72 |     path = sys.argv[1]
 73 |     file_handle = open(path, "r")
 74 |   elif INPUT_FILE_KEY in os.environ:
 75 |     path = os.environ[INPUT_FILE_KEY]
 76 |     print >> sys.stderr, path
 77 |     print >> sys.stderr, str(os.environ)
 78 |   
 79 |   if path is not None:
 80 |     match = sample_id_re.search(path)
 81 |     if match:
 82 |       sample_id = match.group(1)
 83 | 
 84 |   line = file_handle.readline()
 85 |   while line:
 86 |     line = line.rstrip("\n")
 87 | 
 88 |     if DUPLICATE_GENOME == path:
 89 |       # hu34D5B9 was sequenced twice, skip the older genome
 90 |       pass
 91 |     elif not line:
 92 |       # This is a blank line, skip it
 93 |       pass
 94 |     elif "#" == line[0]:
 95 |       # This is a header line, skip it
 96 |       pass
 97 |     elif ">" == line[0]:
 98 |       # This is the column header line, skip it
 99 |       pass
100 |     else:
101 |       fields = line.split("\t")
102 |       print "%s\t%s" % (sample_id, "\t".join(fields))
103 | 
104 |     line = file_handle.readline()
105 | 
106 | if __name__ == "__main__":
107 |   main()
108 | 


--------------------------------------------------------------------------------
/1000genomes/sql/gwas-pattern-two-proportion-z-test.sql:
--------------------------------------------------------------------------------
  1 | # An example of a pattern one might use for GWAS queries upon 1,000
  2 | # Genomes variants.  It specifically examines differences in allelic
  3 | # frequency for variants upon chromosome 12 between the EAS super
  4 | # population versus all other individuals, returning a ranked list of
  5 | # variants by decreasing variation between groups.  Note that this
  6 | # particular query below is naive in many, many respects and is merely
  7 | # meant as an over-simplified example that might help domain experts
  8 | # translate their scientifically correct data filtering and
  9 | # statistical methods to BigQuery.  Feedback to improve this query is
 10 | # most welcome!
 11 | 
 12 | # http://www.statisticslectures.com/topics/ztestproportions/
 13 | # two-proportion z-test
 14 | # z-score critical value for p-value=5*10^-8 is +/-5.45131
 15 | #
 16 | #   > qnorm(1 - ((5e-8)/2), lower.tail=T)
 17 | #   [1] 5.45131
 18 | #   > qnorm(1 - ((5e-8)/2), lower.tail=F)
 19 | #   [1] -5.45131
 20 | 
 21 | SELECT
 22 |   reference_name,
 23 |   start,
 24 |   END,
 25 |   reference_bases,
 26 |   alternate_bases,
 27 |   vt,
 28 |   case_count,
 29 |   control_count,
 30 |   allele_count,
 31 |   ref_count,
 32 |   alt_count,
 33 |   case_ref_count,
 34 |   case_alt_count,
 35 |   control_ref_count,
 36 |   control_alt_count,
 37 |   ROUND(
 38 |     (case_alt_count/case_count - control_alt_count/control_count)
 39 |     /
 40 |     SQRT(
 41 |       ((((case_alt_count+control_alt_count)/allele_count) *
 42 |           ((case_ref_count+control_ref_count)/allele_count))
 43 |         / case_count
 44 |         )
 45 |       +
 46 |       ((((case_alt_count+control_alt_count)/allele_count) *
 47 |           ((case_ref_count+control_ref_count)/allele_count))
 48 |         / control_count
 49 |         )
 50 |       )
 51 |     ,
 52 |     3)
 53 |   AS z_score
 54 | FROM (
 55 |   SELECT
 56 |     reference_name,
 57 |     start,
 58 |     end,
 59 |     reference_bases,
 60 |     alternate_bases,
 61 |     vt,
 62 |     SUM(ref_count + alt_count) AS allele_count,
 63 |     SUM(ref_count) AS ref_count,
 64 |     SUM(alt_count) AS alt_count,
 65 |     SUM(IF(TRUE = is_case, INTEGER(ref_count + alt_count), 0)) AS case_count,
 66 |     SUM(IF(FALSE = is_case, INTEGER(ref_count + alt_count), 0)) AS control_count,
 67 |     SUM(IF(TRUE = is_case, ref_count, 0)) AS case_ref_count,
 68 |     SUM(IF(TRUE = is_case, alt_count, 0)) AS case_alt_count,
 69 |     SUM(IF(FALSE = is_case, ref_count, 0)) AS control_ref_count,
 70 |     SUM(IF(FALSE = is_case, alt_count, 0)) AS control_alt_count,
 71 |   FROM (
 72 |     SELECT
 73 |       reference_name,
 74 |       start,
 75 |       ('EAS' = super_population) AS is_case,
 76 |       reference_bases,
 77 |       alternate_bases,
 78 |       END,
 79 |       vt,
 80 |       # 1000 genomes phase 1 data is bi-allelic so there is only ever a single alt
 81 |       SUM(0 = call.genotype) WITHIN RECORD AS ref_count,
 82 |       SUM(1 = call.genotype) WITHIN RECORD AS alt_count,
 83 |     FROM
 84 |       FLATTEN((
 85 |         SELECT
 86 |           reference_name,
 87 |           start,
 88 |           reference_bases,
 89 |           alternate_bases,
 90 |           END,
 91 |           vt,
 92 |           call.call_set_name,
 93 |           call.genotype,
 94 |         FROM
 95 |           [genomics-public-data:1000_genomes.variants]
 96 |         WHERE
 97 |           reference_name = '12'
 98 |           ),
 99 |         call) AS g
100 |     JOIN
101 |       [genomics-public-data:1000_genomes.sample_info] p
102 |     ON
103 |       g.call.call_set_name = p.sample
104 |       )
105 |   GROUP BY
106 |     reference_name,
107 |     start,
108 |     end,
109 |     reference_bases,
110 |     alternate_bases,
111 |     vt)
112 | HAVING
113 |   z_score >= 5.45131
114 |   OR z_score <= -5.45131
115 | ORDER BY
116 |   z_score DESC,
117 |   allele_count DESC
118 | 


--------------------------------------------------------------------------------
/platinumGenomes/README.Rmd:
--------------------------------------------------------------------------------
 1 | Platinum Genomes
 2 | ================
 3 | 
 4 | ### Additional Resources
 5 | 
 6 | There are just a handful of queries below but you will find a whole suite of
 7 | queries for the Platinum Genome dataset written as a codelab for performing
 8 | [Quality Control on Variants](https://github.com/googlegenomics/codelabs/tree/master/R/PlatinumGenomes-QC).
 9 | 
10 | * [variants table](https://bigquery.cloud.google.com/table/genomics-public-data:platinum_genomes.variants?pli=1)
11 | * [sample_info table](https://bigquery.cloud.google.com/table/google.com:biggene:platinum_genomes.sample_info)
12 | * See [Google Genomics Public Data](http://googlegenomics.readthedocs.io/en/latest/use_cases/discover_public_data/platinum_genomes.html)
13 | for provenance details for this data.
14 | 
15 | ```{r echo=FALSE, eval=FALSE}
16 | ######################[ CHANGE ME ]##################################
17 | # This codelab assumes that the current working directory is where the Rmd file resides.
18 | setwd("/YOUR/PATH/TO/bigquery-examples/platinumGenomes")
19 | 
20 | # Set the Google Cloud Platform project id under which these queries will run.
21 | project <- "YOUR-PROJECT-ID"
22 | #####################################################################
23 | 
24 | ### Install the bigrquery package.  The currently released version 0.3.0 does not yet
25 | ### have the parameter to use Standard SQL instead of Legacy SQL, so we install from github.
26 | library(devtools)
27 | install_github('rstats-db/bigrquery')
28 | ```
29 | 
30 | ```{r init, echo=FALSE, message=FALSE, warning=FALSE, comment=NA}
31 | library(bigrquery)
32 | library(ggplot2)
33 | library(scales)
34 | library(dplyr)
35 | library(testthat)
36 | DisplayAndDispatchQuery <- function(queryUri) {
37 |   # Read the SQL text (up to 1e6 characters), echo it, then run it.
38 |   sqlText <- readChar(queryUri, nchars=1e6); cat(sqlText)
39 |   query_exec(sqlText, project, use_legacy_sql = FALSE)
40 | }
41 | ```
42 | 
43 | ### SNP Annotation
44 | 
45 | Let's annotate variants in the [Illumina Platinum Genomes dataset](http://googlegenomics.readthedocs.io/en/latest/use_cases/discover_public_data/platinum_genomes.html)
46 | using Tute Genomics' table of annotations for hg19 SNPs.  Please see [Google Genomics Public Data](http://googlegenomics.readthedocs.io/en/latest/use_cases/discover_public_data/tute_genomics_public_data.html)
47 | for more detail about these annotations.
48 | 
49 | First we'll count variants by exonic functional impact:
50 | ```{r echo=FALSE, message=FALSE, warning=FALSE, comment=NA, cache=FALSE}
51 | result <- DisplayAndDispatchQuery("./sql/sample-snps-by-exonic-function.sql")
52 | ```
53 | 
54 | Results:
55 | ```{r echo=FALSE, message=FALSE, warning=FALSE, comment=NA, results="asis"}
56 | knitr::kable(head(result), digits=6)
57 | ```
58 | 
59 | Visualized:
60 | ```{r function, echo=FALSE, message=FALSE, warning=FALSE, comment=NA, fig.align="center", fig.width=12, fig.height=8}
61 | ggplot(result, aes(y=variant_count, x=ExonicFunc)) +
62 |   geom_boxplot() +
63 |   scale_y_log10(labels=comma) +
64 |   ylab("Number of variants (log scale)") +
65 |   xlab("Exonic Function") +
66 |   ggtitle("Functional impact of Platinum Genomes SNPs") +
67 |   theme(axis.text.x=element_text(angle=50, hjust=1))
68 | ```
69 | 
70 | Next we'll identify rare variants across the cohort indicated as pathogenic
71 | by [ClinVar](https://www.ncbi.nlm.nih.gov/clinvar/):
72 | ```{r echo=FALSE, message=FALSE, warning=FALSE, comment=NA, cache=FALSE}
73 | result <- DisplayAndDispatchQuery("./sql/cohort-rare-pathenogenic-snps.sql")
74 | ```
75 | 
76 | Results:
77 | ```{r echo=FALSE, message=FALSE, warning=FALSE, comment=NA, results="asis"}
78 | knitr::kable(result, digits=6)
79 | ```
80 | 
81 | And finally we'll re-run this analysis using only the variants for one specific individual:
82 | ```{r echo=FALSE, message=FALSE, warning=FALSE, comment=NA, cache=FALSE}
83 | result <- DisplayAndDispatchQuery("./sql/sample-rare-pathenogenic-snps.sql")
84 | ```
85 | 
86 | Results:
87 | ```{r echo=FALSE, message=FALSE, warning=FALSE, comment=NA, results="asis"}
88 | knitr::kable(result, digits=6)
89 | ```
90 | 


--------------------------------------------------------------------------------
/pgp/sql/gvcf_variants/allelic-frequency-chr1.sql:
--------------------------------------------------------------------------------
  1 | # Compute allelic frequency for chromosome 1 by counting the number of called
  2 | # alleles (reference-calls and variant-calls, but leave out no-calls) that 
  3 | # overlap each variant allele for which we previously counted its occurrence
  4 | # in this dataset.  This returns a large result which should be materialized to 
  5 | # a table.
  6 | #
  7 | # Note that the new BigQuery feature of user-defined JavaScript
  8 | # functions is in limited preview.  For more info, see
  9 | # https://www.youtube.com/watch?v=GrD7ymUPt3M#t=1377
 10 | SELECT
 11 |   vars.contig_name AS contig_name,
 12 |   vars.reference_bases AS reference_bases,
 13 |   vars.start_pos AS start_pos,
 14 |   vars.alternate_bases AS allele,
 15 |   alternate_allele_count,
 16 |   num_alleles_called,
 17 |   ROUND(alternate_allele_count / num_alleles_called,
 18 |     4) AS freq,
 19 | FROM (
 20 |   SELECT
 21 |     vars.contig_name,
 22 |     vars.reference_bases,
 23 |     vars.start_pos,
 24 |     vars.alternate_bases,
 25 |     alternate_allele_count,
 26 |     SUM(num_alleles_called) AS num_alleles_called,
 27 |   FROM (
 28 |     # The left hand side of our JOIN is all the call records, including
 29 |     # reference calls; only genotype values >= 0 are counted as called alleles
 30 |     SELECT
 31 |         SUM(num_alleles_called) AS num_alleles_called,
 32 |         contig_name,
 33 |         reference_bases,
 34 |         bin,
 35 |         start_pos,
 36 |         the_end,
 37 |       # This user-defined function emits one row per 5000-wide bin a record spans,
 38 |       # so the JOIN below can match on bin and avoid a huge cross product
 39 |       FROM js(
 40 |       (SELECT contig_name, reference_bases, start_pos, end_pos, END, call.genotype,
 41 |        FROM [google.com:biggene:pgp.gvcf_variants]
 42 |        WHERE contig_name = '1'),
 43 |       contig_name, reference_bases, start_pos, end_pos, END, call.genotype,
 44 |       "[{name: 'num_alleles_called', type: 'integer'},
 45 |         {name: 'contig_name', type: 'string'},
 46 |         {name: 'reference_bases', type: 'string'},
 47 |         {name: 'bin', type: 'integer'},
 48 |         {name: 'start_pos', type: 'integer'},
 49 |         {name: 'the_end', type: 'integer'}]",
 50 |        "function(r, emit) {
 51 |             var num_alleles_called = 0;
 52 |             for(var c in r.call) {
 53 |               for(var g in r.call[c].genotype) {
 54 |                 if(0 <= r.call[c].genotype[g]) {
 55 |                   num_alleles_called++;
 56 |                 }
 57 |               }
 58 |             }
 59 |             var binSize = 5000
 60 |             var startBin = Math.floor(r.start_pos / binSize);
 61 |             var theEnd = (r.END === null) ? r.end_pos : r.END;
 62 |             var endBin = Math.floor(theEnd / binSize);
 63 |             for(var bin = startBin; bin <= endBin; bin++) {
 64 |               emit({
 65 |                 num_alleles_called: num_alleles_called,
 66 |                 contig_name: r.contig_name,
 67 |                 reference_bases: r.reference_bases,
 68 |                 bin: bin,
 69 |                 start_pos: r.start_pos,
 70 |                 the_end: theEnd
 71 |               });
 72 |             }
 73 |         }")
 74 |         GROUP EACH BY
 75 |         contig_name,
 76 |         reference_bases,
 77 |         bin,
 78 |         start_pos,
 79 |         the_end
 80 |         ) AS all
 81 |   JOIN
 82 |     EACH 
 83 |     # The right hand side of our JOIN is the table of counts of alternate allele
 84 |     # values at a particular locus
 85 |     [google.com:biggene:pgp_analysis_results.gvcf_variants_allele_counts] AS vars
 86 |   ON
 87 |     vars.contig_name = all.contig_name
 88 |     AND vars.bin = all.bin
 89 |   WHERE
 90 |     # Further constrain the JOIN to calls that overlapped the first base pair
 91 |     # of this variant
 92 |     all.start_pos <= vars.start_pos
 93 |     AND all.the_end >= vars.start_pos+1
 94 |   GROUP EACH BY
 95 |     vars.contig_name,
 96 |     vars.reference_bases,
 97 |     vars.start_pos,
 98 |     vars.alternate_bases,
 99 |     alternate_allele_count
100 |     )
101 | 


--------------------------------------------------------------------------------
/sgdp/provenance/wrangle-simons-sample-attributes.R:
--------------------------------------------------------------------------------
 1 | # Copyright 2017 Google Inc. All rights reserved.
 2 | #
 3 | # Licensed under the Apache License, Version 2.0 (the "License");
 4 | # you may not use this file except in compliance with the License.
 5 | # You may obtain a copy of the License at
 6 | #
 7 | #     http://www.apache.org/licenses/LICENSE-2.0
 8 | #
 9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 | 
15 | # Wrangle sample attributes for Simons Genome Diversity Project Data.  See also:
16 | # https://www.simonsfoundation.org/life-sciences/simons-genome-diversity-project-dataset/
17 | # http://googlegenomics.readthedocs.io/en/latest/use_cases/discover_public_data/simons_foundation.html
18 | 
19 | library(testthat)
20 | library(XML)
21 | library(reshape2)
22 | library(dplyr)
23 | library(stringr)
24 | 
25 | study <- read.delim("https://www.ebi.ac.uk/ena/data/warehouse/filereport?accession=PRJEB9586&result=read_run&fields=study_accession,sample_accession,secondary_sample_accession,experiment_accession,run_accession,tax_id,scientific_name,instrument_model,library_layout,fastq_ftp,fastq_galaxy,submitted_ftp,submitted_galaxy,sra_ftp,sra_galaxy,cram_index_ftp,cram_index_galaxy&download=txt",
26 |                     header=TRUE)
27 | # Collapse the per-run report to distinct sample accessions and check the expected count of 279.
28 | accessions <- unique(study$sample_accession)
29 | expect_that(length(accessions), equals(279))
30 | 
31 | simons_attributes <- function(x) {
32 |   # Fetch the ENA XML record for accession x and return its sample attributes tagged with era_id.
33 |   ena_url <- paste0("http://www.ebi.ac.uk/ena/data/view/", x, "%26display%3Dxml")
34 |   attribute_frames <- xpathApply(xmlParse(ena_url), "//ROOT/SAMPLE/SAMPLE_ATTRIBUTES", xmlToDataFrame)
35 |   invisible(mutate(attribute_frames[[1]], era_id=x))
36 | }
37 | 
38 | all <- do.call(rbind, lapply(accessions, simons_attributes))
39 | expect_that(ncol(all), equals(3))
40 | 
41 | # Fix attribute name "Sex" versus "sex" by lower casing all attribute names.
42 | all$TAG <- tolower(all$TAG)
43 | 
44 | # Fix attribute name "geographic location (country and/or sea)" versus "country".
45 | all$TAG <- gsub("geographic location (country and/or sea)",
46 |                 "country",
47 |                 all$TAG,
48 |                 fixed=TRUE)
49 | 
50 | # Reshape long attribute list into wide format.
51 | wide <- reshape(all, idvar = "era_id", timevar="TAG", direction = "wide")
52 | expect_that(nrow(wide), equals(279))
53 | 
54 | # Tidy up the column names: strip only the literal leading "VALUE." prefix, then replace "-" and " " with "_".
55 | colnames(wide) <- sub("^VALUE\\.", "", colnames(wide))
56 | colnames(wide) <- gsub("-", "_", colnames(wide))
57 | colnames(wide) <- gsub(" ", "_", colnames(wide))
58 | 
59 | # In two cases the library name instead of the Illumina ID is what ended up in the VCF file.
60 | # SS6004478 == LP6005442-DNA_A09 per http://www.ebi.ac.uk/ena/data/view/SAMEA3302719
61 | # SS6004477 == LP6005442-DNA_B09 per http://www.ebi.ac.uk/ena/data/view/SAMEA3302681
62 | 
63 | # There is one final case where the Illumina ID does not match the id in any VCF, so by
64 | # process of elimination, remapping that one too.
65 | # LP6005443-DNA_C01 == LP6005441-DNA_A09 per process of elimination
66 | 
67 | # Add a new column holding repaired values.
68 | wide_remapped <- mutate(wide,
69 |                         id_from_vcf=illumina_id)
70 | wide_remapped$id_from_vcf <- gsub("LP6005442-DNA_A09",
71 |                                   "SS6004478",
72 |                                   wide_remapped$id_from_vcf,
73 |                                   fixed=TRUE)
74 | wide_remapped$id_from_vcf <- gsub("LP6005442-DNA_B09",
75 |                                   "SS6004477",
76 |                                   wide_remapped$id_from_vcf,
77 |                                   fixed=TRUE)
78 | wide_remapped$id_from_vcf <- gsub("LP6005441-DNA_A09",
79 |                                   "LP6005443-DNA_C01",
80 |                                   wide_remapped$id_from_vcf,
81 |                                   fixed=TRUE)
82 | 
83 | write.csv(wide_remapped, "simons-sample-attributes.csv", row.names=FALSE, na="")
84 | 
85 | # Then load the resulting file to BigQuery via:
86 | # bq load --autodetect THE_DATASET.THE_TABLE simons-sample-attributes.csv
87 | 


--------------------------------------------------------------------------------
/1000genomes/sql/gwas-pattern-chi-squared-test.sql:
--------------------------------------------------------------------------------
  1 | # An example of a pattern one might use for GWAS queries upon 1,000
  2 | # Genomes variants.  It specifically examines differences in allelic
  3 | # frequency for variants upon chromosome 12 between the EAS super
  4 | # population versus all other individuals, returning a ranked list of
  5 | # variants by decreasing variation between groups.  Note that this
  6 | # particular query below is naive in many, many respects and is merely
  7 | # meant as an over-simplified example that might help domain experts
  8 | # translate their scientifically correct data filtering and
  9 | # statistical methods to BigQuery.  Feedback to improve this query is
 10 | # most welcome!
 11 | 
 12 | # http://www.statisticslectures.com/topics/goodnessoffit/
 13 | # http://homes.cs.washington.edu/~suinlee/genome560/lecture7.pdf
 14 | # http://bioinformatics.ca/files/Statistics/Statistics_Day2-Module8.pdf
 15 | # Chi-squared critical value for df=1, p-value=5*10^-8 is 29.71679
 16 | # > qchisq(1 - 5e-08, df=1)
 17 | #   [1] 29.71679
 18 | 
 19 | SELECT
 20 |   reference_name,
 21 |   start,
 22 |   end,
 23 |   reference_bases,
 24 |   alternate_bases,
 25 |   vt,
 26 |   case_count,
 27 |   control_count,
 28 |   allele_count,
 29 |   ref_count,
 30 |   alt_count,
 31 |   case_ref_count,
 32 |   case_alt_count,
 33 |   control_ref_count,
 34 |   control_alt_count,
 35 |   # Chi-squared score over the four ref/alt x case/control cells, each with Yates's continuity correction (the "- 0.5"); see https://en.wikipedia.org/wiki/Yates%27s_correction_for_continuity
 36 |   ROUND(
 37 |     POW(ABS(case_ref_count - (ref_count/allele_count)*case_count) - 0.5,
 38 |       2)/((ref_count/allele_count)*case_count) +
 39 |     POW(ABS(control_ref_count - (ref_count/allele_count)*control_count) - 0.5,
 40 |       2)/((ref_count/allele_count)*control_count) +
 41 |     POW(ABS(case_alt_count - (alt_count/allele_count)*case_count) - 0.5,
 42 |       2)/((alt_count/allele_count)*case_count) +
 43 |     POW(ABS(control_alt_count - (alt_count/allele_count)*control_count) - 0.5,
 44 |       2)/((alt_count/allele_count)*control_count),
 45 |     3) AS chi_squared_score
 46 | FROM (
 47 |   SELECT
 48 |     reference_name,
 49 |     start,
 50 |     end,
 51 |     reference_bases,
 52 |     alternate_bases,
 53 |     vt,
 54 |     SUM(ref_count + alt_count) AS allele_count,
 55 |     SUM(ref_count) AS ref_count,
 56 |     SUM(alt_count) AS alt_count,
 57 |     SUM(IF(TRUE = is_case, INTEGER(ref_count + alt_count), 0)) AS case_count,
 58 |     SUM(IF(FALSE = is_case, INTEGER(ref_count + alt_count), 0)) AS control_count,
 59 |     SUM(IF(TRUE = is_case, ref_count, 0)) AS case_ref_count,
 60 |     SUM(IF(TRUE = is_case, alt_count, 0)) AS case_alt_count,
 61 |     SUM(IF(FALSE = is_case, ref_count, 0)) AS control_ref_count,
 62 |     SUM(IF(FALSE = is_case, alt_count, 0)) AS control_alt_count,
 63 |   FROM (
 64 |     SELECT
 65 |       reference_name,
 66 |       start,
 67 |       ('EAS' = super_population) AS is_case,
 68 |       reference_bases,
 69 |       alternate_bases,
 70 |       END,
 71 |       vt,
 72 |       # 1000 genomes phase 1 data is bi-allelic so there is only ever a single alt; genotype 0 counts as ref and 1 as alt
 73 |       SUM(0 = call.genotype) WITHIN RECORD AS ref_count,
 74 |       SUM(1 = call.genotype) WITHIN RECORD AS alt_count,
 75 |     FROM
 76 |       FLATTEN((
 77 |         SELECT
 78 |           reference_name,
 79 |           start,
 80 |           reference_bases,
 81 |           alternate_bases,
 82 |           END,
 83 |           vt,
 84 |           call.call_set_name,
 85 |           call.genotype,
 86 |         FROM
 87 |           [genomics-public-data:1000_genomes.variants]
 88 |         WHERE
 89 |           reference_name = '12'
 90 |           ),
 91 |         call) AS g
 92 |     JOIN
 93 |       [genomics-public-data:1000_genomes.sample_info] p
 94 |     ON
 95 |       g.call.call_set_name = p.sample
 96 |       )
 97 |   GROUP BY
 98 |     reference_name,
 99 |     start,
100 |     end,
101 |     reference_bases,
102 |     alternate_bases,
103 |     vt)
104 | WHERE
105 |   # For chi-squared, the expected count must be at least 5 in each of the four cells
106 |   (ref_count/allele_count)*case_count >= 5.0
107 |   AND (ref_count/allele_count)*control_count >= 5.0
108 |   AND (alt_count/allele_count)*case_count >= 5.0
109 |   AND (alt_count/allele_count)*control_count >= 5.0
110 | HAVING
111 |   # Keep only scores at or above the chi-squared critical value for df=1, p-value=5*10^-8, which is 29.71679
112 |   chi_squared_score >= 29.71679
113 | ORDER BY
114 |   chi_squared_score DESC,
115 |   allele_count DESC
116 | 


--------------------------------------------------------------------------------
/1000genomes/sql/hardy-weinberg-equilibrium.sql:
--------------------------------------------------------------------------------
  1 | # An example of a pattern one might use for Hardy-Weinberg Equilibrium
  2 | # queries upon 1,000 Genomes variants.  It is specifically computing
  3 | # the Hardy-Weinberg Equilibrium for the variants found in BRCA1 and
  4 | # then computing the chi-squared score for the observed versus
  5 | # expected counts for the calls.
  6 | 
  7 | # http://scienceprimer.com/hardy-weinberg-equilibrium-calculator
  8 | # http://www.nfstc.org/pdi/Subject07/pdi_s07_m01_02.htm
  9 | # http://www.nfstc.org/pdi/Subject07/pdi_s07_m01_02.p.htm
 10 | 
 11 | SELECT
 12 |   reference_name,
 13 |   start,
 14 |   END,
 15 |   reference_bases,
 16 |   alt,
 17 |   vt,
 18 |   ROUND(POW(hom_ref_count - expected_hom_ref_count,
 19 |       2)/expected_hom_ref_count +
 20 |     POW(hom_alt_count - expected_hom_alt_count,
 21 |       2)/expected_hom_alt_count +
 22 |     POW(het_count - expected_het_count,
 23 |       2)/expected_het_count,
 24 |     3) AS chi_squared_score,
 25 |   total_count,
 26 |   hom_ref_count,
 27 |   ROUND(expected_hom_ref_count,
 28 |     2) AS expected_hom_ref_count,
 29 |   het_count,
 30 |   ROUND(expected_het_count,
 31 |     2) AS expected_het_count,
 32 |   hom_alt_count,
 33 |   ROUND(expected_hom_alt_count,
 34 |     2) AS expected_hom_alt_count,
 35 |   ROUND(alt_freq,
 36 |     4) AS alt_freq,
 37 |   alt_freq_from_1KG,
 38 | FROM (
 39 |   SELECT
 40 |     reference_name,
 41 |     start,
 42 |     END,
 43 |     reference_bases,
 44 |     alt,
 45 |     vt,
 46 |     alt_freq_from_1KG,
 47 |     hom_ref_freq + (.5 * het_freq) AS hw_ref_freq,
 48 |     1 - (hom_ref_freq + (.5 * het_freq)) AS alt_freq,
 49 |     POW(hom_ref_freq + (.5 * het_freq),
 50 |       2) * total_count AS expected_hom_ref_count,
 51 |     POW(1 - (hom_ref_freq + (.5 * het_freq)),
 52 |       2) * total_count AS expected_hom_alt_count,
 53 |     2 * (hom_ref_freq + (.5 * het_freq))
 54 |     * (1 - (hom_ref_freq + (.5 * het_freq)))
 55 |     * total_count AS expected_het_count,
 56 |     total_count,
 57 |     hom_ref_count,
 58 |     het_count,
 59 |     hom_alt_count,
 60 |     hom_ref_freq,
 61 |     het_freq,
 62 |     hom_alt_freq,
 63 |   FROM (
 64 |     SELECT
 65 |       reference_name,
 66 |       start,
 67 |       END,
 68 |       reference_bases,
 69 |       alt,
 70 |       vt,
 71 |       alt_freq_from_1KG,
 72 |       # 1000 genomes data is bi-allelic so there is only ever a single alt
 73 |       # We also exclude calls where one or both alleles were not called (-1)
 74 |       SUM((0 = first_allele
 75 |           OR 1 = first_allele)
 76 |         AND (0 = second_allele
 77 |           OR 1 = second_allele)) WITHIN RECORD AS total_count,
 78 |       SUM(0 = first_allele
 79 |         AND 0 = second_allele) WITHIN RECORD AS hom_ref_count,
 80 |       SUM((0 = first_allele
 81 |           AND 1 = second_allele)
 82 |         OR (1 = first_allele
 83 |           AND 0 = second_allele)) WITHIN RECORD AS het_count,
 84 |       SUM(1 = first_allele
 85 |         AND 1 = second_allele) WITHIN RECORD AS hom_alt_count,
 86 |       SUM(0 = first_allele
 87 |         AND 0 = second_allele) / SUM((0 = first_allele
 88 |           OR 1 = first_allele)
 89 |         AND (0 = second_allele
 90 |           OR 1 = second_allele)) WITHIN RECORD AS hom_ref_freq,
 91 |       SUM((0 = first_allele
 92 |           AND 1 = second_allele)
 93 |         OR (1 = first_allele
 94 |           AND 0 = second_allele)) / SUM((0 = first_allele
 95 |           OR 1 = first_allele)
 96 |         AND (0 = second_allele
 97 |           OR 1 = second_allele)) WITHIN RECORD AS het_freq,
 98 |       SUM(1 = first_allele
 99 |         AND 1 = second_allele) / SUM((0 = first_allele
100 |           OR 1 = first_allele)
101 |         AND (0 = second_allele
102 |           OR 1 = second_allele)) WITHIN RECORD AS hom_alt_freq,
103 |     FROM (
104 |       SELECT
105 |         reference_name,
106 |         start,
107 |         END,
108 |         reference_bases,
109 |         GROUP_CONCAT(alternate_bases) WITHIN RECORD AS alt,
110 |         vt,
111 |         # Also return the pre-computed allelic frequency (the af field) to help us check our work
112 |         af AS alt_freq_from_1KG,
113 |         NTH(1,
114 |           call.genotype) WITHIN call AS first_allele,
115 |         NTH(2,
116 |           call.genotype) WITHIN call AS second_allele,
117 |       FROM
118 |         [genomics-public-data:1000_genomes.variants]
119 |       WHERE
120 |         reference_name = '17'
121 |         AND start BETWEEN 41196311
122 |         AND 41277499
123 |         )))
124 | ORDER BY
125 |   reference_name,
126 |   start
127 | 


--------------------------------------------------------------------------------
/1000genomes/data-stories/reproducing-hardy-weinberg-equilibrium/README.Rmd:
--------------------------------------------------------------------------------
  1 | <!-- R Markdown Documentation, DO NOT EDIT THE PLAIN MARKDOWN VERSION OF THIS FILE -->
  2 | 
  3 | <!-- Copyright 2014 Google Inc. All rights reserved. -->
  4 | 
  5 | <!-- Licensed under the Apache License, Version 2.0 (the "License"); -->
  6 | <!-- you may not use this file except in compliance with the License. -->
  7 | <!-- You may obtain a copy of the License at -->
  8 | 
  9 | <!--     http://www.apache.org/licenses/LICENSE-2.0 -->
 10 | 
 11 | <!-- Unless required by applicable law or agreed to in writing, software -->
 12 | <!-- distributed under the License is distributed on an "AS IS" BASIS, -->
 13 | <!-- WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -->
 14 | <!-- See the License for the specific language governing permissions and -->
 15 | <!-- limitations under the License. -->
 16 | 
 17 | Reproducing the Hardy-Weinberg Equilibrium test for BRCA1 in 1,000 Genomes
 18 | ========================================================
 19 | 
 20 | Provenance for the expected result
 21 | ---------------------------
 22 | First get a slice of the VCF containing just the variants within BRCA1:
 23 | ```
 24 | vcftools --gzvcf ALL.chr17.phase1_release_v3.20101123.snps_indels_svs.genotypes.vcf.gz --chr 17 --from-bp 41196312 --to-bp 41277500 --out brca1  --recode-INFO-all --recode
 25 | 
 26 | VCFtools - v0.1.11
 27 | (C) Adam Auton 2009
 28 | 
 29 | Parameters as interpreted:
 30 |   --gzvcf ALL.chr17.phase1_release_v3.20101123.snps_indels_svs.genotypes.vcf.gz
 31 | 	--chr 17
 32 | 	--to-bp 41277500
 33 | 	--recode-INFO-all
 34 | 	--out brca1
 35 | 	--recode
 36 | 	--from-bp 41196312
 37 | 
 38 | Using zlib version: 1.2.3.4
 39 | Versions of zlib >= 1.2.4 will be *much* faster when reading zipped VCF files.
 40 | Reading Index file.
 41 | File contains 1046733 entries and 1092 individuals.
 42 | Filtering by chromosome.
 43 | 	Chromosome: 17
 44 | Keeping 1046733 entries on specified chromosomes.
 45 | Applying Required Filters.
 46 | Filtering sites by chromosome and/or position
 47 | After filtering, kept 1092 out of 1092 Individuals
 48 | After filtering, kept 879 out of a possible 1046733 Sites
 49 | Outputting VCF file... Done
 50 | Run Time = 200.00 seconds
 51 | ```
 52 | Then run vcftools:
 53 | ```
 54 | vcftools --vcf brca1.recode.vcf --hardy
 55 | ```
 56 | Producing output file: [out.hwe](./vcftools-output/out.hwe)
 57 | 
 58 | See the [details](http://vcftools.sourceforge.net/man_latest.html#OUTPUT%20OPTIONS) about the --hardy option for vcftools for more information about the calculation.
 59 | 
 60 | Reproducing the result via BigQuery
 61 | ------------------------------------
 62 | [BRCA1](http://www.genecards.org/cgi-bin/carddisp.pl?gene=BRCA1) resides on chromosome 17 from position 41196312 to 41277500.
 63 | 
 64 | ```{r init, echo=FALSE, message=FALSE, warning=FALSE, comment=NA}
 65 | require(bigrquery)
 66 | require(ggplot2)
 67 | require(dplyr)
 68 | require(xtable)
 69 | require(testthat)
 70 | project <- "google.com:biggene" # put your projectID here
 71 | DisplayAndDispatchQuery <- function(queryUri) {
 72 |   # Read the SQL text from queryUri, echo it into the document, then run it against `project`.
 73 |   sqlText <- readChar(queryUri, nchars=1e6); cat(sqlText)
 74 |   query_exec(sqlText, project)
 75 | }
 76 | ```
 77 | 
 78 | Let’s compute the Hardy-Weinberg Equilibrium test for each variant within BRCA1:
 79 | ```{r echo=FALSE, message=FALSE, warning=FALSE, comment=NA}
 80 | result <- DisplayAndDispatchQuery("../../sql/hardy-weinberg-equilibrium.sql")
 81 | ```
 82 | Number of rows returned by this query: `r nrow(result)`.
 83 | 
 84 | Displaying the first few rows of our result:
 85 | ```{r echo=FALSE, message=FALSE, warning=FALSE, comment=NA, results="asis"}
 86 | print(head(xtable(result), 10), type="html", include.rownames=F)
 87 | ```
 88 | and the last few rows:
 89 | ```{r echo=FALSE, message=FALSE, warning=FALSE, comment=NA, results="asis"}
 90 | print(tail(xtable(result), 10), type="html", include.rownames=F)
 91 | ```
 92 | 
 93 | And do our results match the precomputed values resident in the AF INFO field?
 94 | ```{r}
 95 | print(expect_equal(object=result$alt_freq,
 96 |                    expected=result$alt_freq_from_1KG,
 97 |                    tolerance=0.005,
 98 |                    scale=1))
 99 | ```
100 | We can see from the results that when the computed frequency values in column alt_freq are rounded, they exactly match the alternate allele frequencies as reported in the AF INFO field from the 1,000 Genomes VCF data.
101 | 
102 | Most importantly, comparing these to the results in [out.hwe](./vcftools-output/out.hwe) from vcftools we see that the test scores match.
103 | 


--------------------------------------------------------------------------------
/pgp/sql/schema-comparisons/missingness-udf.sql:
--------------------------------------------------------------------------------
  1 | # Missingness rate summarized per chromosome.  To see it per variant, materialize
  2 | # the large result from the inner query to a table.
  3 | #
  4 | # Note that the new BigQuery feature of user-defined javascript
  5 | # functions is in limited preview.  For more info, see
  6 | # https://www.youtube.com/watch?v=GrD7ymUPt3M#t=1377
  7 | SELECT
  8 |   contig_name,
  9 |   MIN(missingness_rate) AS min_missingness,
 10 |   AVG(missingness_rate) AS avg_missingness,
 11 |   MAX(missingness_rate) AS max_missingness,
 12 |   STDDEV(missingness_rate) AS stddev_missingness,
 13 | FROM (
 14 |   SELECT
 15 |     vars.contig_name AS contig_name,
 16 |     vars.start_pos AS start_pos,
 17 |     reference_bases,
 18 |     variant_called_count,
 19 |     SUM(refs.called_count) AS reference_called_count,
 20 |     variant_called_count + SUM(refs.called_count) AS num_alleles_called_for_position,
 21 |     1 - ((variant_called_count + SUM(refs.called_count))/(172*2)) AS missingness_rate  # NOTE(review): 172*2 is presumably (number of samples) x (2 alleles) - confirm
 22 |   FROM (
 23 |     # JOIN our variant sample counts with the corresponding reference-matching blocks
 24 |     SELECT
 25 |       vars.contig_name,
 26 |       vars.start_pos,
 27 |       refs.start_pos,
 28 |       vars.end_pos,
 29 |       refs.the_end,
 30 |       reference_bases,
 31 |       variant_called_count,
 32 |       refs.called_count
 33 |     FROM js(
 34 |       # Constrain the left hand side of the JOIN to reference-matching blocks
 35 |       (SELECT
 36 |          contig_name,
 37 |          start_pos,
 38 |          END,
 39 |          IF(alternate_bases IS NULL,
 40 |            FALSE,
 41 |            TRUE) AS is_variant_call,
 42 |          SUM(call.genotype >= 0) WITHIN RECORD AS called_count,
 43 |        FROM
 44 |          [google.com:biggene:pgp.gvcf_variants]
 45 |        HAVING
 46 |          is_variant_call = FALSE),
 47 |       contig_name, start_pos, END, called_count,
 48 |       # This User-defined function helps us reduce the size of the cross product
 49 |       # considered by this JOIN thereby greatly speeding up the query.
 50 |       "[{name: 'contig_name', type: 'string'},
 51 |         {name: 'start_pos', type: 'integer'},
 52 |         {name: 'the_end', type: 'integer'},
 53 |         {name: 'bin', type: 'integer'},
 54 |         {name: 'called_count', type: 'integer'}]",
 55 |        "function(r, emit) {
 56 |             var binSize = 5000
 57 |             var startBin = Math.floor(r.start_pos / binSize);
 58 |             var endBin = Math.floor(r.END / binSize);
 59 |             // Since a reference-matching block can span multiple bins, emit
 60 |             // a record for each bin.
 61 |             for(var bin = startBin; bin <= endBin; bin++) {
 62 |               emit({
 63 |                 contig_name: r.contig_name,
 64 |                 start_pos: r.start_pos,
 65 |                 the_end: r.END,
 66 |                 bin: bin,
 67 |                 called_count: r.called_count
 68 |               });
 69 |             }
 70 |         }") AS refs
 71 |     JOIN EACH (
 72 |       # Constrain the right hand side of the JOIN to variants
 73 |       # Group our variant sample counts together since a single SNP may be in more than
 74 |       # one row due to 1/2 genotypes
 75 |       SELECT
 76 |         contig_name,
 77 |         start_pos,
 78 |         end_pos,
 79 |         INTEGER(FLOOR(start_pos / 5000)) AS bin,
 80 |         reference_bases,
 81 |         SUM(called_count) AS variant_called_count,
 82 |       FROM (
 83 |         # Limit the query to SNPs (reference and alternate alleles of length 1)
 84 |         SELECT
 85 |           contig_name,
 86 |           start_pos,
 87 |           end_pos,
 88 |           reference_bases,
 89 |           LENGTH(reference_bases) AS ref_len,
 90 |           MIN(LENGTH(alternate_bases)) WITHIN RECORD AS alt_len,
 91 |           IF(alternate_bases IS NULL,
 92 |             FALSE,
 93 |             TRUE) AS is_variant_call,
 94 |           SUM(call.genotype >= 0) WITHIN RECORD AS called_count,
 95 |         FROM
 96 |           [google.com:biggene:pgp.gvcf_variants]
 97 |         HAVING
 98 |           ref_len = 1
 99 |           AND alt_len = 1
100 |           AND is_variant_call)
101 |       GROUP EACH BY
102 |         contig_name,
103 |         start_pos,
104 |         end_pos,
105 |         bin,
106 |         reference_bases) AS vars
107 |     # The JOIN criteria are complicated since we are trying to see if a SNP overlaps an interval
108 |     ON
109 |       vars.contig_name = refs.contig_name
110 |       AND vars.bin = refs.bin
111 |     WHERE
112 |       refs.start_pos <= vars.start_pos
113 |       AND refs.the_end >= vars.end_pos
114 |       )
115 |   GROUP EACH BY
116 |     contig_name,
117 |     start_pos,
118 |     reference_bases,
119 |     variant_called_count
120 |     )
121 | GROUP BY
122 |   contig_name
123 | ORDER BY
124 |   contig_name


--------------------------------------------------------------------------------
/pgp/sql/cgi_variants/allelic-frequency-brca1.sql:
--------------------------------------------------------------------------------
  1 | # The following query computes the allelic frequency for BRCA1 variants in the
  2 | # PGP dataset.
  3 | #
  4 | # Note that the new BigQuery feature of user-defined javascript
  5 | # functions is in limited preview.  For more info, see
  6 | # https://www.youtube.com/watch?v=GrD7ymUPt3M#t=1377
  7 | SELECT
  8 |   vars.chromosome AS chromosome,
  9 |   vars.reference AS reference,
 10 |   vars.locusBegin AS locusBegin,
 11 |   vars.locusEnd AS locusEnd,
 12 |   vars.allele AS allele,
 13 |   alternate_allele_count,
 14 |   num_samples_called,
 15 |   ROUND(alternate_allele_count / (2*num_samples_called),
 16 |     4) AS freq,
 17 | FROM (
 18 |   SELECT
 19 |     vars.chromosome,
 20 |     vars.reference,
 21 |     vars.locusBegin,
 22 |     vars.locusEnd,
 23 |     vars.allele,
 24 |     alternate_allele_count,
 25 |     SUM(num_samples) AS num_samples_called
 26 |   FROM (
 27 |     # The left hand side of our JOIN is the counts of alternate allele values at
 28 |     # a particular locus, summed over the allele1Seq and allele2Seq columns
 29 |     SELECT
 30 |       chromosome,
 31 |       reference,
 32 |       INTEGER(FLOOR(locusBegin / 5000)) AS bin,
 33 |       locusBegin,
 34 |       locusEnd,
 35 |       allele,
 36 |       SUM(cnt) AS alternate_allele_count,
 37 |     FROM (
 38 |       SELECT
 39 |         chromosome,
 40 |         reference,
 41 |         locusBegin,
 42 |         locusEnd,
 43 |         allele1Seq AS allele,
 44 |         COUNT(1) AS cnt
 45 |       FROM
 46 |         [google.com:biggene:pgp.cgi_variants]
 47 |       WHERE
 48 |         chromosome = 'chr17'
 49 |         AND locusBegin BETWEEN 41196311
 50 |         AND 41277499
 51 |         AND (reference != '=' OR reference IS NULL)
 52 |         AND allele1Seq != '?'
 53 |         AND (reference != allele1Seq OR reference IS NULL)
 54 |       GROUP BY
 55 |         chromosome,
 56 |         reference,
 57 |         locusBegin,
 58 |         locusEnd,
 59 |         allele),
 60 |       (
 61 |       SELECT
 62 |         chromosome,
 63 |         reference,
 64 |         locusBegin,
 65 |         locusEnd,
 66 |         allele2Seq AS allele,
 67 |         COUNT(1) AS cnt
 68 |       FROM
 69 |         [google.com:biggene:pgp.cgi_variants]
 70 |       WHERE
 71 |         chromosome = 'chr17'
 72 |         AND locusBegin BETWEEN 41196311
 73 |         AND 41277499
 74 |         AND (reference != '=' OR reference IS NULL)
 75 |         AND allele2Seq != '?'
 76 |         AND (reference != allele2Seq OR reference IS NULL)
 77 |       GROUP BY
 78 |         chromosome,
 79 |         reference,
 80 |         locusBegin,
 81 |         locusEnd,
 82 |         allele)
 83 |     GROUP BY
 84 |       chromosome,
 85 |       reference,
 86 |       bin,
 87 |       locusBegin,
 88 |       locusEnd,
 89 |       allele) AS vars
 90 |   JOIN
 91 |     EACH (
 92 |     # The right hand side of our JOIN is all the calls, including
 93 |     # reference calls and no-calls
 94 |     SELECT
 95 |       num_samples,
 96 |       chromosome,
 97 |       bin,
 98 |       locusBegin,
 99 |       locusEnd
100 |     FROM (
101 |       SELECT
102 |         COUNT(sample_id) AS num_samples,
103 |         chromosome,
104 |         reference,
105 |         bin,
106 |         locusBegin,
107 |         locusEnd
108 |       # This User-defined function emits one row per 5000-position bin that a call spans, which reduces
109 |       # the size of the cross product considered by this JOIN thereby greatly speeding up the query
110 |       FROM js(
111 |       (SELECT sample_id, chromosome, reference, locusBegin, locusEnd,
112 |        FROM [google.com:biggene:pgp.cgi_variants]
113 |        WHERE chromosome = 'chr17'),
114 |       sample_id, chromosome, reference, locusBegin, locusEnd,
115 |       "[{name: 'sample_id', type: 'string'},
116 |         {name: 'chromosome', type: 'string'},
117 |         {name: 'reference', type: 'string'},
118 |         {name: 'bin', type: 'integer'},
119 |         {name: 'locusBegin', type: 'integer'},
120 |         {name: 'locusEnd', type: 'integer'}]",
121 |        "function(r, emit) {
122 |             var binSize = 5000
123 |             var startBin = Math.floor(r.locusBegin / binSize);
124 |             var endBin = Math.floor(r.locusEnd / binSize);
125 |             for(var bin = startBin; bin <= endBin; bin++) {
126 |               emit({
127 |                 sample_id: r.sample_id,
128 |                 chromosome: r.chromosome,
129 |                 reference: r.reference,
130 |                 bin: bin,
131 |                 locusBegin: r.locusBegin,
132 |                 locusEnd: r.locusEnd,
133 |               });
134 |             }
135 |         }")
136 |         GROUP EACH BY
137 |         chromosome,
138 |         reference,
139 |         bin,
140 |         locusBegin,
141 |         locusEnd
142 |         )) AS all
143 |   ON
144 |     vars.chromosome = all.chromosome
145 |     AND vars.bin = all.bin
146 |   WHERE
147 |     # Further constrain the JOIN to calls that overlapped the first base pair
148 |     # of this variant
149 |     all.locusBegin <= vars.locusBegin
150 |     AND all.locusEnd >= vars.locusBegin+1
151 |   GROUP BY
152 |     vars.chromosome,
153 |     vars.reference,
154 |     vars.locusBegin,
155 |     vars.locusEnd,
156 |     vars.allele,
157 |     alternate_allele_count
158 |     )
159 | ORDER BY
160 |   chromosome,
161 |   locusBegin
162 | 


--------------------------------------------------------------------------------
/pgp/provenance/cgi-ref-blocks-mapper.py:
--------------------------------------------------------------------------------
  1 | #!/usr/bin/env python
  2 | 
  3 | # Copyright 2014 Google Inc. All Rights Reserved.
  4 | #
  5 | # Licensed under the Apache License, Version 2.0 (the "License");
  6 | # you may not use this file except in compliance with the License.
  7 | # You may obtain a copy of the License at
  8 | #
  9 | #     http://www.apache.org/licenses/LICENSE-2.0
 10 | #
 11 | # Unless required by applicable law or agreed to in writing, software
 12 | # distributed under the License is distributed on an "AS IS" BASIS,
 13 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 14 | # See the License for the specific language governing permissions and
 15 | # limitations under the License.
 16 | 
 17 | """Extract reference-matching records from CGI data and convert to VCF.
 18 | 
 19 | Assumptions:
 20 | - one sample per input file
 21 | 
 22 | 
 23 | This script can be run standalone:
 24 |    cat masterVarBeta-GS000016446-ASM.tsv | ./cgi-ref-blocks-mapper.py
 25 | 
 26 | Or via the debugger:
 27 |    python -mpdb ./cgi-ref-blocks-mapper.py masterVarBeta-GS000016446-ASM.tsv
 28 | 
 29 | To have the sample id correctly parsed when input is from stdin, set the 
 30 | environment variable that Hadoop would set:
 31 |    export map_input_file=./huDBF9DD/masterVarBeta-GS000016446-ASM.tsv.bz2
 32 |    bzcat ./huDBF9DD/masterVarBeta-GS000016446-ASM.tsv.bz2 | ./cgi-ref-blocks-mapper.py
 33 | 
 34 | To have the sample id correctly parsed when input is from a file, ensure that it
 35 | is in the file path:
 36 |    python -mpdb ./cgi-ref-blocks-mapper.py ./huDBF9DD/masterVarBeta-GS000016446-ASM.tsv
 37 | 
 38 | It should be run as a mapper-only Hadoop Streaming job:
 39 |   hadoop jar /path/to/your/hadoop-streaming-*.jar \
 40 |   -libjars /home/deflaux/custom.jar \
 41 |   -outputformat com.custom.CustomMultiOutputFormat \
 42 |   -mapper cgi-ref-blocks-mapper.py -file cgi-ref-blocks-mapper.py \
 43 |   --numReduceTasks 0 -input inputpath -output outputpath
 44 | 
 45 | Notice that there is a special output format to put the VCF header
 46 | back into the output files including the specific sample id.
 47 | 
 48 | See also https://cloud.google.com/hadoop/ and
 49 | http://stackoverflow.com/questions/18541503/multiple-output-files-for-hadoop-streaming-with-python-mapper
 50 | 
 51 | """
 52 | 
 53 | import os
 54 | import re
 55 | import sys
 56 | 
 57 | ### Constants
 58 | INPUT_FILE_KEY = "map_input_file"
 59 | SAMPLE_ID_PATTERN = "/(hu[A-F0-9]{6})/"
 60 | # This genome was sequenced twice, this is the path of the older of the two
 61 | DUPLICATE_GENOME = "gs://pgp-harvard-data-public/hu34D5B9/GS000012763-DID/GS000010327-ASM/GS01173-DNA_C07/ASM/masterVarBeta-GS000010327-ASM.tsv.bz2"
 62 | # These genomes did not successfully get converted to VCF by cgatools mkvcf
 63 | MKVCF_FAILED_GENOMES = ["huEDF7DA", "hu34D5B9"]
 64 | 
 65 | # CGI masterVar field indices
 66 | CHROMOSOME = 2
 67 | LOCUS_BEGIN = 3
 68 | LOCUS_END = 4
 69 | REFERENCE = 7
 70 | ALLELE1SEQ = 8
 71 | ALLELE2SEQ = 9
 72 | 
 73 | 
 74 | def main():
 75 |   """Entry point to the script."""
 76 | 
 77 |   sample_id = None
 78 |   sample_id_re = re.compile(SAMPLE_ID_PATTERN)
 79 | 
 80 |   # Basic parsing of command line arguments to allow a filename
 81 |   # to be passed when running this code in the debugger.
 82 |   path = None
 83 |   file_handle = sys.stdin
 84 |   if 2 <= len(sys.argv):
 85 |     path = sys.argv[1]
 86 |     file_handle = open(path, "r")
 87 |   elif INPUT_FILE_KEY in os.environ:
 88 |     path = os.environ[INPUT_FILE_KEY]
 89 |     print >> sys.stderr, path
 90 | 
 91 |   if path is not None:
 92 |     match = sample_id_re.search(path)
 93 |     if match:
 94 |       sample_id = match.group(1)
 95 | 
 96 |   line = file_handle.readline()
 97 |   while line:
 98 |     line = line.rstrip("\n")
 99 | 
100 |     if DUPLICATE_GENOME == path:
101 |       # hu34D5B9 was sequenced twice, skip the older genome
102 |       pass
103 |     elif sample_id in MKVCF_FAILED_GENOMES:
104 |       # Don't bother extracting ref-matching blocks for the genomes for which
105 |       # we were unable to run cgatools mkvcf
106 |       pass
107 |     elif not line:
108 |       # This is a blank line, skip it
109 |       pass
110 |     elif "#" == line[0]:
111 |       # This is a header line, skip it
112 |       pass
113 |     elif ">" == line[0]:
114 |       # This is the column header line, skip it
115 |       pass
116 |     else:
117 |       fields = line.split("\t")
118 |       if ("=" == fields[REFERENCE] and "=" == fields[ALLELE1SEQ]
119 |           and ("=" == fields[ALLELE2SEQ] or "" == fields[ALLELE2SEQ])):
120 |         # This is a reference-matching record, emit it
121 |         contig = fields[CHROMOSOME].replace("chr", "", 1)
122 |         start_pos = int(fields[LOCUS_BEGIN]) + 1
123 |         end = int(fields[LOCUS_END])
124 |         # The key is used by the custom output format to put the
125 |         # resulting files in a subdirectory specific to the sample
126 |         # and also as part of one of the VCF header lines.
127 |         key = sample_id
128 |         value = "%s\t%d\t.\tN\t.\t.\t.\tNS=1;AN=0;END=%d\tGT:PS\t0/0:." % (
129 |             contig, start_pos, end)
130 |         print "%s\t%s" % (key, value)
131 | 
132 |     line = file_handle.readline()
133 | 
134 | if __name__ == "__main__":
135 |   main()
136 | 


--------------------------------------------------------------------------------
/pgp/sql/gvcf_variants/allelic-frequency-brca1.sql:
--------------------------------------------------------------------------------
  1 | # The following query computes the allelic frequency for BRCA1 variants in the 
  2 | # PGP dataset.
  3 | #
  4 | # Note that the new BigQuery feature of user-defined javascript
  5 | # functions is in limited preview.  For more info, see
  6 | # https://www.youtube.com/watch?v=GrD7ymUPt3M#t=1377
  7 | SELECT
  8 |   vars.contig_name AS contig_name,
  9 |   vars.reference_bases AS reference_bases,
 10 |   vars.start_pos AS start_pos,
 11 |   vars.alternate_bases AS allele,
 12 |   alternate_allele_count,
 13 |   num_alleles_called,
 14 |   ROUND(alternate_allele_count / num_alleles_called,
 15 |     4) AS freq,
 16 | FROM (
 17 |   SELECT
 18 |     vars.contig_name,
 19 |     vars.reference_bases,
 20 |     vars.start_pos,
 21 |     vars.alternate_bases,
 22 |     alternate_allele_count,
 23 |     SUM(num_alleles_called) AS num_alleles_called,
 24 |   FROM (
 25 |     # The left hand side of our JOIN is all the calls, including
 26 |     # reference_bases calls and no-calls
 27 |     SELECT
 28 |         SUM(num_alleles_called) AS num_alleles_called,
 29 |         contig_name,
 30 |         reference_bases,
 31 |         bin,
 32 |         start_pos,
 33 |         the_end,
 34 |       # This User-defined function helps us reduce the size of the cross product
 35 |       # considered by this JOIN thereby greatly speeding up the query
 36 |       FROM js(
 37 |       (SELECT contig_name, reference_bases, start_pos, end_pos, END, call.genotype,
 38 |        FROM [google.com:biggene:pgp.gvcf_variants]
 39 |        WHERE contig_name = '17'),
 40 |       contig_name, reference_bases, start_pos, end_pos, END, call.genotype,
 41 |       "[{name: 'num_alleles_called', type: 'integer'},
 42 |         {name: 'contig_name', type: 'string'},
 43 |         {name: 'reference_bases', type: 'string'},
 44 |         {name: 'bin', type: 'integer'},
 45 |         {name: 'start_pos', type: 'integer'},
 46 |         {name: 'the_end', type: 'integer'}]",
 47 |        "function(r, emit) {
 48 |           var num_alleles_called = 0;
 49 |           for(var c in r.call) {
 50 |             for(var g in r.call[c].genotype) {
 51 |               if(0 <= r.call[c].genotype[g]) {
 52 |                 num_alleles_called++;
 53 |               }
 54 |             }
 55 |           }
 56 |           var binSize = 5000;
 57 |           var startBin = Math.floor(r.start_pos / binSize);
 58 |           var theEnd = (r.END === null) ? r.end_pos : r.END;
 59 |           var endBin = Math.floor(theEnd / binSize);
 60 |           for(var bin = startBin; bin <= endBin; bin++) {
 61 |             emit({
 62 |               num_alleles_called: num_alleles_called,
 63 |               contig_name: r.contig_name,
 64 |               reference_bases: r.reference_bases,
 65 |               bin: bin,
 66 |               start_pos: r.start_pos,
 67 |               the_end: theEnd
 68 |             });
 69 |           }
 70 |         }")
 71 |         GROUP EACH BY
 72 |         contig_name,
 73 |         reference_bases,
 74 |         bin,
 75 |         start_pos,
 76 |         the_end
 77 |         ) AS all
 78 |   JOIN
 79 |     EACH 
 80 |     # The right hand side of our JOIN are counts of alternate allele values at 
 81 |     # a particular locus
 82 | (SELECT
 83 |   contig_name,
 84 |   start_pos,
 85 |   # This 'bin' can be used in subsequent interval JOINs
 86 |   INTEGER(FLOOR(start_pos / 5000)) AS bin,
 87 |   reference_bases,
 88 |   alternate_bases,
 89 |   SUM(alternate_allele_count) AS alternate_allele_count,
 90 | FROM (
 91 |   SELECT contig_name, start_pos, reference_bases, alternate_bases, alternate_allele_count
 92 |   FROM js(
 93 |     (SELECT contig_name, start_pos, reference_bases, alternate_bases, call.genotype,
 94 |      FROM [google.com:biggene:pgp.gvcf_variants]
 95 |      WHERE contig_name = '17'),
 96 |     contig_name, start_pos, reference_bases, alternate_bases, call.genotype,
 97 |       "[{name: 'contig_name', type: 'string'},
 98 |         {name: 'start_pos', type: 'integer'},
 99 |         {name: 'reference_bases', type: 'string'},
100 |         {name: 'alternate_bases', type: 'string'},
101 |         {name: 'alternate_allele_count', type: 'integer'}]",
102 |       "function(r, emit) {
103 |          for(var a in r.alternate_bases) {
104 |            // for-in yields string keys; convert to a number before adding 1
105 |            var alt_gt = parseInt(a, 10) + 1;
106 |            var alt_count = 0;
107 |            for(var c in r.call) {
108 |              for(var g in r.call[c].genotype) {
109 |                if(alt_gt == r.call[c].genotype[g]) {
110 |                  alt_count++;
111 |                }
112 |              }
113 |            }
114 |            // Emit one record per alt
115 |            emit({
116 |              contig_name: r.contig_name,
117 |              start_pos: r.start_pos,
118 |              reference_bases: r.reference_bases,
119 |              alternate_bases: r.alternate_bases[a],
120 |              alternate_allele_count: alt_count
121 |            });
122 |          }
123 |        }"))
124 | WHERE
125 |   contig_name = '17'
126 |   AND start_pos BETWEEN 41196312 AND 41277500
127 | GROUP EACH BY
128 |   contig_name,
129 |   start_pos,
130 |   bin,
131 |   reference_bases,
132 |   alternate_bases) AS vars
133 |   ON
134 |     vars.contig_name = all.contig_name
135 |     AND vars.bin = all.bin
136 |   WHERE
137 |     # Further constrain the JOIN to calls that overlapped the first base pair
138 |     # of this variant
139 |     all.start_pos <= vars.start_pos
140 |     AND all.the_end >= vars.start_pos+1
141 |   GROUP EACH BY
142 |     vars.contig_name,
143 |     vars.reference_bases,
144 |     vars.start_pos,
145 |     vars.alternate_bases,
146 |     alternate_allele_count
147 |     )
148 | 


--------------------------------------------------------------------------------
/1000genomes_phase3/README.Rmd:
--------------------------------------------------------------------------------
  1 | <!-- R Markdown Documentation, DO NOT EDIT THE PLAIN MARKDOWN VERSION OF THIS FILE -->
  2 | 
  3 | <!-- Copyright 2016 Google Inc. All rights reserved. -->
  4 | 
  5 | <!-- Licensed under the Apache License, Version 2.0 (the "License"); -->
  6 | <!-- you may not use this file except in compliance with the License. -->
  7 | <!-- You may obtain a copy of the License at -->
  8 | 
  9 | <!--     http://www.apache.org/licenses/LICENSE-2.0 -->
 10 | 
 11 | <!-- Unless required by applicable law or agreed to in writing, software -->
 12 | <!-- distributed under the License is distributed on an "AS IS" BASIS, -->
 13 | <!-- WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -->
 14 | <!-- See the License for the specific language governing permissions and -->
 15 | <!-- limitations under the License. -->
 16 | 
 17 | 1,000 Genomes Phase 3 Variants
 18 | ==============================
 19 | 
 20 | ### Additional Resources
 21 | * [Schema](https://bigquery.cloud.google.com/table/genomics-public-data:1000_genomes_phase_3.variants_20150220_release?pli=1)
 22 | * [Data Provenance](http://googlegenomics.readthedocs.io/en/latest/use_cases/discover_public_data/1000_genomes.html)
 23 | * [Index of variant analyses](./sql)
 24 | 
 25 | ### Metrics
 26 | 
 27 | In the following plots we display metrics computed on both 1,000 Genomes phase 1 and phase 3 variants.
 28 | 
 29 | ```{r echo=FALSE, eval=FALSE}
 30 | ######################[ CHANGE ME ]##################################
 31 | # This codelab assumes that the current working directory is where the Rmd file resides.
 32 | setwd("/YOUR/PATH/TO/bigquery-examples/1000genomes_phase3")
 33 | 
 34 | # Set the Google Cloud Platform project id under which these queries will run.
 35 | project <- "YOUR-PROJECT-ID"
 36 | #####################################################################
 37 | 
 38 | # The currently released version 0.3.0 does not yet have the parameter
 39 | # to use Standard SQL instead of Legacy SQL, so we install from github.
 40 | library(devtools)
 41 | install_github('rstats-db/bigrquery')
 42 | ```
 43 | 
 44 | ```{r init, echo=FALSE, message=FALSE, warning=FALSE, comment=NA}
 45 | library(bigrquery)
 46 | library(ggplot2)
 47 | library(scales)
 48 | library(dplyr)
 49 | ```
 50 | 
 51 | ```{r echo=FALSE, message=FALSE, warning=FALSE}
 52 | sample_info <- read.csv("http://storage.googleapis.com/genomics-public-data/1000-genomes/other/sample_info/sample_info.csv")
 53 | # Fetch the materialized metrics tables for each release.
 54 | phase3 <- query_exec("SELECT * FROM `google.com:biggene.1000genomes_analysis_results.phase3_metrics`",
 55 |                      project, use_legacy_sql = FALSE)
 56 | phase1 <- query_exec("SELECT * FROM `google.com:biggene.1000genomes_analysis_results.phase1_metrics`",
 57 |                      project, use_legacy_sql = FALSE)
 58 | # bind_rows() replaces rbind_list(), which dplyr has deprecated.
 59 | results <- inner_join(sample_info,
 60 |                       bind_rows(mutate(phase3, dataset = "phase3"),
 61 |                                 mutate(phase1, dataset = "phase1")),
 62 |                       by=c("Sample" = "call_call_set_name"))
 63 | ```
 64 | 
 65 | ```{r titv_metrics, echo=FALSE, message=FALSE, warning=FALSE, comment=NA, fig.align="center", fig.width=12, fig.height=8}
 66 | ggplot(results, aes(x = dataset, y = ti_tv_ratio)) +
 67 |   geom_boxplot(aes(colour = Super_Population)) +
 68 |   labs(title = "Transition/Transversion SNP Ratio",
 69 |        x = "Dataset",
 70 |        y = "Ti/Tv Ratio")
 71 | # Same metric, now coloured by gender.
 72 | ggplot(results, aes(x = dataset, y = ti_tv_ratio)) +
 73 |   geom_boxplot(aes(colour = Gender)) +
 74 |   labs(title = "Transition/Transversion SNP Ratio",
 75 |        x = "Dataset",
 76 |        y = "Ti/Tv Ratio")
 77 | ```
 78 | 
 79 | ```{r hethom_metrics, echo=FALSE, message=FALSE, warning=FALSE, comment=NA, fig.align="center", fig.width=12, fig.height=8}
 80 | ggplot(results, aes(x = dataset, y = het_hom_ratio)) +
 81 |   geom_boxplot(aes(colour = Super_Population)) +
 82 |   labs(title = "Heterozygous/Homozygous Variant Ratio",
 83 |        x = "Dataset",
 84 |        y = "Het/Hom Ratio")
 85 | # Same metric, now coloured by gender.
 86 | ggplot(results, aes(x = dataset, y = het_hom_ratio)) +
 87 |   geom_boxplot(aes(colour = Gender)) +
 88 |   labs(title = "Heterozygous/Homozygous Variant Ratio",
 89 |        x = "Dataset",
 90 |        y = "Het/Hom Ratio")
 91 | ```
 92 | 
 93 | ```{r indel_metrics, echo=FALSE, message=FALSE, warning=FALSE, comment=NA, fig.align="center", fig.width=12, fig.height=8}
 94 | ggplot(results, aes(x = dataset, y = ins_del_ratio)) +
 95 |   geom_boxplot(aes(colour = Super_Population)) +
 96 |   labs(title = "Insertion/Deletion Ratio",
 97 |        x = "Dataset",
 98 |        y = "Indel Ratio")
 99 | # Same metric, now coloured by gender.
100 | ggplot(results, aes(x = dataset, y = ins_del_ratio)) +
101 |   geom_boxplot(aes(colour = Gender)) +
102 |   labs(title = "Insertion/Deletion Ratio",
103 |        x = "Dataset",
104 |        y = "Indel Ratio")
105 | ```
106 | 
107 | ### Analysis Provenance
108 | 
109 | The following query was run over 1,000 Genomes data:
110 | 
111 | 1. phase 3 variants for 2,504 individuals and materialized to table [google.com:biggene:1000genomes_analysis_results.phase3_metrics](https://bigquery.cloud.google.com/table/google.com:biggene:1000genomes_analysis_results.phase3_metrics?pli=1)
112 | 2. phase 1 variants for 1,095 individuals and materialized to table [google.com:biggene:1000genomes_analysis_results.phase1_metrics](https://bigquery.cloud.google.com/table/google.com:biggene:1000genomes_analysis_results.phase1_metrics?pli=1)
113 |     * note that 1,092 individuals have variants across the entire genome
114 |     * two individuals have variants on chrM only
115 |     * one individual has variants on chrY only
116 | 
117 | Note that it was written to specifically handle the multi-allelic data found in phase 3.
118 | 
119 | ```{r echo=FALSE, message=FALSE, warning=FALSE, comment=NA, cache=FALSE}
120 | cat(readChar("./sql/qc-metrics.sql", nchars=1e6))
121 | ```
122 | 


--------------------------------------------------------------------------------
/1000genomes_phase3/sql/qc-metrics.sql:
--------------------------------------------------------------------------------
  1 | #standardSQL
  2 | --
  3 | -- Compute several common metrics on multi-allelic data.  This will work on data
  4 | -- in either "genome call" format or in "multi-sample variants" format since hom_RR_count
  5 | -- is not used in any of the ratios computed.
  6 | -- http://googlegenomics.readthedocs.io/en/latest/use_cases/load_data/multi_sample_variants.html
  7 | --
  8 | -- Edit the BigQuery table name below to run this query on other data such as 1,000
  9 | -- Genomes phase 1 variants.
 10 | -- http://googlegenomics.readthedocs.io/en/latest/use_cases/discover_public_data/1000_genomes.html
 11 | --
 12 | WITH calls AS (
 13 |   -- For multi-allelic data we FLATTEN by both alternate_bases and call which yields the
 14 |   -- cross product of those values.  We provide the alt_num value so that queries downstream
 15 |   -- of this can correctly identify calls with genotypes that match the particular alternate.
 16 |   SELECT
 17 |     reference_name,
 18 |     start,
 19 |     reference_bases,
 20 |     alt,
 21 |     alt_offset + 1 AS alt_num,  -- The number corresponding to the alternate_bases value.
 22 |     CONCAT(reference_bases, '->', alt) AS mutation,
 23 |     REGEXP_CONTAINS(alt, r'^[ACGT]+$') AS is_sequence,
 24 |     call.call_set_name,
 25 |     (SELECT LOGICAL_AND(gt = 0) FROM UNNEST(call.genotype) gt) AS reference_match_call,
 26 |     call.genotype[SAFE_ORDINAL(1)] AS first_allele,
 27 |     call.genotype[SAFE_ORDINAL(2)] AS second_allele
 28 |   FROM
 29 |     -- To run on phase 1 variants, update the following line to change the source table.
 30 |     `genomics-public-data.1000_genomes_phase_3.variants_20150220_release` v,
 31 |         v.call call, v.alternate_bases alt WITH OFFSET alt_offset
 32 |   -- Use this WHERE clause for fast testing of the query. Remove it for the full analysis.
 33 |   # WHERE
 34 |   #   reference_name IN ('chr17','17')
 35 |   #   AND start BETWEEN 41196311 AND 41277499 # per GRCh37
 36 | ),
 37 | 
 38 | compute_metrics AS (
 39 |   SELECT
 40 |     call_set_name,
 41 |     -- Anchor on alt_num=1 so that we don't over count hom_RR for multi-allelic sites.
 42 |     alt_num = 1 AND reference_match_call AS is_hom_RR,
 43 |     -- Otherwise, check whether the genotypes in the call match this alt_num.
 44 |     first_allele = alt_num AND (second_allele = alt_num OR second_allele IS NULL) AS is_hom_AA,
 45 |     (first_allele = 0 AND second_allele = alt_num) OR (first_allele = alt_num AND second_allele = 0) AS is_het_RA,
 46 |     (first_allele > 0 AND first_allele != alt_num AND second_allele = alt_num)
 47 |     OR (first_allele = alt_num AND second_allele > 0 AND second_allele != alt_num) AS is_het_AA,
 48 |     -- To prevent over counting of variant types due to the FLATTENED data, we make sure the genotype
 49 |     -- in the call corresponds to the alt_num in this row and use this boolean in the downstream
 50 |     -- query.
 51 |     first_allele = alt_num OR second_allele = alt_num AS call_has_alternate_bases,
 52 |     NOT is_sequence AS is_sv,
 53 |     is_sequence AND LENGTH(reference_bases) = 1 AND LENGTH(alt) = 1 AS is_snp,
 54 |     is_sequence AND LENGTH(reference_bases) > 1 AND LENGTH(reference_bases) = LENGTH(alt) AS is_expanded_snp,
 55 |     is_sequence AND LENGTH(reference_bases) < LENGTH(alt) AS is_insertion,
 56 |     is_sequence AND LENGTH(reference_bases) > LENGTH(alt) AS is_deletion,
 57 |     mutation IN ('A->G','G->A','C->T','T->C') AS is_transition,
 58 |     mutation IN ('A->C','C->A','G->T','T->G','A->T','T->A','C->G','G->C') AS is_transversion
 59 |   FROM calls
 60 | ),
 61 | 
 62 | compute_sums AS (
 63 |   SELECT
 64 |     call_set_name,
 65 |     SUM(CAST(is_hom_RR AS INT64)) AS hom_RR_count,
 66 |     SUM(CAST(is_hom_AA AS INT64)) AS hom_AA_count,
 67 |     SUM(CAST(is_het_RA AS INT64)) AS het_RA_count,
 68 |     -- Divide het_AA by two since we have two rows for this sample's alleles because we
 69 |     -- FLATTENED by alternate_bases.
 70 |     SUM(CAST(is_het_AA AS INT64))/2 AS het_AA_count,
 71 |     SUM(CAST(call_has_alternate_bases AS INT64)) AS calls_has_alternate_bases_count,
 72 |     SUM(CAST(call_has_alternate_bases AND is_sv AS INT64)) AS sv_count,
 73 |     SUM(CAST(call_has_alternate_bases AND is_snp AS INT64)) AS snp_count,
 74 |     SUM(CAST(call_has_alternate_bases AND is_expanded_snp AS INT64)) AS expanded_snp_count,
 75 |     SUM(CAST(call_has_alternate_bases AND is_insertion AS INT64)) AS insertion_count,
 76 |     SUM(CAST(call_has_alternate_bases AND is_deletion AS INT64)) AS deletion_count,
 77 |     SUM(CAST(call_has_alternate_bases AND is_transition AS INT64)) AS transitions_count,
 78 |     SUM(CAST(call_has_alternate_bases AND is_transversion AS INT64)) AS transversions_count
 79 |   FROM compute_metrics
 80 |   GROUP BY
 81 |     call_set_name
 82 | )
 83 | 
 84 | SELECT
 85 |   call_set_name,
 86 |   -- Ratios.
 87 |   SAFE_DIVIDE(transitions_count, transversions_count) AS ti_tv_ratio,
 88 |   SAFE_DIVIDE((het_RA_count + 2 * het_AA_count), hom_AA_count) AS het_hom_ratio,
 89 |   SAFE_DIVIDE(insertion_count, deletion_count) AS ins_del_ratio,
 90 |   -- Call type counts.
 91 |   hom_RR_count,
 92 |   hom_AA_count,
 93 |   het_RA_count,
 94 |   het_AA_count,
 95 |   -- Alternate allele type counts.
 96 |   sv_count,
 97 |   snp_count,
 98 |   expanded_snp_count,
 99 |   insertion_count,
100 |   deletion_count,
101 |   -- SNP type counts.
102 |   transitions_count,
103 |   transversions_count,
104 |   -- Let's check our work for over/under counting.
105 |   calls_has_alternate_bases_count,
106 |   transitions_count + transversions_count AS check_snp_count,
107 |   sv_count + snp_count + expanded_snp_count + insertion_count + deletion_count AS check_calls_has_alternate_bases_count,
108 |   hom_RR_count + hom_AA_count + het_RA_count + het_AA_count AS check_total_num_calls
109 | FROM compute_sums
110 | ORDER BY
111 |   call_set_name
112 | 


--------------------------------------------------------------------------------
/1000genomes/data-stories/reproducing-allelic-frequencies/README.Rmd:
--------------------------------------------------------------------------------
  1 | <!-- R Markdown Documentation, DO NOT EDIT THE PLAIN MARKDOWN VERSION OF THIS FILE -->
  2 | 
  3 | <!-- Copyright 2014 Google Inc. All rights reserved. -->
  4 | 
  5 | <!-- Licensed under the Apache License, Version 2.0 (the "License"); -->
  6 | <!-- you may not use this file except in compliance with the License. -->
  7 | <!-- You may obtain a copy of the License at -->
  8 | 
  9 | <!--     http://www.apache.org/licenses/LICENSE-2.0 -->
 10 | 
 11 | <!-- Unless required by applicable law or agreed to in writing, software -->
 12 | <!-- distributed under the License is distributed on an "AS IS" BASIS, -->
 13 | <!-- WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -->
 14 | <!-- See the License for the specific language governing permissions and -->
 15 | <!-- limitations under the License. -->
 16 | 
 17 | Reproducing 1,000 Genomes allele frequencies for variants in BRCA1
 18 | ========================================================
 19 | 
 20 | The following query computes the frequency of both the reference and alternate SNPs within BRCA1 for all samples within 1,000 Genomes.
 21 | 
 22 | ```{r init, echo=FALSE, message=FALSE, warning=FALSE, comment=NA}
 23 | require(bigrquery)
 24 | require(ggplot2)
 25 | require(dplyr)
 26 | require(xtable)
 27 | require(testthat)
 28 | project <- "google.com:biggene" # put your projectID here
 29 | DisplayAndDispatchQuery <- function(queryUri) {
 30 |   # Echo the SQL stored at queryUri, then run it and return the query result.
 31 |   sqlText <- readChar(queryUri, nchars=1e6); cat(sqlText)
 32 |   return(query_exec(sqlText, project))
 33 | }
 34 | ```
 35 | 
 36 | ```{r af, echo=FALSE, message=FALSE, warning=FALSE, error=FALSE, comment=NA, cache=FALSE}
 37 | result <- DisplayAndDispatchQuery("../../sql/reproducing-allelic-frequencies/reproducing-allelic-frequency.sql")
 38 | ```
 39 | Number of rows returned by this query: `r nrow(result)`.
 40 | 
 41 | Displaying the first few rows of our result:
 42 | ```{r echo=FALSE, message=FALSE, warning=FALSE, comment=NA, results="asis"}
 43 | print(xtable(head(result), digits=6), type="html", include.rownames=F)
 44 | ```
 45 | 
 46 | And do our results match the precomputed values resident in the AF INFO field?
 47 | ```{r}
 48 | print(expect_equal(object=result$alt_freq,
 49 |                    expected=result$alt_freq_from_1KG,
 50 |                    tolerance=0.005,
 51 |                    scale=1))
 52 | ```
 53 | We can see from the results that when the computed frequency values in column alt_freq are rounded, they exactly match the alternate allele frequencies as reported in the AF INFO field from the 1,000 Genomes VCF data.
 54 | 
 55 | Next, we compute those same alternate allele frequencies further broken down by super population groups.
 56 | ```{r afeth, echo=FALSE, message=FALSE, warning=FALSE, error=FALSE, comment=NA, cache=FALSE}
 57 | result <- DisplayAndDispatchQuery("../../sql/reproducing-allelic-frequencies/reproducing-allelic-frequency-by-ethnicity.sql")
 58 | ```
 59 | Number of rows returned by this query: `r nrow(result)`.
 60 | 
 61 | Displaying the first few rows of our result:
 62 | ```{r echo=FALSE, message=FALSE, warning=FALSE, comment=NA, results="asis"}
 63 | print(xtable(head(result), digits=6), type="html", include.rownames=F)
 64 | ```
 65 | 
 66 | And do our results match the precomputed values resident in the superpopulation-specific AF INFO fields?
 67 | ```{r}
 68 | # coerce NAs to be zero
 69 | result$alt_freq_from_1KG[is.na(result$alt_freq_from_1KG)] <- 0.0
 70 | print(expect_equal(object=result$alt_freq,
 71 |                    expected=result$alt_freq_from_1KG,
 72 |                    tolerance=0.005,
 73 |                    scale=1))
 74 | ```
 75 | We can see from the results that when the computed frequency values in column alt_freq are rounded, they exactly match the alternate allele frequencies as reported in the AFR_AF, ASN_AF, AMR_AF, EUR_AF INFO fields from the 1,000 Genomes VCF data.
 76 | 
 77 | Moving on to other results regarding rates of variation across populations:
 78 | ```{r sql maf, echo=FALSE, message=FALSE, warning=FALSE, error=FALSE, comment=NA, cache=FALSE}
 79 | result <- DisplayAndDispatchQuery("../../sql/minimum-allelic-frequency-by-ethnicity.sql")
 80 | ```
 81 | Number of rows returned by this query: `r nrow(result)`.
 82 | 
 83 | Displaying the first few rows of our result:
 84 | ```{r echo=FALSE, message=FALSE, warning=FALSE, comment=NA, results="asis"}
 85 | print(xtable(head(result), digits=6), type="html", include.rownames=F)
 86 | ```
 87 | 
 88 | Some data visualization will help us to see more clearly the pattern resident within the results:
 89 | ```{r maf, echo=FALSE, fig.align="center", fig.width=12, fig.height=8}
 90 | ggplot(result, aes(x=population, y=common_variant, fill=super_population)) + geom_boxplot() + ylab("Count of common variants per sample") + ggtitle("Common Variants (Minimum Allelic Frequency 5%)")
 91 | ```
 92 | and now it's clear to see that the ethnicities within the African super population have a much higher rate of mutation compared to the other ethnicities for the common variants.
 93 | 
 94 | This difference is even more notable when looking at all variants:
 95 | ```{r all variants, echo=FALSE, fig.align="center", fig.width=12, fig.height=8}
 96 | ggplot(result, aes(x=population, y=num_variants_for_sample, fill=super_population)) + geom_boxplot() + ylab("Count variants per sample") + ggtitle("All Variants")
 97 | ```
 98 | 
 99 | Now let's examine the rate of variation across genders:
100 | ```{r common variants by gender, echo=FALSE, fig.align="center", fig.width=12, fig.height=8}
101 | ggplot(result, aes(x=super_population, y=common_variant, fill=gender)) + geom_boxplot() + ylab("Count of common variants per sample") + ggtitle("Common Variants (Minimum Allelic Frequency 5%)")
102 | ```
103 | We see a noticeable difference, BUT this query included variants within chromosome X.  Updating the query to ignore sex chromosomes:
104 | ```{r sql maf no X, echo=FALSE, message=FALSE, warning=FALSE, error=FALSE, comment=NA, cache=FALSE}
105 | result <- DisplayAndDispatchQuery("../../sql/minimum-allelic-frequency-by-ethnicity-no-sex-chromosomes.sql")
106 | ```
107 | We see that the genders are quite close in their rate of variation.
108 | ```{r viz maf no X, echo=FALSE, fig.align="center", fig.width=12, fig.height=8}
109 | ggplot(result, aes(x=super_population, y=common_variant, fill=gender)) + geom_boxplot() + ylab("Count of common variants per sample") + ggtitle("Common Variants (Minimum Allelic Frequency 5%)")
110 | ```
111 | 


--------------------------------------------------------------------------------