├── pgp ├── .gitignore ├── data-stories │ ├── schema-comparisons │ │ ├── .gitignore │ │ ├── figure │ │ │ ├── call_cnt.png │ │ │ ├── variant_cnt.png │ │ │ ├── sample_call_cnt.png │ │ │ └── sample_variant_cnt.png │ │ └── schema-comparison-observations.csv │ ├── comparing-pgp-to-1000genomes │ │ ├── .gitignore │ │ └── figure │ │ │ ├── variant counts-1.png │ │ │ ├── genotype heatmap-1.png │ │ │ ├── variant type counts-1.png │ │ │ └── pgp variant type counts-1.png │ ├── issues-with-the-variant-centric-approach │ │ └── .gitignore │ └── README.md ├── figure │ └── gender-1.png ├── sql │ ├── schema-comparisons │ │ ├── record-sample-counts.sql │ │ ├── missingness-klotho.sql │ │ ├── call-counts.sql │ │ ├── klotho-gvcf-expanded.sql │ │ ├── klotho-gvcf.sql │ │ ├── sample-call-counts.sql │ │ ├── missingness-brca1.sql │ │ └── missingness-udf.sql │ ├── comparing-pgp-to-1000genomes │ │ ├── genotype-counts.sql │ │ ├── sample-counts-minmax-by-chromosome.sql │ │ ├── variant-counts-by-chromosome.sql │ │ ├── parsed-genotype-counts.sql │ │ ├── taking-a-closer-look-at-variant-types.sql │ │ └── variant-counts-by-type-and-chromosome.sql │ ├── cgi_variants │ │ ├── klotho.sql │ │ ├── allelic-frequency-comparison.sql │ │ ├── allele-count.sql │ │ ├── ti-tv-ratio.sql │ │ ├── allelic-frequency.py │ │ ├── allelic-frequency-chr1.sql │ │ └── allelic-frequency-brca1.sql │ ├── gender-count.sql │ ├── issues-with-the-variant-centric-approach │ │ ├── klotho-summary.sql │ │ ├── factor-v-leiden.sql │ │ └── factor-v-leiden-summary.sql │ ├── gvcf_variants_expanded │ │ ├── klotho.sql │ │ ├── ti-tv-ratio.sql │ │ └── allelic-frequency.sql │ └── gvcf_variants │ │ ├── klotho.sql │ │ ├── allelic-frequency-comparison.sql │ │ ├── allelic-frequency.py │ │ ├── allele-count.sql │ │ ├── ti-tv-ratio.sql │ │ ├── allelic-frequency-brca1-no-udf.sql │ │ ├── allelic-frequency-chr1.sql │ │ └── allelic-frequency-brca1.sql ├── provenance │ ├── gvcf-expand-mapper.py │ ├── gvcf-expand-reducer.py │ ├── cgi-header-mapper.py │ ├── 
cgi-mapper.py │ └── cgi-ref-blocks-mapper.py ├── README.md └── README.Rmd ├── 1000genomes ├── .gitignore ├── data-stories │ ├── exploring-the-variant-data │ │ ├── .gitignore │ │ └── figure │ │ │ ├── unnamed-chunk-11-1.png │ │ │ ├── unnamed-chunk-13-1.png │ │ │ ├── unnamed-chunk-15-1.png │ │ │ ├── unnamed-chunk-16-1.png │ │ │ ├── unnamed-chunk-3-1.png │ │ │ ├── unnamed-chunk-5-1.png │ │ │ ├── unnamed-chunk-7-1.png │ │ │ └── unnamed-chunk-9-1.png │ ├── exploring-the-phenotypic-data │ │ ├── .gitignore │ │ └── figure │ │ │ ├── families.png │ │ │ ├── gender.png │ │ │ ├── samples.png │ │ │ ├── superpop.png │ │ │ ├── ethnicity.png │ │ │ └── ethnicity and gender.png │ ├── reproducing-hardy-weinberg-equilibrium │ │ ├── .gitignore │ │ └── README.Rmd │ ├── reproducing-vcfstats │ │ └── vcfstats-output │ │ │ ├── stats.qual-tstv │ │ │ ├── stats.tstv │ │ │ ├── stats.legend │ │ │ ├── stats.shared │ │ │ └── stats.private │ ├── reproducing-allelic-frequencies │ │ ├── figure │ │ │ ├── maf.png │ │ │ ├── all variants.png │ │ │ ├── viz maf no X.png │ │ │ └── common variants by gender.png │ │ └── README.Rmd │ └── README.md ├── figure │ ├── dbSNP Variants-1.png │ ├── shared Variants-1.png │ ├── shared variants by pop-1.png │ ├── shared common variants by pop-1.png │ ├── shared rare variants by pop-1.png │ ├── shared rare variants by percent pop-1.png │ └── shared common variants by percent pop-1.png ├── sql │ ├── variant-counts-by-type.sql │ ├── phenotype_sql │ │ ├── num-samples.sql │ │ ├── family-sizes.sql │ │ ├── gender-ratio.sql │ │ ├── ethnicity-by-gender-ratio.sql │ │ ├── ethnicity-by-superpop-ratio.sql │ │ └── ethnicity-ratio.sql │ ├── reproducing-vcfstats │ │ ├── variant-count-brca1.sql │ │ ├── variant-counts-by-type-brca1.sql │ │ ├── snp-variant-counts-brca1.sql │ │ ├── sample-snp-counts-brca1.sql │ │ ├── sample-indel-counts-brca1.sql │ │ ├── shared-variant-counts-brca1.sql │ │ ├── indel-length-counts-brca1.sql │ │ ├── private-variant-counts-brca1.sql │ │ ├── 
variant-sample-counts-brca1.sql │ │ └── ti-tv-ratio-brca1.sql │ ├── variant-counts-by-type-and-chromosome.sql │ ├── snp-variant-counts.sql │ ├── understanding-alternate-alleles │ │ ├── chrom-pos-ref-dups.sql │ │ ├── minimal-unique-key.sql │ │ ├── three-chrom-pos-ref-dups.sql │ │ ├── count-chrom-pos-ref.sql │ │ ├── unique-key.sql │ │ ├── not-quite-unique-key.sql │ │ ├── sample-likelihood.sql │ │ ├── count-by-var-type-chrom-pos-ref-dups.sql │ │ ├── count-by-var-type-chrom-pos-ref-singles.sql │ │ └── sample-chrom-pos-ref-dups.sql │ ├── ratio-of-variants-by-type.sql │ ├── variant-level-data-for-brca1.sql │ ├── ratio-of-dbsnp-variants-by-chromosome.sql │ ├── indel-length-counts.sql │ ├── private-variant-counts.sql │ ├── shared-variant-counts.sql │ ├── sample-variant-counts-by-type-and-chromosome.sql │ ├── sample-level-data-for-brca1.sql │ ├── ti-tv-ratio.sql │ ├── heterozygous-homozygous-ratio.sql │ ├── minimum-allelic-frequency-by-ethnicity.sql │ ├── variant-hotspots.sql │ ├── sample-variant-hotspots.sql │ ├── minimum-allelic-frequency-by-ethnicity-no-sex-chromosomes.sql │ ├── allelic-frequency.sql │ ├── reproducing-allelic-frequencies │ │ ├── reproducing-allelic-frequency.sql │ │ └── reproducing-allelic-frequency-by-ethnicity.sql │ ├── allelic-frequency-by-gender.sql │ ├── allelic-frequency-by-ethnicity.sql │ ├── gender-het-hom-ratio.sql │ ├── shared-variant-counts-by-ethnicity.sql │ ├── gwas-pattern-two-proportion-z-test.sql │ ├── gwas-pattern-chi-squared-test.sql │ └── hardy-weinberg-equilibrium.sql └── provenance │ └── README.md ├── .gitignore ├── platinumGenomes ├── figure │ └── function-1.png ├── sql │ ├── sample-snps-by-exonic-function.sql │ ├── cohort-rare-pathenogenic-snps.sql │ └── sample-rare-pathenogenic-snps.sql └── README.Rmd ├── 1000genomes_phase3 ├── figure │ ├── titv_metrics-1.png │ ├── titv_metrics-2.png │ ├── hethom_metrics-1.png │ ├── hethom_metrics-2.png │ ├── indel_metrics-1.png │ └── indel_metrics-2.png ├── README.Rmd └── sql │ └── qc-metrics.sql 
├── annotations └── README.md ├── README.md ├── CONTRIBUTING.rst └── sgdp └── provenance └── wrangle-simons-sample-attributes.R /pgp/.gitignore: -------------------------------------------------------------------------------- 1 | .httr-oauth 2 | -------------------------------------------------------------------------------- /1000genomes/.gitignore: -------------------------------------------------------------------------------- 1 | .httr-oauth 2 | -------------------------------------------------------------------------------- /pgp/data-stories/schema-comparisons/.gitignore: -------------------------------------------------------------------------------- 1 | .httr-oauth 2 | -------------------------------------------------------------------------------- /pgp/data-stories/comparing-pgp-to-1000genomes/.gitignore: -------------------------------------------------------------------------------- 1 | .httr-oauth 2 | -------------------------------------------------------------------------------- /1000genomes/data-stories/exploring-the-variant-data/.gitignore: -------------------------------------------------------------------------------- 1 | .httr-oauth 2 | -------------------------------------------------------------------------------- /1000genomes/data-stories/exploring-the-phenotypic-data/.gitignore: -------------------------------------------------------------------------------- 1 | .httr-oauth 2 | -------------------------------------------------------------------------------- /pgp/data-stories/issues-with-the-variant-centric-approach/.gitignore: -------------------------------------------------------------------------------- 1 | .httr-oauth 2 | -------------------------------------------------------------------------------- /1000genomes/data-stories/reproducing-hardy-weinberg-equilibrium/.gitignore: -------------------------------------------------------------------------------- 1 | .httr-oauth 2 | 
-------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | cache 2 | *.Rproj 3 | .Rproj.user 4 | .Rhistory 5 | .RData 6 | *html 7 | .httr-oauth 8 | -------------------------------------------------------------------------------- /pgp/figure/gender-1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/googlegenomics/bigquery-examples/master/pgp/figure/gender-1.png -------------------------------------------------------------------------------- /1000genomes/data-stories/reproducing-vcfstats/vcfstats-output/stats.qual-tstv: -------------------------------------------------------------------------------- 1 | #Quality Marginal count Marginal Ts/Tv 2 | -------------------------------------------------------------------------------- /1000genomes/data-stories/reproducing-vcfstats/vcfstats-output/stats.tstv: -------------------------------------------------------------------------------- 1 | #Transitions Transversions ts/tv Sample 2 | 615 228 2.70 all 3 | -------------------------------------------------------------------------------- /1000genomes/figure/dbSNP Variants-1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/googlegenomics/bigquery-examples/master/1000genomes/figure/dbSNP Variants-1.png -------------------------------------------------------------------------------- /platinumGenomes/figure/function-1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/googlegenomics/bigquery-examples/master/platinumGenomes/figure/function-1.png -------------------------------------------------------------------------------- /1000genomes/figure/shared Variants-1.png: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/googlegenomics/bigquery-examples/master/1000genomes/figure/shared Variants-1.png -------------------------------------------------------------------------------- /1000genomes_phase3/figure/titv_metrics-1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/googlegenomics/bigquery-examples/master/1000genomes_phase3/figure/titv_metrics-1.png -------------------------------------------------------------------------------- /1000genomes_phase3/figure/titv_metrics-2.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/googlegenomics/bigquery-examples/master/1000genomes_phase3/figure/titv_metrics-2.png -------------------------------------------------------------------------------- /1000genomes_phase3/figure/hethom_metrics-1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/googlegenomics/bigquery-examples/master/1000genomes_phase3/figure/hethom_metrics-1.png -------------------------------------------------------------------------------- /1000genomes_phase3/figure/hethom_metrics-2.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/googlegenomics/bigquery-examples/master/1000genomes_phase3/figure/hethom_metrics-2.png -------------------------------------------------------------------------------- /1000genomes_phase3/figure/indel_metrics-1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/googlegenomics/bigquery-examples/master/1000genomes_phase3/figure/indel_metrics-1.png -------------------------------------------------------------------------------- /1000genomes_phase3/figure/indel_metrics-2.png: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/googlegenomics/bigquery-examples/master/1000genomes_phase3/figure/indel_metrics-2.png -------------------------------------------------------------------------------- /1000genomes/figure/shared variants by pop-1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/googlegenomics/bigquery-examples/master/1000genomes/figure/shared variants by pop-1.png -------------------------------------------------------------------------------- /1000genomes/figure/shared common variants by pop-1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/googlegenomics/bigquery-examples/master/1000genomes/figure/shared common variants by pop-1.png -------------------------------------------------------------------------------- /1000genomes/figure/shared rare variants by pop-1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/googlegenomics/bigquery-examples/master/1000genomes/figure/shared rare variants by pop-1.png -------------------------------------------------------------------------------- /pgp/data-stories/schema-comparisons/figure/call_cnt.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/googlegenomics/bigquery-examples/master/pgp/data-stories/schema-comparisons/figure/call_cnt.png -------------------------------------------------------------------------------- /pgp/data-stories/schema-comparisons/figure/variant_cnt.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/googlegenomics/bigquery-examples/master/pgp/data-stories/schema-comparisons/figure/variant_cnt.png -------------------------------------------------------------------------------- /1000genomes/figure/shared rare variants by percent pop-1.png: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/googlegenomics/bigquery-examples/master/1000genomes/figure/shared rare variants by percent pop-1.png -------------------------------------------------------------------------------- /1000genomes/figure/shared common variants by percent pop-1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/googlegenomics/bigquery-examples/master/1000genomes/figure/shared common variants by percent pop-1.png -------------------------------------------------------------------------------- /pgp/data-stories/schema-comparisons/figure/sample_call_cnt.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/googlegenomics/bigquery-examples/master/pgp/data-stories/schema-comparisons/figure/sample_call_cnt.png -------------------------------------------------------------------------------- /pgp/data-stories/schema-comparisons/figure/sample_variant_cnt.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/googlegenomics/bigquery-examples/master/pgp/data-stories/schema-comparisons/figure/sample_variant_cnt.png -------------------------------------------------------------------------------- /1000genomes/data-stories/reproducing-allelic-frequencies/figure/maf.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/googlegenomics/bigquery-examples/master/1000genomes/data-stories/reproducing-allelic-frequencies/figure/maf.png -------------------------------------------------------------------------------- /1000genomes/data-stories/exploring-the-phenotypic-data/figure/families.png: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/googlegenomics/bigquery-examples/master/1000genomes/data-stories/exploring-the-phenotypic-data/figure/families.png -------------------------------------------------------------------------------- /1000genomes/data-stories/exploring-the-phenotypic-data/figure/gender.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/googlegenomics/bigquery-examples/master/1000genomes/data-stories/exploring-the-phenotypic-data/figure/gender.png -------------------------------------------------------------------------------- /1000genomes/data-stories/exploring-the-phenotypic-data/figure/samples.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/googlegenomics/bigquery-examples/master/1000genomes/data-stories/exploring-the-phenotypic-data/figure/samples.png -------------------------------------------------------------------------------- /1000genomes/data-stories/exploring-the-phenotypic-data/figure/superpop.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/googlegenomics/bigquery-examples/master/1000genomes/data-stories/exploring-the-phenotypic-data/figure/superpop.png -------------------------------------------------------------------------------- /pgp/data-stories/comparing-pgp-to-1000genomes/figure/variant counts-1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/googlegenomics/bigquery-examples/master/pgp/data-stories/comparing-pgp-to-1000genomes/figure/variant counts-1.png -------------------------------------------------------------------------------- /1000genomes/data-stories/exploring-the-phenotypic-data/figure/ethnicity.png: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/googlegenomics/bigquery-examples/master/1000genomes/data-stories/exploring-the-phenotypic-data/figure/ethnicity.png -------------------------------------------------------------------------------- /pgp/data-stories/comparing-pgp-to-1000genomes/figure/genotype heatmap-1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/googlegenomics/bigquery-examples/master/pgp/data-stories/comparing-pgp-to-1000genomes/figure/genotype heatmap-1.png -------------------------------------------------------------------------------- /pgp/data-stories/comparing-pgp-to-1000genomes/figure/variant type counts-1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/googlegenomics/bigquery-examples/master/pgp/data-stories/comparing-pgp-to-1000genomes/figure/variant type counts-1.png -------------------------------------------------------------------------------- /1000genomes/data-stories/exploring-the-variant-data/figure/unnamed-chunk-11-1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/googlegenomics/bigquery-examples/master/1000genomes/data-stories/exploring-the-variant-data/figure/unnamed-chunk-11-1.png -------------------------------------------------------------------------------- /1000genomes/data-stories/exploring-the-variant-data/figure/unnamed-chunk-13-1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/googlegenomics/bigquery-examples/master/1000genomes/data-stories/exploring-the-variant-data/figure/unnamed-chunk-13-1.png -------------------------------------------------------------------------------- /1000genomes/data-stories/exploring-the-variant-data/figure/unnamed-chunk-15-1.png: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/googlegenomics/bigquery-examples/master/1000genomes/data-stories/exploring-the-variant-data/figure/unnamed-chunk-15-1.png -------------------------------------------------------------------------------- /1000genomes/data-stories/exploring-the-variant-data/figure/unnamed-chunk-16-1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/googlegenomics/bigquery-examples/master/1000genomes/data-stories/exploring-the-variant-data/figure/unnamed-chunk-16-1.png -------------------------------------------------------------------------------- /1000genomes/data-stories/exploring-the-variant-data/figure/unnamed-chunk-3-1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/googlegenomics/bigquery-examples/master/1000genomes/data-stories/exploring-the-variant-data/figure/unnamed-chunk-3-1.png -------------------------------------------------------------------------------- /1000genomes/data-stories/exploring-the-variant-data/figure/unnamed-chunk-5-1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/googlegenomics/bigquery-examples/master/1000genomes/data-stories/exploring-the-variant-data/figure/unnamed-chunk-5-1.png -------------------------------------------------------------------------------- /1000genomes/data-stories/exploring-the-variant-data/figure/unnamed-chunk-7-1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/googlegenomics/bigquery-examples/master/1000genomes/data-stories/exploring-the-variant-data/figure/unnamed-chunk-7-1.png -------------------------------------------------------------------------------- /1000genomes/data-stories/exploring-the-variant-data/figure/unnamed-chunk-9-1.png: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/googlegenomics/bigquery-examples/master/1000genomes/data-stories/exploring-the-variant-data/figure/unnamed-chunk-9-1.png -------------------------------------------------------------------------------- /1000genomes/data-stories/reproducing-allelic-frequencies/figure/all variants.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/googlegenomics/bigquery-examples/master/1000genomes/data-stories/reproducing-allelic-frequencies/figure/all variants.png -------------------------------------------------------------------------------- /1000genomes/data-stories/reproducing-allelic-frequencies/figure/viz maf no X.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/googlegenomics/bigquery-examples/master/1000genomes/data-stories/reproducing-allelic-frequencies/figure/viz maf no X.png -------------------------------------------------------------------------------- /pgp/data-stories/comparing-pgp-to-1000genomes/figure/pgp variant type counts-1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/googlegenomics/bigquery-examples/master/pgp/data-stories/comparing-pgp-to-1000genomes/figure/pgp variant type counts-1.png -------------------------------------------------------------------------------- /1000genomes/data-stories/exploring-the-phenotypic-data/figure/ethnicity and gender.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/googlegenomics/bigquery-examples/master/1000genomes/data-stories/exploring-the-phenotypic-data/figure/ethnicity and gender.png -------------------------------------------------------------------------------- 
/1000genomes/data-stories/reproducing-allelic-frequencies/figure/common variants by gender.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/googlegenomics/bigquery-examples/master/1000genomes/data-stories/reproducing-allelic-frequencies/figure/common variants by gender.png -------------------------------------------------------------------------------- /1000genomes/sql/variant-counts-by-type.sql: -------------------------------------------------------------------------------- 1 | # Count the number of variants across the entirety of 1,000 Genomes by variant type. 2 | SELECT 3 | vt, 4 | COUNT(vt) as cnt, 5 | FROM 6 | [genomics-public-data:1000_genomes.variants] 7 | GROUP BY 8 | vt 9 | ORDER BY 10 | vt 11 | -------------------------------------------------------------------------------- /1000genomes/sql/phenotype_sql/num-samples.sql: -------------------------------------------------------------------------------- 1 | # Count the number of samples in the phenotypic data 2 | SELECT 3 | COUNT(sample) AS all_samples, 4 | SUM(IF(In_Phase1_Integrated_Variant_Set = TRUE, 1, 0)) AS samples_in_variants_table 5 | FROM 6 | [genomics-public-data:1000_genomes.sample_info] 7 | -------------------------------------------------------------------------------- /1000genomes/sql/reproducing-vcfstats/variant-count-brca1.sql: -------------------------------------------------------------------------------- 1 | # Count the number of variants in BRCA1 2 | SELECT 3 | count(reference_name) as num_variants, 4 | FROM 5 | [genomics-public-data:1000_genomes.variants] 6 | WHERE 7 | reference_name = '17' 8 | AND start BETWEEN 41196311 9 | AND 41277499 -------------------------------------------------------------------------------- /1000genomes/sql/reproducing-vcfstats/variant-counts-by-type-brca1.sql: -------------------------------------------------------------------------------- 1 | # Count the number of variants by type in 
BRCA1. 2 | SELECT 3 | vt AS variant_type, 4 | COUNT(vt) AS num_variants_of_type, 5 | FROM 6 | [genomics-public-data:1000_genomes.variants] 7 | WHERE 8 | reference_name = '17' 9 | AND start BETWEEN 41196311 10 | AND 41277499 11 | GROUP BY 12 | variant_type -------------------------------------------------------------------------------- /pgp/sql/schema-comparisons/record-sample-counts.sql: -------------------------------------------------------------------------------- 1 | # Confirm that we are correctly expanding reference-matching blocks into our variants. 2 | SELECT 3 | MAX(num_sample_ids) as max_samples_per_record, 4 | FROM ( 5 | SELECT 6 | COUNT(call.callset_name) WITHIN RECORD AS num_sample_ids, 7 | FROM 8 | [google.com:biggene:test.pgp_gvcf_variants_expanded2] 9 | ) 10 | -------------------------------------------------------------------------------- /1000genomes/sql/variant-counts-by-type-and-chromosome.sql: -------------------------------------------------------------------------------- 1 | # Count the number of variants across the entirety of 1,000 Genomes by variant type and 2 | # chromosome. 
3 | SELECT 4 | reference_name, 5 | vt, 6 | COUNT(vt) AS cnt, 7 | FROM 8 | [genomics-public-data:1000_genomes.variants] 9 | GROUP BY 10 | reference_name, 11 | vt 12 | ORDER BY 13 | reference_name, 14 | vt 15 | -------------------------------------------------------------------------------- /1000genomes/sql/snp-variant-counts.sql: -------------------------------------------------------------------------------- 1 | # Count SNPs by base pair transition across the dataset 2 | SELECT 3 | reference_bases, 4 | alternate_bases AS allele, 5 | COUNT(alternate_bases) AS num_snps 6 | FROM 7 | [genomics-public-data:1000_genomes.variants] 8 | WHERE 9 | vt ='SNP' 10 | GROUP BY 11 | reference_bases, 12 | allele 13 | ORDER BY 14 | reference_bases, 15 | allele 16 | -------------------------------------------------------------------------------- /pgp/sql/comparing-pgp-to-1000genomes/genotype-counts.sql: -------------------------------------------------------------------------------- 1 | # Count the number of genotypes for all individuals in the dataset. 2 | SELECT 3 | genotype, 4 | COUNT(genotype) AS cnt, 5 | FROM ( 6 | SELECT 7 | GROUP_CONCAT(STRING(call.genotype)) WITHIN call AS genotype, 8 | FROM 9 | [google.com:biggene:pgp_20150205.genome_calls]) 10 | GROUP BY 11 | genotype 12 | ORDER BY 13 | cnt DESC 14 | -------------------------------------------------------------------------------- /1000genomes/data-stories/reproducing-vcfstats/vcfstats-output/stats.legend: -------------------------------------------------------------------------------- 1 | 2 | count 3 | Number of positions with known genotype 4 | 5 | nalt_X 6 | Number of monoallelic (X=0), biallelic (X=1), etc. 
sites 7 | 8 | ref, ref_count 9 | Number of sites containing reference allele 10 | 11 | shared 12 | Number of sites having a non-reference allele in 0,1,2,etc samples 13 | 14 | snp_count 15 | Number of positions with SNPs 16 | -------------------------------------------------------------------------------- /1000genomes/sql/phenotype_sql/family-sizes.sql: -------------------------------------------------------------------------------- 1 | # Compute the distribution of family sizes 2 | SELECT 3 | num_family_members AS family_size, 4 | COUNT(num_family_members) AS num_families_of_size 5 | FROM ( 6 | SELECT 7 | family_id, 8 | COUNT(family_id) AS num_family_members, 9 | FROM 10 | [genomics-public-data:1000_genomes.sample_info] 11 | WHERE 12 | In_Phase1_Integrated_Variant_Set = TRUE 13 | GROUP BY 14 | family_id) 15 | GROUP BY 16 | family_size -------------------------------------------------------------------------------- /1000genomes/sql/phenotype_sql/gender-ratio.sql: -------------------------------------------------------------------------------- 1 | # Compute sample count and ratio by gender 2 | SELECT 3 | gender, 4 | gender_count, 5 | RATIO_TO_REPORT(gender_count) 6 | OVER 7 | ( 8 | ORDER BY 9 | gender_count) AS gender_ratio 10 | FROM ( 11 | SELECT 12 | gender, 13 | COUNT(gender) AS gender_count, 14 | FROM 15 | [genomics-public-data:1000_genomes.sample_info] 16 | WHERE 17 | In_Phase1_Integrated_Variant_Set = TRUE 18 | GROUP BY 19 | gender) -------------------------------------------------------------------------------- /pgp/sql/cgi_variants/klotho.sql: -------------------------------------------------------------------------------- 1 | # Sample level data for Klotho variant rs9536314 for use in the "amazing 2 | # intelligence of PGP participants" data story. 
3 | SELECT 4 | sample_id, 5 | chromosome, 6 | locusBegin, 7 | locusEnd, 8 | reference, 9 | allele1Seq, 10 | allele2Seq, 11 | FROM 12 | [google.com:biggene:pgp.cgi_variants] 13 | WHERE 14 | chromosome = "chr13" 15 | AND locusBegin <= 33628137 16 | AND locusEnd >= 33628138 17 | ORDER BY 18 | sample_id 19 | -------------------------------------------------------------------------------- /pgp/sql/gender-count.sql: -------------------------------------------------------------------------------- 1 | # Compute sample count by gender 2 | SELECT 3 | Sex_Gender, 4 | COUNT(1) AS cnt 5 | FROM 6 | ( 7 | SELECT 8 | call.callset_name, 9 | Sex_Gender 10 | FROM 11 | FLATTEN([google.com:biggene:pgp.variants], 12 | call) AS var 13 | JOIN 14 | [google.com:biggene:pgp.phenotypes] AS pheno 15 | ON 16 | pheno.Participant = var.call.callset_name 17 | GROUP BY 18 | call.callset_name, 19 | Sex_Gender) 20 | GROUP BY 21 | Sex_Gender -------------------------------------------------------------------------------- /1000genomes/sql/reproducing-vcfstats/snp-variant-counts-brca1.sql: -------------------------------------------------------------------------------- 1 | # Count SNPs by base pair transition across BRCA1. 
2 | SELECT 3 | reference_bases, 4 | alternate_bases AS allele, 5 | COUNT(alternate_bases) AS num_snps 6 | FROM 7 | [genomics-public-data:1000_genomes.variants] 8 | WHERE 9 | reference_name = '17' 10 | AND start BETWEEN 41196311 11 | AND 41277499 12 | AND vt ='SNP' 13 | GROUP BY 14 | reference_bases, 15 | allele 16 | ORDER BY 17 | reference_bases, 18 | allele -------------------------------------------------------------------------------- /1000genomes/sql/understanding-alternate-alleles/chrom-pos-ref-dups.sql: -------------------------------------------------------------------------------- 1 | # Find variants on chromosome 17 that share the same start position and the same reference base 2 | SELECT 3 | reference_name, 4 | start, 5 | reference_bases, 6 | COUNT(start) AS num_alternates 7 | FROM 8 | [genomics-public-data:1000_genomes.variants] 9 | WHERE 10 | reference_name = '17' 11 | GROUP BY 12 | reference_name, 13 | start, 14 | reference_bases 15 | HAVING 16 | num_alternates > 1 17 | ORDER BY 18 | reference_name, 19 | start, 20 | reference_bases -------------------------------------------------------------------------------- /1000genomes/sql/reproducing-vcfstats/sample-snp-counts-brca1.sql: -------------------------------------------------------------------------------- 1 | # Sample SNP counts for BRCA1.
2 | SELECT 3 | COUNT(sample_id) AS variant_count, 4 | sample_id 5 | FROM ( 6 | SELECT 7 | reference_name, 8 | start, 9 | reference_bases, 10 | call.call_set_name AS sample_id 11 | FROM 12 | [genomics-public-data:1000_genomes.variants] 13 | WHERE 14 | reference_name = '17' 15 | AND start BETWEEN 41196311 16 | AND 41277499 17 | AND vt ='SNP' 18 | AND (0 < call.genotype) 19 | ) 20 | GROUP BY 21 | sample_id 22 | ORDER BY 23 | sample_id 24 | -------------------------------------------------------------------------------- /1000genomes/sql/understanding-alternate-alleles/minimal-unique-key.sql: -------------------------------------------------------------------------------- 1 | # This query demonstrates the minimal set of fields needed to 2 | # comprise a unique key for the rows in the table. 3 | SELECT 4 | reference_name, 5 | start, 6 | alt, 7 | end, 8 | COUNT(1) AS cnt 9 | FROM ( 10 | SELECT 11 | reference_name, 12 | start, 13 | GROUP_CONCAT(alternate_bases) WITHIN RECORD AS alt, 14 | end, 15 | FROM 16 | [genomics-public-data:1000_genomes.variants]) 17 | GROUP EACH BY 18 | reference_name, 19 | start, 20 | alt, 21 | end 22 | HAVING 23 | cnt > 1 -------------------------------------------------------------------------------- /1000genomes/sql/understanding-alternate-alleles/three-chrom-pos-ref-dups.sql: -------------------------------------------------------------------------------- 1 | # Get three particular start on chromosome 17 that have alternate variants. 
2 | SELECT 3 | reference_name, 4 | start, 5 | reference_bases, 6 | GROUP_CONCAT(alternate_bases) WITHIN RECORD AS alt, 7 | GROUP_CONCAT(names) WITHIN RECORD AS names, 8 | vt, 9 | FROM 10 | [genomics-public-data:1000_genomes.variants] 11 | WHERE 12 | reference_name = '17' 13 | AND (start = 48515942 14 | OR start = 48570613 15 | OR start = 48659342) 16 | ORDER BY 17 | start, 18 | reference_bases, 19 | alt 20 | -------------------------------------------------------------------------------- /pgp/sql/issues-with-the-variant-centric-approach/klotho-summary.sql: -------------------------------------------------------------------------------- 1 | # Sample counts for Klotho variant rs9536314 for use in the "amazing 2 | # intelligence of PGP participants" data story. 3 | SELECT 4 | COUNT(sample_id) AS sample_counts, 5 | chromosome, 6 | reference, 7 | allele1Seq, 8 | allele2Seq, 9 | FROM 10 | [google.com:biggene:pgp.cgi_variants] 11 | WHERE 12 | chromosome = "chr13" 13 | AND locusBegin <= 33628137 14 | AND locusEnd >= 33628138 15 | GROUP BY 16 | chromosome, 17 | reference, 18 | allele1Seq, 19 | allele2Seq 20 | ORDER BY 21 | sample_counts DESC 22 | -------------------------------------------------------------------------------- /1000genomes/sql/understanding-alternate-alleles/count-chrom-pos-ref.sql: -------------------------------------------------------------------------------- 1 | # Count number of alternate variants on chromosome 17 for the same start and 2 | # reference base 3 | SELECT 4 | num_alternates, 5 | COUNT(num_alternates) AS num_records 6 | FROM ( 7 | SELECT 8 | reference_name, 9 | start, 10 | reference_bases, 11 | COUNT(start) AS num_alternates, 12 | FROM 13 | [genomics-public-data:1000_genomes.variants] 14 | WHERE 15 | reference_name = '17' 16 | GROUP BY 17 | reference_name, 18 | start, 19 | reference_bases) 20 | GROUP BY 21 | num_alternates 22 | -------------------------------------------------------------------------------- 
/1000genomes/sql/ratio-of-variants-by-type.sql: -------------------------------------------------------------------------------- 1 | # Compute the ratios of variants by type for each chromosome. 2 | SELECT 3 | reference_name, 4 | vt AS variant_type, 5 | RATIO_TO_REPORT(variant_count) 6 | OVER 7 | ( 8 | PARTITION BY 9 | reference_name 10 | ORDER BY 11 | variant_count DESC) ratio_of_variants_of_type_for_reference_name, 12 | FROM ( 13 | SELECT 14 | reference_name, 15 | vt, 16 | COUNT(vt) AS variant_count 17 | FROM 18 | [genomics-public-data:1000_genomes.variants] 19 | GROUP BY 20 | reference_name, 21 | vt 22 | ORDER BY 23 | reference_name, 24 | vt) 25 | -------------------------------------------------------------------------------- /1000genomes/sql/phenotype_sql/ethnicity-by-gender-ratio.sql: -------------------------------------------------------------------------------- 1 | # Sample counts by population (ethnicity) and gender, plus each gender's share within its population 2 | SELECT 3 | population, 4 | gender, 5 | population_count, 6 | RATIO_TO_REPORT(population_count) OVER( 7 | PARTITION BY 8 | population 9 | ORDER BY 10 | gender) 11 | AS population_ratio 12 | from( 13 | SELECT 14 | gender, 15 | population, 16 | COUNT(population) AS population_count, 17 | FROM 18 | [genomics-public-data:1000_genomes.sample_info] 19 | WHERE 20 | In_Phase1_Integrated_Variant_Set = TRUE 21 | GROUP BY 22 | gender, 23 | population) 24 | ORDER BY 25 | population, 26 | gender 27 | -------------------------------------------------------------------------------- /1000genomes/sql/variant-level-data-for-brca1.sql: -------------------------------------------------------------------------------- 1 | #standardSQL 2 | -- 3 | -- Retrieve variant-level information for BRCA1 variants. 
4 | -- 5 | SELECT 6 | reference_name, 7 | start, 8 | `end`, 9 | reference_bases, 10 | ARRAY_TO_STRING(v.alternate_bases, ',') AS alts, 11 | quality, 12 | ARRAY_TO_STRING(v.filter, ',') AS filter, 13 | ARRAY_TO_STRING(v.names, ',') AS names, 14 | vt, 15 | ARRAY_LENGTH(v.call) AS num_samples 16 | FROM 17 | `genomics-public-data.1000_genomes.variants` v 18 | WHERE 19 | reference_name IN ('17', 'chr17') 20 | AND start BETWEEN 41196311 AND 41277499 # per GRCh37 21 | ORDER BY 22 | start, 23 | alts 24 | -------------------------------------------------------------------------------- /pgp/sql/comparing-pgp-to-1000genomes/sample-counts-minmax-by-chromosome.sql: -------------------------------------------------------------------------------- 1 | # Summarize the minimum and maximum number of samples per variant by chromosome. 2 | SELECT 3 | reference_name, 4 | MIN(sample_count) AS minimum_sample_count, 5 | MAX(sample_count) AS maximum_sample_count, 6 | FROM ( 7 | SELECT 8 | reference_name, 9 | COUNT(call.call_set_name) WITHIN RECORD AS sample_count 10 | FROM 11 | [google.com:biggene:pgp_20150205.genome_calls] 12 | # The source data was Complete Genomics which includes non-variant segments. 
13 | OMIT RECORD IF EVERY(alternate_bases IS NULL)) 14 | GROUP BY 15 | reference_name 16 | ORDER BY 17 | reference_name 18 | -------------------------------------------------------------------------------- /1000genomes/sql/phenotype_sql/ethnicity-by-superpop-ratio.sql: -------------------------------------------------------------------------------- 1 | # Ratios of ethnicities grouped by super population 2 | SELECT 3 | super_population, 4 | super_population_description, 5 | super_population_count, 6 | RATIO_TO_REPORT(super_population_count) 7 | OVER 8 | ( 9 | ORDER BY 10 | super_population_count) AS super_population_ratio 11 | from( 12 | SELECT 13 | super_population, 14 | super_population_description, 15 | COUNT(population) AS super_population_count, 16 | FROM 17 | [genomics-public-data:1000_genomes.sample_info] 18 | WHERE 19 | In_Phase1_Integrated_Variant_Set = TRUE 20 | GROUP BY 21 | super_population, 22 | super_population_description) -------------------------------------------------------------------------------- /1000genomes/sql/reproducing-vcfstats/sample-indel-counts-brca1.sql: -------------------------------------------------------------------------------- 1 | # Sample INDEL counts for BRCA1. 
2 | SELECT 3 | COUNT(sample_id) AS variant_count, 4 | sample_id, 5 | FROM ( 6 | SELECT 7 | call.call_set_name AS sample_id, 8 | NTH(1, 9 | call.genotype) WITHIN call AS first_allele, 10 | NTH(2, 11 | call.genotype) WITHIN call AS second_allele, 12 | FROM 13 | [genomics-public-data:1000_genomes.variants] 14 | WHERE 15 | reference_name = '17' 16 | AND start BETWEEN 41196311 17 | AND 41277499 18 | AND vt ='INDEL' 19 | HAVING 20 | 0 < first_allele 21 | OR 0 < second_allele) 22 | GROUP BY 23 | sample_id 24 | ORDER BY 25 | sample_id 26 | -------------------------------------------------------------------------------- /1000genomes/sql/understanding-alternate-alleles/unique-key.sql: -------------------------------------------------------------------------------- 1 | # This query demonstrates that an additional field, 'end', is needed to 2 | # comprise a unique key for the rows in the table. 3 | SELECT 4 | reference_name, 5 | start, 6 | reference_bases, 7 | alt, 8 | vt, 9 | end, 10 | COUNT(1) AS cnt 11 | FROM ( 12 | SELECT 13 | reference_name, 14 | start, 15 | reference_bases, 16 | GROUP_CONCAT(alternate_bases) WITHIN RECORD AS alt, 17 | vt, 18 | end, 19 | FROM 20 | [genomics-public-data:1000_genomes.variants]) 21 | GROUP EACH BY 22 | reference_name, 23 | start, 24 | reference_bases, 25 | alt, 26 | vt, 27 | end 28 | HAVING 29 | cnt > 1 -------------------------------------------------------------------------------- /1000genomes/sql/understanding-alternate-alleles/not-quite-unique-key.sql: -------------------------------------------------------------------------------- 1 | # This query demonstrates that some additional field is needed to 2 | # comprise a unique key for the rows in the table. 
3 | SELECT 4 | reference_name, 5 | start, 6 | reference_bases, 7 | alt, 8 | vt, 9 | COUNT(1) AS cnt 10 | FROM ( 11 | SELECT 12 | reference_name, 13 | start, 14 | reference_bases, 15 | GROUP_CONCAT(alternate_bases) WITHIN RECORD AS alt, 16 | vt, 17 | FROM 18 | [genomics-public-data:1000_genomes.variants]) 19 | GROUP EACH BY 20 | reference_name, 21 | start, 22 | reference_bases, 23 | alt, 24 | vt 25 | HAVING 26 | cnt > 1 27 | ORDER BY 28 | reference_name 29 | -------------------------------------------------------------------------------- /1000genomes/sql/ratio-of-dbsnp-variants-by-chromosome.sql: -------------------------------------------------------------------------------- 1 | #standardSQL 2 | -- 3 | -- Get the proportion of variants (per chromosome) in the dataset 4 | -- that have been reported in the dbSNP database (version 132). 5 | -- 6 | WITH 7 | counts AS ( 8 | SELECT 9 | reference_name, 10 | COUNT(1) AS num_variants, 11 | COUNTIF(ARRAY_LENGTH(names) > 0) AS num_dbsnp_variants 12 | FROM 13 | `genomics-public-data.1000_genomes.variants` 14 | GROUP BY 15 | reference_name ) 16 | -- 17 | -- Compute the ratio. 
18 | SELECT 19 | reference_name, 20 | num_dbsnp_variants, 21 | num_variants, 22 | num_dbsnp_variants / num_variants AS frequency 23 | FROM 24 | counts 25 | ORDER BY 26 | num_variants DESC 27 | -------------------------------------------------------------------------------- /1000genomes/sql/indel-length-counts.sql: -------------------------------------------------------------------------------- 1 | # Count the number of INDELs differing from the reference allele by particular lengths 2 | SELECT 3 | length_difference, 4 | COUNT(length_difference) AS count_of_indels_with_length_difference, 5 | FROM ( 6 | SELECT 7 | reference_name, 8 | start, 9 | reference_bases, 10 | LENGTH(reference_bases) AS ref_length, 11 | alternate_bases AS allele, 12 | LENGTH(alternate_bases) AS allele_length, 13 | (LENGTH(alternate_bases) - LENGTH(reference_bases)) AS length_difference, 14 | FROM 15 | [genomics-public-data:1000_genomes.variants] 16 | WHERE 17 | vt ='INDEL' 18 | ) 19 | GROUP BY 20 | length_difference 21 | ORDER BY 22 | length_difference 23 | -------------------------------------------------------------------------------- /1000genomes/sql/private-variant-counts.sql: -------------------------------------------------------------------------------- 1 | # Compute the number of variants for a particular sample that are shared by 2 | # no other samples. 
3 | SELECT 4 | COUNT(sample_id) AS private_variants_count, 5 | sample_id 6 | FROM 7 | ( 8 | SELECT 9 | reference_name, 10 | start, 11 | reference_bases, 12 | IF(0 < call.genotype, 13 | call.call_set_name, 14 | NULL) AS sample_id, 15 | SUM(IF(0 < call.genotype, 16 | 1, 17 | 0)) WITHIN RECORD AS num_samples_with_variant 18 | FROM 19 | [genomics-public-data:1000_genomes.variants] 20 | HAVING 21 | num_samples_with_variant = 1 22 | AND sample_id IS NOT NULL) 23 | GROUP EACH BY 24 | sample_id 25 | ORDER BY 26 | sample_id 27 | -------------------------------------------------------------------------------- /annotations/README.md: -------------------------------------------------------------------------------- 1 | Annotations 2 | ============ 3 | 4 | Tute Genomics has provided a table of annotations for hg19 SNPs. 5 | 6 | * For example queries, see the [Platinum Genomes Annotation JOINs](../platinumGenomes) 7 | data story for a few examples of how these tables can be used with variant data. 8 | * Please see [Google Genomics Public Data](http://googlegenomics.readthedocs.io/en/latest/use_cases/discover_public_data/tute_genomics_public_data.html) 9 | for more detail. 10 | 11 | A handful of other annotation databases have been loaded to BigQuery for 12 | prototyping purposes. See [provenance](./provenance) for details on the 13 | source of this data and how it may have been transformed prior to loading 14 | to BigQuery. 15 | -------------------------------------------------------------------------------- /pgp/sql/issues-with-the-variant-centric-approach/factor-v-leiden.sql: -------------------------------------------------------------------------------- 1 | # Sample level data for rs6025 and hereditary thrombophilia trait 2 | # for use in the Factor V Leiden data story. 
3 | SELECT 4 | sample_id, 5 | chromosome, 6 | locusBegin, 7 | locusEnd, 8 | reference, 9 | allele1Seq, 10 | allele2Seq, 11 | zygosity, 12 | has_Hereditary_thrombophilia_includes_Factor_V_Leiden_and_Prothrombin_G20210A AS has_Hereditary_thrombophilia 13 | FROM 14 | [google.com:biggene:pgp.cgi_variants] AS var 15 | LEFT OUTER JOIN 16 | [google.com:biggene:pgp.phenotypes] AS pheno 17 | ON 18 | pheno.Participant = var.sample_id 19 | WHERE 20 | chromosome = 'chr1' 21 | AND locusBegin <= 169519048 22 | AND locusEnd >= 169519049 23 | ORDER BY 24 | sample_id 25 | -------------------------------------------------------------------------------- /1000genomes/data-stories/README.md: -------------------------------------------------------------------------------- 1 | Data Stories 2 | ========================== 3 | 4 | The following sections demonstrate some interactive exploration within the 1,000 Genomes data set. 5 | 6 | * Getting Familiar with the Data 7 | * [Exploring the sample information data](./exploring-the-phenotypic-data) 8 | * [Exploring the variant data](./exploring-the-variant-data) 9 | * [Understanding Alternate Alleles in 1,000 Genomes](./understanding-alternate-alleles) 10 | * Comparisons to Common Tools and Research Results 11 | * [Reproducing the output of vcfstats](./reproducing-vcfstats) 12 | * [Reproducing Allelic Frequencies](./reproducing-allelic-frequencies) 13 | * [Reproducing the Hardy-Weinberg Equilibrium test](./reproducing-hardy-weinberg-equilibrium) 14 | 15 | -------------------------------------------------------------------------------- /1000genomes/sql/reproducing-vcfstats/shared-variant-counts-brca1.sql: -------------------------------------------------------------------------------- 1 | # Count the number of variants shared by none, shared by one sample, shared by 2 | # two samples, etc... 
in BRCA1 3 | SELECT 4 | num_samples_with_variant, 5 | COUNT(1) AS num_variants_shared_by_this_many_samples 6 | FROM ( 7 | SELECT 8 | reference_name, 9 | start, 10 | END, 11 | reference_bases, 12 | GROUP_CONCAT(alternate_bases) WITHIN RECORD AS alt, 13 | SUM(NOT EVERY(call.genotype <= 0)) WITHIN call AS num_samples_with_variant 14 | FROM 15 | [genomics-public-data:1000_genomes.variants] 16 | WHERE 17 | reference_name = '17' 18 | AND start BETWEEN 41196311 19 | AND 41277499 20 | ) 21 | GROUP BY 22 | num_samples_with_variant 23 | ORDER BY 24 | num_samples_with_variant -------------------------------------------------------------------------------- /1000genomes/sql/phenotype_sql/ethnicity-ratio.sql: -------------------------------------------------------------------------------- 1 | # Compute sample count and ratio by ethnicity 2 | SELECT 3 | population, 4 | population_description, 5 | population_count, 6 | RATIO_TO_REPORT(population_count) 7 | OVER 8 | ( 9 | ORDER BY 10 | population_count) AS population_ratio, 11 | super_population, 12 | super_population_description, 13 | from( 14 | SELECT 15 | population, 16 | population_description, 17 | super_population, 18 | super_population_description, 19 | COUNT(population) AS population_count, 20 | FROM 21 | [genomics-public-data:1000_genomes.sample_info] 22 | WHERE 23 | In_Phase1_Integrated_Variant_Set = TRUE 24 | GROUP BY 25 | population, 26 | population_description, 27 | super_population, 28 | super_population_description) 29 | -------------------------------------------------------------------------------- /1000genomes/sql/shared-variant-counts.sql: -------------------------------------------------------------------------------- 1 | #standardSQL 2 | -- 3 | -- Count the number of variants shared by none, shared by one sample, two samples, etc... 
4 | -- 5 | SELECT 6 | num_samples_with_variant, 7 | COUNT(1) AS num_variants_shared_by_this_many_samples 8 | FROM ( 9 | SELECT 10 | reference_name, 11 | start, 12 | `end`, 13 | reference_bases, 14 | alternate_bases[ORDINAL(1)] AS alt, -- 1000 Genomes is biallelic. 15 | (SELECT COUNTIF(EXISTS(SELECT gt 16 | FROM UNNEST(call.genotype) gt 17 | WHERE gt >= 1)) FROM v.call) AS num_samples_with_variant 18 | FROM 19 | `genomics-public-data.1000_genomes.variants` v 20 | WHERE 21 | reference_name NOT IN ("X", "Y", "MT")) 22 | GROUP BY 23 | num_samples_with_variant 24 | ORDER BY 25 | num_samples_with_variant 26 | -------------------------------------------------------------------------------- /1000genomes/sql/understanding-alternate-alleles/sample-likelihood.sql: -------------------------------------------------------------------------------- 1 | # Get data sufficient to make a judgment upon this particular sample's call (sample HG00100 at chromosome 17, start 48515942). 2 | SELECT 3 | reference_name, 4 | start, 5 | reference_bases AS ref, 6 | GROUP_CONCAT(alternate_bases) WITHIN RECORD AS alt, 7 | GROUP_CONCAT(filter) WITHIN RECORD AS filters, 8 | avgpost, 9 | rsq, 10 | vt, 11 | call.call_set_name AS sample_id, 12 | call.phaseset AS phaseset, 13 | NTH(1, call.genotype) WITHIN call AS first_allele, 14 | NTH(2, call.genotype) WITHIN call AS second_allele, 15 | call.ds AS ds, 16 | GROUP_CONCAT(STRING(call.genotype_likelihood)) WITHIN call AS likelihoods, 17 | FROM 18 | [genomics-public-data:1000_genomes.variants] 19 | WHERE 20 | reference_name = '17' 21 | AND start = 48515942 22 | HAVING 23 | sample_id = 'HG00100' 24 | ORDER BY 25 | alt 26 | -------------------------------------------------------------------------------- /pgp/data-stories/README.md: -------------------------------------------------------------------------------- 1 | Data Stories 2 | ========================== 3 | 4 | The following sections demonstrate some interactive exploration within the PGP dataset. 
5 | 6 | * Getting Familiar with the Data 7 | * [Comparing PGP to 1000 Genomes](./comparing-pgp-to-1000genomes) 8 | * [Issues with the Variant-Centric Approach](./issues-with-the-variant-centric-approach) 9 | * [A Comparison of Schemas and Data Encodings](./schema-comparisons) 10 | 11 | 12 | Have other data stories you would like to see here? Have any data stories you would like to *share*? Have *corrections to the biology* covered in this material? Have query *simplifications* or *speed improvements*? Let us know by [filing an issue](https://github.com/googlegenomics/bigquery-examples/issues) or [contacting us directly](mailto:google-genomics-contact@googlegroups.com). 13 | -------------------------------------------------------------------------------- /1000genomes/sql/reproducing-vcfstats/indel-length-counts-brca1.sql: -------------------------------------------------------------------------------- 1 | # Count the number of INDELs differing from the reference allele by particular 2 | # lengths for BRCA1. 
3 | SELECT 4 | length_difference, 5 | COUNT(length_difference) AS count_of_indels_with_length_difference, 6 | FROM ( 7 | SELECT 8 | reference_name, 9 | start, 10 | reference_bases, 11 | LENGTH(reference_bases) AS ref_length, 12 | alternate_bases AS allele, 13 | LENGTH(alternate_bases) AS allele_length, 14 | (LENGTH(alternate_bases) - LENGTH(reference_bases)) AS length_difference, 15 | FROM 16 | [genomics-public-data:1000_genomes.variants] 17 | WHERE 18 | reference_name = '17' 19 | AND start BETWEEN 41196311 20 | AND 41277499 21 | AND vt ='INDEL' 22 | ) 23 | GROUP BY 24 | length_difference 25 | ORDER BY 26 | length_difference -------------------------------------------------------------------------------- /1000genomes/sql/reproducing-vcfstats/private-variant-counts-brca1.sql: -------------------------------------------------------------------------------- 1 | # Compute the number of variants within BRCA1 for a particular sample that are shared by 2 | # no other samples. 3 | SELECT 4 | COUNT(sample_id) AS private_variants_count, 5 | sample_id 6 | FROM 7 | ( 8 | SELECT 9 | reference_name, 10 | start, 11 | reference_bases, 12 | IF(0 < call.genotype, 13 | call.call_set_name, 14 | NULL) AS sample_id, 15 | SUM(IF(0 < call.genotype, 16 | 1, 17 | 0)) WITHIN RECORD AS num_samples_with_variant 18 | FROM 19 | [genomics-public-data:1000_genomes.variants] 20 | WHERE 21 | reference_name = '17' 22 | AND start BETWEEN 41196311 23 | AND 41277499 24 | HAVING 25 | num_samples_with_variant = 1 26 | AND sample_id IS NOT NULL) 27 | GROUP EACH BY 28 | sample_id 29 | ORDER BY 30 | sample_id 31 | -------------------------------------------------------------------------------- /1000genomes/sql/sample-variant-counts-by-type-and-chromosome.sql: -------------------------------------------------------------------------------- 1 | # Count the number of variants for each sample across the entirety of the 1,000 2 | # Genomes dataset by variant type and chromosome. 
3 | SELECT 4 | reference_name, 5 | vt, 6 | sample_id, 7 | COUNT(sample_id) AS variant_count, 8 | FROM 9 | ( 10 | SELECT 11 | reference_name, 12 | vt, 13 | call.call_set_name AS sample_id, 14 | NTH(1, 15 | call.genotype) WITHIN call AS first_allele, 16 | NTH(2, 17 | call.genotype) WITHIN call AS second_allele, 18 | FROM 19 | [genomics-public-data:1000_genomes.variants] 20 | HAVING 21 | first_allele > 0 22 | OR (second_allele IS NOT NULL 23 | AND second_allele > 0)) 24 | GROUP BY 25 | sample_id, 26 | reference_name, 27 | vt 28 | ORDER BY 29 | reference_name, 30 | vt, 31 | variant_count, 32 | sample_id 33 | -------------------------------------------------------------------------------- /pgp/sql/issues-with-the-variant-centric-approach/factor-v-leiden-summary.sql: -------------------------------------------------------------------------------- 1 | # Summary data for rs6025 and hereditary thrombophilia trait 2 | # for use in the Factor V Leiden data story. 3 | SELECT 4 | COUNT(sample_id) AS sample_counts, 5 | chromosome, 6 | reference, 7 | allele1Seq, 8 | allele2Seq, 9 | has_Hereditary_thrombophilia_includes_Factor_V_Leiden_and_Prothrombin_G20210A AS has_Hereditary_thrombophilia 10 | FROM 11 | [google.com:biggene:pgp.cgi_variants] AS var 12 | LEFT OUTER JOIN 13 | [google.com:biggene:pgp.phenotypes] AS pheno 14 | ON 15 | pheno.Participant = var.sample_id 16 | WHERE 17 | chromosome = 'chr1' 18 | AND locusBegin <= 169519048 19 | AND locusEnd >= 169519049 20 | GROUP BY 21 | chromosome, 22 | reference, 23 | allele1Seq, 24 | allele2Seq, 25 | has_Hereditary_thrombophilia 26 | ORDER BY 27 | sample_counts DESC 28 | -------------------------------------------------------------------------------- /pgp/sql/gvcf_variants_expanded/klotho.sql: -------------------------------------------------------------------------------- 1 | # Sample level data for Klotho variant rs9536314 for use in the "amazing 2 | # intelligence of PGP participants" data story, specifically joining two 3 | # 
tables to compare the different encodings. 4 | SELECT 5 | contig_name, 6 | start_pos, 7 | end_pos, 8 | END, 9 | ref, 10 | alt, 11 | sample_id, 12 | genotype 13 | FROM 14 | FLATTEN( 15 | SELECT 16 | contig_name, 17 | start_pos, 18 | end_pos, 19 | END, 20 | reference_bases AS ref, 21 | GROUP_CONCAT(alternate_bases) WITHIN RECORD AS alt, 22 | call.callset_name AS sample_id, 23 | GROUP_CONCAT(STRING(call.genotype), 24 | '/') WITHIN call AS genotype, 25 | FROM 26 | [google.com:biggene:test.pgp_gvcf_variants_expanded2] 27 | WHERE 28 | contig_name = '13' 29 | AND start_pos == 33628138 30 | , call) 31 | ORDER BY 32 | sample_id 33 | -------------------------------------------------------------------------------- /1000genomes/sql/sample-level-data-for-brca1.sql: -------------------------------------------------------------------------------- 1 | #standardSQL 2 | -- 3 | -- Retrieve sample-level information for BRCA1 variants. 4 | -- 5 | SELECT 6 | reference_name, 7 | start, 8 | `end`, 9 | reference_bases, 10 | ARRAY_TO_STRING(v.alternate_bases, ',') AS alts, 11 | quality, 12 | ARRAY_TO_STRING(v.filter, ',') AS filters, 13 | vt, 14 | ARRAY_TO_STRING(v.names, ',') AS names, 15 | call.call_set_name, 16 | call.phaseset, 17 | (SELECT STRING_AGG(CAST(gt AS STRING)) from UNNEST(call.genotype) gt) AS genotype, 18 | call.ds, 19 | (SELECT STRING_AGG(CAST(lh AS STRING)) from UNNEST(call.genotype_likelihood) lh) AS likelihoods 20 | FROM 21 | `genomics-public-data.1000_genomes.variants` v, v.call call 22 | WHERE 23 | reference_name IN ('17', 'chr17') 24 | AND start BETWEEN 41196311 AND 41277499 # per GRCh37 25 | AND call_set_name = 'HG00100' 26 | ORDER BY 27 | start, 28 | alts 29 | -------------------------------------------------------------------------------- /pgp/sql/schema-comparisons/missingness-klotho.sql: -------------------------------------------------------------------------------- 1 | # Missingness rate for Klotho variant rs9536314 in the "amazing 2 | # intelligence of PGP 
participants" data story. 3 | SELECT 4 | COUNT(sample_id) AS num_samples_called_for_position, 5 | SUM(called_count) AS num_alleles_called_for_position, 6 | 1 - (SUM(called_count)/(172*2)) AS missingness_rate 7 | FROM ( 8 | SELECT 9 | contig_name, 10 | start_pos, 11 | end_pos, 12 | END, 13 | reference_bases, 14 | GROUP_CONCAT(alternate_bases) WITHIN RECORD AS alt, 15 | call.callset_name AS sample_id, 16 | GROUP_CONCAT(STRING(call.genotype), 17 | '/') WITHIN call AS genotype, 18 | SUM(call.genotype >= 0) WITHIN RECORD as called_count, 19 | FROM 20 | [google.com:biggene:pgp.gvcf_variants] 21 | WHERE 22 | contig_name = '13' 23 | AND start_pos <= 33628138 24 | AND (end_pos >= 33628139 # rows whose end_pos reaches the position 25 | OR END >= 33628139) # rows whose END reaches the position 26 | ) 27 | -------------------------------------------------------------------------------- /pgp/sql/gvcf_variants/klotho.sql: -------------------------------------------------------------------------------- 1 | # Sample level data for Klotho variant rs9536314 for use in the "amazing 2 | # intelligence of PGP participants" data story, specifically joining two 3 | # tables to compare the different encodings. 
4 | SELECT 5 | contig_name, 6 | start_pos, 7 | end_pos, 8 | END, 9 | ref, 10 | alt, 11 | sample_id, 12 | genotype 13 | FROM 14 | FLATTEN( 15 | SELECT 16 | contig_name, 17 | start_pos, 18 | end_pos, 19 | END, 20 | reference_bases AS ref, 21 | GROUP_CONCAT(alternate_bases) WITHIN RECORD AS alt, 22 | call.callset_name AS sample_id, 23 | GROUP_CONCAT(STRING(call.genotype), 24 | '/') WITHIN call AS genotype, 25 | FROM 26 | [google.com:biggene:pgp.gvcf_variants] 27 | WHERE 28 | contig_name = '13' 29 | AND start_pos <= 33628138 30 | AND (end_pos >= 33628139 31 | OR END >= 33628139) 32 | , 33 | call) 34 | ORDER BY 35 | sample_id 36 | -------------------------------------------------------------------------------- /1000genomes/sql/reproducing-vcfstats/variant-sample-counts-brca1.sql: -------------------------------------------------------------------------------- 1 | # Count the number of samples that have the BRCA1 variant. 2 | SELECT 3 | reference_name, 4 | start, 5 | reference_bases, 6 | SUM(first_allele > 0 7 | OR second_allele > 0) AS num_samples_with_variant 8 | FROM( 9 | SELECT 10 | reference_name, 11 | start, 12 | reference_bases, 13 | GROUP_CONCAT(alternate_bases) WITHIN RECORD AS alt, 14 | NTH(1, 15 | call.genotype) WITHIN call AS first_allele, 16 | NTH(2, 17 | call.genotype) WITHIN call AS second_allele, 18 | FROM 19 | [genomics-public-data:1000_genomes.variants] 20 | WHERE 21 | reference_name = '17' 22 | AND start BETWEEN 41196311 23 | AND 41277499 24 | AND vt ='SNP' 25 | ) 26 | GROUP BY 27 | reference_name, 28 | start, 29 | reference_bases, 30 | alt 31 | ORDER BY 32 | num_samples_with_variant, 33 | start 34 | -------------------------------------------------------------------------------- /pgp/sql/comparing-pgp-to-1000genomes/variant-counts-by-chromosome.sql: -------------------------------------------------------------------------------- 1 | # Count the number of variants per chromosome. 
2 | SELECT 3 | reference_name, 4 | cnt, 5 | dataset 6 | FROM ( 7 | SELECT 8 | reference_name, 9 | COUNT(reference_name) AS cnt, 10 | '1000Genomes' AS dataset 11 | FROM 12 | [genomics-public-data:1000_genomes.variants] 13 | GROUP BY 14 | reference_name 15 | ), 16 | ( 17 | SELECT 18 | # Normalize the reference_name to match that found in 1,000 Genomes. 19 | IF(reference_name = 'chrM', 'MT', SUBSTR(reference_name, 4)) AS reference_name, 20 | COUNT(reference_name) AS cnt, 21 | 'PGP' AS dataset 22 | FROM 23 | [google.com:biggene:pgp_20150205.genome_calls] 24 | # The source data was Complete Genomics which includes non-variant segments. 25 | OMIT RECORD IF EVERY(alternate_bases IS NULL) 26 | GROUP BY 27 | reference_name) 28 | ORDER BY 29 | reference_name, 30 | dataset 31 | -------------------------------------------------------------------------------- /1000genomes/sql/understanding-alternate-alleles/count-by-var-type-chrom-pos-ref-dups.sql: -------------------------------------------------------------------------------- 1 | # Count by variant type the number of alternate variants on chromosome 17 for the same 2 | # start and reference base 3 | SELECT 4 | vt, 5 | COUNT(vt) AS num_variant_type 6 | FROM 7 | [genomics-public-data:1000_genomes.variants] AS variants 8 | JOIN ( 9 | SELECT 10 | reference_name, 11 | start, 12 | reference_bases, 13 | COUNT(start) AS num_alternates, 14 | FROM 15 | [genomics-public-data:1000_genomes.variants] 16 | WHERE 17 | reference_name = '17' 18 | GROUP EACH BY 19 | reference_name, 20 | start, 21 | reference_bases 22 | HAVING 23 | num_alternates > 1) AS dups 24 | ON 25 | variants.reference_name = dups.reference_name 26 | AND variants.start = dups.start 27 | AND variants.reference_bases = dups.reference_bases 28 | WHERE 29 | variants.reference_name = '17' 30 | GROUP EACH BY 31 | vt 32 | ORDER BY 33 | vt 34 | -------------------------------------------------------------------------------- 
/pgp/sql/comparing-pgp-to-1000genomes/parsed-genotype-counts.sql: -------------------------------------------------------------------------------- 1 | # Count the number of sample genotypes, parsed into components. 2 | SELECT 3 | first_allele, 4 | second_allele, 5 | dataset, 6 | # Convert integer to float to avoid numeric overflow in R for integers. 7 | FLOAT(COUNT(1)) AS cnt 8 | FROM ( 9 | SELECT 10 | NTH(1, call.genotype) WITHIN call AS first_allele, 11 | NTH(2, call.genotype) WITHIN call AS second_allele, 12 | '1000Genomes' AS dataset 13 | FROM 14 | [genomics-public-data:1000_genomes.variants] 15 | OMIT RECORD IF reference_name IN ('X', 'Y', 'MT')), 16 | ( 17 | SELECT 18 | NTH(1, call.genotype) WITHIN call AS first_allele, 19 | NTH(2, call.genotype) WITHIN call AS second_allele, 20 | 'PGP' AS dataset 21 | FROM 22 | [google.com:biggene:pgp_20150205.genome_calls] 23 | OMIT RECORD IF reference_name IN ('chrX', 'chrY', 'chrM')) 24 | GROUP BY 25 | first_allele, 26 | second_allele, 27 | dataset 28 | -------------------------------------------------------------------------------- /1000genomes/sql/understanding-alternate-alleles/count-by-var-type-chrom-pos-ref-singles.sql: -------------------------------------------------------------------------------- 1 | # Count by variant type the number of variants on chromosome 17 unique for a 2 | # start and reference base 3 | SELECT 4 | vt, 5 | COUNT(vt) AS num_variant_type 6 | FROM 7 | [genomics-public-data:1000_genomes.variants] AS variants 8 | JOIN EACH ( 9 | SELECT 10 | reference_name, 11 | start, 12 | reference_bases, 13 | COUNT(start) AS num_alternates 14 | FROM 15 | [genomics-public-data:1000_genomes.variants] 16 | WHERE 17 | reference_name = '17' 18 | GROUP EACH BY 19 | reference_name, 20 | start, 21 | reference_bases 22 | HAVING 23 | num_alternates = 1) AS singles 24 | ON 25 | variants.reference_name = singles.reference_name 26 | AND variants.start = singles.start 27 | AND variants.reference_bases = 
singles.reference_bases 28 | WHERE 29 | variants.reference_name = '17' 30 | GROUP EACH BY 31 | vt 32 | ORDER BY 33 | vt 34 | -------------------------------------------------------------------------------- /1000genomes/sql/ti-tv-ratio.sql: -------------------------------------------------------------------------------- 1 | # Compute the Ti/Tv ratio of the 1,000 Genomes dataset. 2 | SELECT 3 | transitions, 4 | transversions, 5 | transitions/transversions AS titv 6 | FROM ( 7 | SELECT 8 | SUM(IF(mutation IN ('A->G', 9 | 'G->A', 10 | 'C->T', 11 | 'T->C'), 12 | INTEGER(num_snps), 13 | INTEGER(0))) AS transitions, 14 | SUM(IF(mutation IN ('A->C', 15 | 'C->A', 16 | 'G->T', 17 | 'T->G', 18 | 'A->T', 19 | 'T->A', 20 | 'C->G', 21 | 'G->C'), 22 | INTEGER(num_snps), 23 | INTEGER(0))) AS transversions, 24 | FROM ( 25 | SELECT 26 | CONCAT(reference_bases, 27 | CONCAT(STRING('->'), 28 | alternate_bases)) AS mutation, 29 | COUNT(alternate_bases) AS num_snps, 30 | FROM 31 | [genomics-public-data:1000_genomes.variants] 32 | WHERE 33 | vt = 'SNP' 34 | GROUP BY 35 | mutation 36 | ORDER BY 37 | mutation)) 38 | -------------------------------------------------------------------------------- /pgp/sql/comparing-pgp-to-1000genomes/taking-a-closer-look-at-variant-types.sql: -------------------------------------------------------------------------------- 1 | # Inner SELECT filters just the records in which we are interested. 2 | # Outer SELECT performs our analysis, in this case just a count of the genotypes 3 | # at a particular position in chromosome 3. 
4 | SELECT 5 | reference_name, 6 | start, 7 | reference_bases, 8 | alternate_bases, 9 | genotype, 10 | COUNT(genotype) AS number_of_individuals, 11 | FROM ( 12 | SELECT 13 | reference_name, 14 | start, 15 | reference_bases, 16 | GROUP_CONCAT(alternate_bases) WITHIN RECORD AS alternate_bases, 17 | call.callset_name, 18 | GROUP_CONCAT(STRING(call.genotype)) WITHIN call AS genotype, 19 | FROM 20 | [google.com:biggene:pgp_20150205.genome_calls] 21 | WHERE 22 | reference_name = 'chr3' 23 | AND start = 65440409) 24 | GROUP BY 25 | reference_name, 26 | start, 27 | reference_bases, 28 | alternate_bases, 29 | genotype 30 | ORDER BY 31 | alternate_bases, 32 | number_of_individuals DESC 33 | -------------------------------------------------------------------------------- /1000genomes/sql/heterozygous-homozygous-ratio.sql: -------------------------------------------------------------------------------- 1 | # Count the homozygous and heterozygous variants for each sample across the 2 | # entirety of the 1,000 Genomes dataset. 
3 | SELECT 4 | sample_id, 5 | SUM(IF(0 = first_allele 6 | AND 0 = second_allele, 7 | 1, 8 | 0)) AS hom_RR_count, # both alleles are the reference 9 | SUM(IF(first_allele = second_allele 10 | AND first_allele > 0, 11 | 1, 12 | 0)) AS hom_AA_count, # both alleles are the same alternate 13 | SUM(IF((first_allele != second_allele 14 | OR second_allele IS NULL) 15 | AND (first_allele > 0 16 | OR second_allele > 0), 17 | 1, 18 | 0)) AS het_RA_count # alleles differ (or only one is present) and at least one is an alternate 19 | FROM ( 20 | SELECT 21 | reference_name, 22 | call.call_set_name AS sample_id, 23 | NTH(1, 24 | call.genotype) WITHIN call AS first_allele, 25 | NTH(2, 26 | call.genotype) WITHIN call AS second_allele, 27 | FROM 28 | [genomics-public-data:1000_genomes.variants] 29 | WHERE 30 | reference_name != 'Y' AND reference_name != 'MT' # 1,000 Genomes names the mitochondrial contig 'MT'; filtering on 'M' excluded nothing 31 | ) 32 | GROUP BY 33 | sample_id 34 | ORDER BY 35 | sample_id 36 | -------------------------------------------------------------------------------- /1000genomes/sql/minimum-allelic-frequency-by-ethnicity.sql: -------------------------------------------------------------------------------- 1 | # Count the variation for each sample including phenotypic traits 2 | SELECT 3 | samples.call.call_set_name AS sample_id, 4 | gender, 5 | population, 6 | super_population, 7 | COUNT(samples.call.call_set_name) AS num_variants_for_sample, 8 | SUM(samples.af >= 0.05) AS common_variant, 9 | SUM(samples.af < 0.05 AND samples.af > 0.005) AS middle_variant, 10 | SUM(samples.af <= 0.005 AND samples.af > 0.001) AS rare_variant, 11 | SUM(samples.af <= 0.001) AS very_rare_variant, 12 | FROM 13 | FLATTEN(( 14 | SELECT 15 | af, 16 | vt, 17 | call.call_set_name, 18 | FROM 19 | [genomics-public-data:1000_genomes.variants] 20 | WHERE 21 | vt = 'SNP' 22 | OMIT call IF EVERY(call.genotype <= 0)), 23 | call) AS samples 24 | JOIN 25 | [genomics-public-data:1000_genomes.sample_info] p 26 | ON 27 | samples.call.call_set_name = p.sample 28 | GROUP BY 29 | sample_id, 30 | gender, 31 | population, 32 | super_population 33 | ORDER BY 34 | sample_id 35 | 
-------------------------------------------------------------------------------- /1000genomes/sql/variant-hotspots.sql: -------------------------------------------------------------------------------- 1 | # Summarize the variant counts by 10,000 start-wide windows in order to identify 2 | # variant hotspots within a chromosome for all samples. 3 | SELECT 4 | reference_name, 5 | window, 6 | window * 10000 AS window_start, 7 | ((window * 10000) + 9999) AS window_end, 8 | MIN(start) AS min_variant_start, 9 | MAX(start) AS max_variant_start, 10 | COUNT(sample_id) AS num_variants_in_window, 11 | FROM ( 12 | SELECT 13 | reference_name, 14 | start, 15 | INTEGER(FLOOR(start / 10000)) AS window, 16 | call.call_set_name AS sample_id, 17 | NTH(1, 18 | call.genotype) WITHIN call AS first_allele, 19 | NTH(2, 20 | call.genotype) WITHIN call AS second_allele, 21 | FROM 22 | [genomics-public-data:1000_genomes.variants] 23 | HAVING 24 | first_allele > 0 25 | OR (second_allele IS NOT NULL 26 | AND second_allele > 0)) 27 | GROUP BY 28 | reference_name, 29 | window, 30 | window_start, 31 | window_end, 32 | ORDER BY 33 | num_variants_in_window DESC, 34 | window 35 | -------------------------------------------------------------------------------- /1000genomes/sql/reproducing-vcfstats/ti-tv-ratio-brca1.sql: -------------------------------------------------------------------------------- 1 | # Compute the Ti/Tv ratio for BRCA1. 
2 | SELECT 3 | transitions, 4 | transversions, 5 | transitions/transversions AS titv 6 | FROM ( 7 | SELECT 8 | SUM(IF(mutation IN ('A->G', 9 | 'G->A', 10 | 'C->T', 11 | 'T->C'), 12 | INTEGER(num_snps), 13 | INTEGER(0))) AS transitions, 14 | SUM(IF(mutation IN ('A->C', 15 | 'C->A', 16 | 'G->T', 17 | 'T->G', 18 | 'A->T', 19 | 'T->A', 20 | 'C->G', 21 | 'G->C'), 22 | INTEGER(num_snps), 23 | INTEGER(0))) AS transversions, 24 | FROM ( 25 | SELECT 26 | CONCAT(reference_bases, 27 | CONCAT(STRING('->'), 28 | alternate_bases)) AS mutation, 29 | COUNT(alternate_bases) AS num_snps, 30 | FROM 31 | [genomics-public-data:1000_genomes.variants] 32 | WHERE 33 | reference_name = '17' 34 | AND start BETWEEN 41196311 35 | AND 41277499 36 | AND vt = 'SNP' 37 | GROUP BY 38 | mutation 39 | ORDER BY 40 | mutation)) -------------------------------------------------------------------------------- /platinumGenomes/sql/sample-snps-by-exonic-function.sql: -------------------------------------------------------------------------------- 1 | #standardSQL 2 | -- 3 | -- Count SNPs by functional impact for each sample in Platinum Genomes. 4 | -- 5 | WITH 6 | sample_variants AS ( 7 | SELECT 8 | REGEXP_EXTRACT(reference_name, r'chr(.+)') AS chr, 9 | start AS start, 10 | reference_bases, 11 | alt, 12 | call.call_set_name 13 | FROM 14 | `genomics-public-data.platinum_genomes.variants` v, 15 | v.call call, 16 | v.alternate_bases alt WITH OFFSET alt_offset 17 | WHERE 18 | -- Require that at least one genotype matches this alternate. 
19 | EXISTS (SELECT gt FROM UNNEST(call.genotype) gt WHERE gt = alt_offset+1) 20 | ) 21 | -- 22 | -- 23 | SELECT 24 | call_set_name, 25 | ExonicFunc, 26 | COUNT(ExonicFunc) AS variant_count 27 | FROM 28 | `silver-wall-555.TuteTable.hg19` AS annots 29 | JOIN sample_variants AS vars 30 | ON 31 | vars.chr = annots.Chr 32 | AND vars.start = annots.Start 33 | AND vars.reference_bases = annots.Ref 34 | AND vars.alt = annots.Alt 35 | WHERE 36 | ExonicFunc IS NOT NULL 37 | GROUP BY 38 | call_set_name, 39 | ExonicFunc 40 | ORDER BY 41 | call_set_name, 42 | ExonicFunc 43 | -------------------------------------------------------------------------------- /1000genomes/sql/sample-variant-hotspots.sql: -------------------------------------------------------------------------------- 1 | # Summarize the variant counts for a particular sample by 10,000 start-wide windows 2 | # in order to identify variant hotspots within a chromosome for a particular sample. 3 | SELECT 4 | reference_name, 5 | window, 6 | window * 10000 AS window_start, 7 | ((window * 10000) + 9999) AS window_end, 8 | MIN(start) AS min_variant_start, 9 | MAX(start) AS max_variant_start, 10 | sample_id, 11 | COUNT(sample_id) AS num_variants_in_window, 12 | FROM ( 13 | SELECT 14 | reference_name, 15 | start, 16 | INTEGER(FLOOR(start / 10000)) AS window, 17 | call.call_set_name AS sample_id, 18 | NTH(1, 19 | call.genotype) WITHIN call AS first_allele, 20 | NTH(2, 21 | call.genotype) WITHIN call AS second_allele, 22 | FROM 23 | [genomics-public-data:1000_genomes.variants] 24 | WHERE 25 | call.call_set_name = 'HG00096' 26 | HAVING 27 | first_allele > 0 28 | OR (second_allele IS NOT NULL 29 | AND second_allele > 0)) 30 | GROUP BY 31 | reference_name, 32 | window, 33 | window_start, 34 | window_end, 35 | sample_id 36 | ORDER BY 37 | num_variants_in_window DESC, 38 | window 39 | -------------------------------------------------------------------------------- 
/1000genomes/sql/minimum-allelic-frequency-by-ethnicity-no-sex-chromosomes.sql: -------------------------------------------------------------------------------- 1 | # Count the variation for each sample including phenotypic traits but excluding 2 | # sex chromosomes. 3 | SELECT 4 | samples.call.call_set_name AS sample_id, 5 | gender, 6 | population, 7 | super_population, 8 | COUNT(samples.call.call_set_name) AS num_variants_for_sample, 9 | SUM(samples.af >= 0.05) AS common_variant, 10 | SUM(samples.af < 0.05 AND samples.af > 0.005) AS middle_variant, 11 | SUM(samples.af <= 0.005 AND samples.af > 0.001) AS rare_variant, 12 | SUM(samples.af <= 0.001) AS very_rare_variant, 13 | FROM 14 | FLATTEN(( 15 | SELECT 16 | af, 17 | vt, 18 | call.call_set_name, 19 | FROM 20 | [genomics-public-data:1000_genomes.variants] 21 | WHERE 22 | vt = 'SNP' 23 | AND reference_name != 'X' 24 | AND reference_name != 'Y' 25 | OMIT call IF EVERY(call.genotype <= 0)), 26 | call) AS samples 27 | JOIN 28 | [genomics-public-data:1000_genomes.sample_info] p 29 | ON 30 | samples.call.call_set_name = p.sample 31 | GROUP BY 32 | sample_id, 33 | gender, 34 | population, 35 | super_population 36 | ORDER BY 37 | sample_id 38 | -------------------------------------------------------------------------------- /pgp/sql/gvcf_variants/allelic-frequency-comparison.sql: -------------------------------------------------------------------------------- 1 | # PGP vs. 1,000 Genomes allelic frequency comparison for BRCA1 variants. 
2 | SELECT 3 | contig_name, 4 | pgp.reference_bases AS reference_bases, 5 | start_pos, 6 | allele, 7 | pgp_freq, 8 | af, 9 | eur_af, 10 | afr_af, 11 | asn_af, 12 | amr_af 13 | FROM ( 14 | FLATTEN(( 15 | SELECT 16 | reference_name, 17 | start, 18 | reference_bases, 19 | alternate_bases, 20 | AF, 21 | AFR_AF, 22 | AMR_AF, 23 | ASN_AF, 24 | EUR_AF 25 | FROM 26 | [genomics-public-data:1000_genomes.variants]), 27 | alternate_bases)) AS kg 28 | JOIN 29 | EACH ( 30 | SELECT 31 | contig_name, 32 | reference_bases, 33 | start_pos, 34 | allele, 35 | freq AS pgp_freq 36 | FROM 37 | [google.com:biggene:pgp_analysis_results.gvcf_variants_allelic_frequency] 38 | ) AS pgp 39 | ON 40 | pgp.contig_name = kg.reference_name 41 | AND pgp.start_pos = kg.start 42 | AND pgp.reference_bases = kg.reference_bases 43 | AND pgp.allele = kg.alternate_bases 44 | WHERE 45 | kg.reference_name = '17' 46 | AND kg.start BETWEEN 41196312 47 | AND 41277500 48 | -------------------------------------------------------------------------------- /pgp/sql/cgi_variants/allelic-frequency-comparison.sql: -------------------------------------------------------------------------------- 1 | # PGP vs. 1,000 Genomes allelic frequency comparison for BRCA1 variants. 
2 | SELECT 3 | chromosome, 4 | reference, 5 | locusBegin, 6 | locusEnd, 7 | allele, 8 | pgp_freq, 9 | af, 10 | eur_af, 11 | afr_af, 12 | asn_af, 13 | amr_af 14 | FROM ( 15 | FLATTEN(( 16 | SELECT 17 | reference_name, 18 | start, 19 | reference_bases, 20 | alternate_bases, 21 | AF, 22 | AFR_AF, 23 | AMR_AF, 24 | ASN_AF, 25 | EUR_AF 26 | FROM 27 | [genomics-public-data:1000_genomes.variants]), 28 | alternate_bases)) AS kg 29 | JOIN 30 | EACH ( 31 | SELECT 32 | chromosome, 33 | REGEXP_EXTRACT(chromosome, 34 | r'chr(\d+)') AS contig, 35 | reference, 36 | locusBegin + 1 AS position, 37 | locusBegin, 38 | locusEnd, 39 | allele, 40 | freq AS pgp_freq 41 | FROM 42 | [google.com:biggene:pgp_analysis_results.cgi_variants_allelic_frequency] 43 | ) AS pgp 44 | ON 45 | pgp.contig = kg.reference_name 46 | AND pgp.position = kg.start 47 | AND pgp.reference = kg.reference_bases 48 | AND pgp.allele = kg.alternate_bases 49 | WHERE 50 | kg.reference_name = '17' 51 | AND kg.start BETWEEN 41196312 52 | AND 41277500 53 | -------------------------------------------------------------------------------- /pgp/sql/comparing-pgp-to-1000genomes/variant-counts-by-type-and-chromosome.sql: -------------------------------------------------------------------------------- 1 | # Count the number of variants by variant type and chromosome. 2 | SELECT 3 | reference_name, 4 | vt, 5 | cnt, 6 | dataset 7 | FROM ( 8 | SELECT 9 | # Normalize the reference_name to match that found in 1,000 Genomes. 10 | IF(reference_name = 'chrM', 'MT', SUBSTR(reference_name, 4)) AS reference_name, 11 | IF(ref_len = 1 AND alt_len = 1, "SNP", "INDEL") AS vt, 12 | COUNT(reference_name) AS cnt, 13 | 'PGP' AS dataset 14 | FROM ( 15 | SELECT 16 | reference_name, 17 | svtype, 18 | LENGTH(reference_bases) AS ref_len, 19 | MAX(LENGTH(alternate_bases)) WITHIN RECORD AS alt_len, 20 | FROM 21 | [google.com:biggene:pgp_20150205.genome_calls] 22 | # The source data was Complete Genomics which includes non-variant segments. 
23 | OMIT RECORD IF EVERY(alternate_bases IS NULL) 24 | ) 25 | GROUP BY 26 | reference_name, 27 | vt 28 | ), 29 | ( 30 | SELECT 31 | reference_name, 32 | IF(vt IS NULL, "not specified", vt) AS vt, 33 | COUNT(reference_name) AS cnt, 34 | '1000Genomes' AS dataset 35 | FROM 36 | [genomics-public-data:1000_genomes.variants] 37 | GROUP BY 38 | reference_name, 39 | vt 40 | ), 41 | ORDER BY 42 | reference_name, 43 | dataset, 44 | vt 45 | -------------------------------------------------------------------------------- /platinumGenomes/sql/cohort-rare-pathenogenic-snps.sql: -------------------------------------------------------------------------------- 1 | #standardSQL 2 | -- 3 | -- Return all SNPs from the Platinum Genomes cohort that are: 4 | -- annotated as 'pathogenic' in ClinVar 5 | -- with observed population frequency less than 1% 6 | -- 7 | WITH 8 | cohort_variants AS ( 9 | SELECT 10 | REGEXP_EXTRACT(reference_name, r'chr(.+)') AS chr, 11 | start AS start, 12 | reference_bases, 13 | alt 14 | FROM 15 | `genomics-public-data.platinum_genomes.variants` v, 16 | v.alternate_bases alt WITH OFFSET alt_offset 17 | WHERE 18 | -- Require that at least one sample in the cohort has this variant. 
19 | EXISTS(SELECT gt FROM UNNEST(v.call) call, UNNEST(call.genotype) gt WHERE gt = alt_offset+1) 20 | ) 21 | -- 22 | -- 23 | SELECT 24 | annots.Chr, 25 | annots.Start, 26 | Ref, 27 | annots.Alt, 28 | Func, 29 | Gene, 30 | PopFreqMax, 31 | ExonicFunc, 32 | ClinVar_SIG, 33 | ClinVar_DIS 34 | FROM 35 | `silver-wall-555.TuteTable.hg19` AS annots 36 | JOIN 37 | cohort_variants AS vars 38 | ON 39 | vars.chr = annots.Chr 40 | AND vars.start = annots.Start 41 | AND vars.reference_bases = annots.Ref 42 | AND vars.alt = annots.Alt 43 | WHERE 44 | PopFreqMax <= 0.01 -- keep variants whose population frequency is at most 1% (inclusive) 45 | AND ClinVar_SIG LIKE '%pathogenic%' 46 | AND NOT ClinVar_SIG LIKE '%non-pathogenic%' -- '%pathogenic%' also matches 'non-pathogenic', so drop those explicitly 47 | ORDER BY 48 | Chr, 49 | Start, 50 | Ref, 51 | Alt 52 | -------------------------------------------------------------------------------- /1000genomes/sql/understanding-alternate-alleles/sample-chrom-pos-ref-dups.sql: -------------------------------------------------------------------------------- 1 | # Get sample alleles for some specific variants. 2 | # TODO(deflaux): update this to a user-defined function to generalize 3 | # across more than two alternates.
For more info, see 4 | # https://www.youtube.com/watch?v=GrD7ymUPt3M#t=1377 5 | SELECT 6 | reference_name, 7 | start, 8 | alt, 9 | reference_bases, 10 | sample_id, 11 | CASE 12 | WHEN 0 = first_allele THEN reference_bases 13 | WHEN 1 = first_allele THEN alt1 14 | WHEN 2 = first_allele THEN alt2 END AS first_allele, 15 | CASE 16 | WHEN 0 = second_allele THEN reference_bases 17 | WHEN 1 = second_allele THEN alt1 18 | WHEN 2 = second_allele THEN alt2 END AS second_allele, 19 | FROM( 20 | SELECT 21 | reference_name, 22 | start, 23 | GROUP_CONCAT(alternate_bases) WITHIN RECORD AS alt, 24 | reference_bases, 25 | call.call_set_name AS sample_id, 26 | NTH(1, 27 | alternate_bases) WITHIN RECORD AS alt1, 28 | NTH(2, 29 | alternate_bases) WITHIN RECORD AS alt2, 30 | NTH(1, call.genotype) WITHIN call AS first_allele, 31 | NTH(2, call.genotype) WITHIN call AS second_allele, 32 | FROM 33 | [genomics-public-data:1000_genomes.variants] 34 | WHERE 35 | reference_name = '17' 36 | AND start = 48515942 37 | HAVING 38 | sample_id = 'HG00100' OR sample_id = 'HG00101') 39 | ORDER BY 40 | alt, 41 | sample_id 42 | -------------------------------------------------------------------------------- /platinumGenomes/sql/sample-rare-pathenogenic-snps.sql: -------------------------------------------------------------------------------- 1 | #standardSQL 2 | -- 3 | -- Return SNPs for sample NA12878 that are: 4 | -- annotated as 'pathogenic' in ClinVar 5 | -- with observed population frequency less than 1% 6 | -- 7 | WITH 8 | sample_variants AS ( 9 | SELECT 10 | REGEXP_EXTRACT(reference_name, r'chr(.+)') AS chr, 11 | start AS start, 12 | reference_bases, 13 | alt, 14 | call.call_set_name 15 | FROM 16 | `genomics-public-data.platinum_genomes.variants` v, 17 | v.call call, 18 | v.alternate_bases alt WITH OFFSET alt_offset 19 | WHERE 20 | call_set_name = 'NA12878' 21 | -- Require that at least one genotype matches this alternate. 
22 | AND EXISTS (SELECT gt FROM UNNEST(call.genotype) gt WHERE gt = alt_offset+1) ) 23 | -- 24 | -- 25 | SELECT 26 | call_set_name, 27 | annots.Chr, 28 | annots.Start, 29 | Ref, 30 | annots.Alt, 31 | Func, 32 | Gene, 33 | PopFreqMax, 34 | ExonicFunc, 35 | ClinVar_SIG, 36 | ClinVar_DIS 37 | FROM 38 | `silver-wall-555.TuteTable.hg19` AS annots 39 | JOIN 40 | sample_variants AS vars 41 | ON 42 | vars.chr = annots.Chr 43 | AND vars.start = annots.Start 44 | AND vars.reference_bases = annots.Ref 45 | AND vars.alt = annots.Alt 46 | WHERE 47 | PopFreqMax <= 0.01 -- keep variants whose population frequency is at most 1% (inclusive) 48 | AND ClinVar_SIG LIKE '%pathogenic%' 49 | AND NOT ClinVar_SIG LIKE '%non-pathogenic%' -- '%pathogenic%' also matches 'non-pathogenic', so drop those explicitly 50 | ORDER BY 51 | Chr, 52 | Start, 53 | Ref, 54 | Alt, 55 | call_set_name 56 | -------------------------------------------------------------------------------- /pgp/sql/schema-comparisons/call-counts.sql: -------------------------------------------------------------------------------- 1 | # Call counts for the PGP data encoded four different ways.
2 | SELECT 3 | chromosome, 4 | num_records, 5 | num_variants, 6 | dataset 7 | FROM # In BigQuery legacy SQL the comma-separated subqueries below are combined as a UNION ALL, one per encoding. 8 | ( 9 | SELECT 10 | SUBSTR(chromosome, 11 | 4) AS chromosome, 12 | COUNT(1) AS num_records, 13 | SUM(reference != '=') AS num_variants, # rows whose reference is not '=' are tallied as variants 14 | 'cgi_variants' AS dataset 15 | FROM 16 | [google.com:biggene:pgp.cgi_variants] 17 | # Skip the genomes we were unable to convert to VCF/gVCF 18 | OMIT RECORD IF 19 | sample_id = 'huEDF7DA' OR sample_id = 'hu34D5B9' 20 | GROUP BY 21 | chromosome), 22 | ( 23 | SELECT 24 | contig_name AS chromosome, 25 | COUNT(1) AS num_records, 26 | SUM(reference_bases != 'N') AS num_variants, # rows whose reference_bases is not 'N' are tallied as variants 27 | 'variants' AS dataset 28 | FROM 29 | [google.com:biggene:pgp.variants] 30 | GROUP BY 31 | chromosome), 32 | ( 33 | SELECT 34 | contig_name AS chromosome, 35 | COUNT(1) AS num_records, 36 | SUM(reference_bases != 'N') AS num_variants, 37 | 'gvcf_variants' AS dataset 38 | FROM 39 | [google.com:biggene:pgp.gvcf_variants] 40 | GROUP BY 41 | chromosome), 42 | ( 43 | SELECT 44 | contig_name AS chromosome, 45 | COUNT(1) AS num_records, 46 | SUM(reference_bases != 'N') AS num_variants, 47 | 'gvcf_variants_expanded' AS dataset 48 | FROM 49 | [google.com:biggene:test.pgp_gvcf_variants_expanded2] 50 | GROUP BY 51 | chromosome) 52 | ORDER BY 53 | chromosome, 54 | dataset 55 | -------------------------------------------------------------------------------- /pgp/sql/cgi_variants/allele-count.sql: -------------------------------------------------------------------------------- 1 | # Count the occurrence of each variant allele across all participants in the 2 | # dataset. This returns a large result so be sure to materialize it into a 3 | # table for subsequent use.
4 | SELECT 5 | chromosome, 6 | reference, 7 | # This 'bin' can be used in subsequent interval JOINs. 8 | INTEGER(FLOOR(locusBegin / 5000)) AS bin, 9 | locusBegin, 10 | locusEnd, 11 | allele, 12 | SUM(cnt) AS alternate_allele_count, 13 | FROM ( # first subquery: per-locus counts for allele1Seq 14 | SELECT 15 | chromosome, 16 | reference, 17 | locusBegin, 18 | locusEnd, 19 | allele1Seq AS allele, 20 | COUNT(1) AS cnt 21 | FROM 22 | [google.com:biggene:pgp.cgi_variants] 23 | WHERE 24 | (reference != '=' OR reference IS NULL) 25 | AND allele1Seq != '?' 26 | AND (reference != allele1Seq OR reference IS NULL) 27 | GROUP EACH BY 28 | chromosome, 29 | reference, 30 | locusBegin, 31 | locusEnd, 32 | allele), 33 | ( # second subquery: the same counts for allele2Seq; the legacy-SQL comma combines both as a UNION ALL 34 | SELECT 35 | chromosome, 36 | reference, 37 | locusBegin, 38 | locusEnd, 39 | allele2Seq AS allele, 40 | COUNT(1) AS cnt 41 | FROM 42 | [google.com:biggene:pgp.cgi_variants] 43 | WHERE 44 | (reference != '=' OR reference IS NULL) 45 | AND allele2Seq != '?' 46 | AND (reference != allele2Seq OR reference IS NULL) 47 | GROUP EACH BY 48 | chromosome, 49 | reference, 50 | locusBegin, 51 | locusEnd, 52 | allele) 53 | GROUP EACH BY 54 | chromosome, 55 | reference, 56 | bin, 57 | locusBegin, 58 | locusEnd, 59 | allele 60 | -------------------------------------------------------------------------------- /1000genomes/sql/allelic-frequency.sql: -------------------------------------------------------------------------------- 1 | # The following query computes the allelic frequency for BRCA1 variants in the 2 | # 1,000 Genomes dataset.
3 | SELECT 4 | reference_name, 5 | start, 6 | reference_bases, 7 | alternate_bases, 8 | alt, 9 | SUM(ref_count)+SUM(alt_count) AS num_sample_alleles, 10 | SUM(ref_count) AS ref_cnt, 11 | SUM(alt_count) AS alt_cnt, 12 | SUM(ref_count)/(SUM(ref_count)+SUM(alt_count)) AS ref_freq, 13 | SUM(alt_count)/(SUM(ref_count)+SUM(alt_count)) AS alt_freq, 14 | FROM ( 15 | SELECT 16 | reference_name, 17 | start, 18 | reference_bases, 19 | alternate_bases, 20 | alt, 21 | SUM(INTEGER(0 = call.genotype)) WITHIN RECORD AS ref_count, 22 | SUM(INTEGER(alt = call.genotype)) WITHIN RECORD AS alt_count 23 | FROM 24 | FLATTEN( 25 | FLATTEN(( 26 | SELECT 27 | reference_name, 28 | start, 29 | reference_bases, 30 | alternate_bases, 31 | POSITION(alternate_bases) AS alt, 32 | call.call_set_name, 33 | call.genotype, 34 | FROM 35 | [genomics-public-data:1000_genomes.variants] 36 | WHERE 37 | reference_name = '17' 38 | AND start BETWEEN 41196311 39 | AND 41277499 40 | AND vt='SNP' 41 | ), 42 | call), 43 | alt)) 44 | GROUP BY 45 | reference_name, 46 | start, 47 | reference_bases, 48 | alternate_bases, 49 | alt 50 | ORDER BY 51 | reference_name, 52 | start, 53 | reference_bases, 54 | alt, 55 | alternate_bases 56 | -------------------------------------------------------------------------------- /pgp/sql/cgi_variants/ti-tv-ratio.sql: -------------------------------------------------------------------------------- 1 | # Compute the Ti/Tv ratio for each participant in the PGP dataset. 
2 | SELECT 3 | sample_id, 4 | transitions, 5 | transversions, 6 | transitions/transversions AS titv 7 | FROM ( 8 | SELECT 9 | sample_id, 10 | SUM(IF(mutation1 IN ('A->G', 11 | 'G->A', 12 | 'C->T', 13 | 'T->C'), 14 | 1, 15 | 0) + IF(mutation2 IN ('A->G', 16 | 'G->A', 17 | 'C->T', 18 | 'T->C'), 19 | 1, 20 | 0)) AS transitions, 21 | SUM(IF(mutation1 IN ('A->C', 22 | 'C->A', 23 | 'G->T', 24 | 'T->G', 25 | 'A->T', 26 | 'T->A', 27 | 'C->G', 28 | 'G->C'), 29 | 1, 30 | 0) + IF(mutation2 IN ('A->C', 31 | 'C->A', 32 | 'G->T', 33 | 'T->G', 34 | 'A->T', 35 | 'T->A', 36 | 'C->G', 37 | 'G->C'), 38 | 1, 39 | 0)) AS transversions, 40 | FROM ( 41 | SELECT 42 | sample_id, 43 | CONCAT(reference, 44 | CONCAT(STRING('->'), 45 | allele1Seq)) AS mutation1, 46 | CONCAT(reference, 47 | CONCAT(STRING('->'), 48 | allele2Seq)) AS mutation2, 49 | FROM 50 | [google.com:biggene:pgp.cgi_variants] 51 | WHERE 52 | # WHERE varType = 'snp' not correct since a row with both an indel 53 | # and a snp will be varType 'complex' 54 | reference != '=' 55 | AND LENGTH(reference) = 1 56 | ) 57 | GROUP BY 58 | sample_id) 59 | ORDER BY 60 | titv DESC 61 | -------------------------------------------------------------------------------- /pgp/sql/schema-comparisons/klotho-gvcf-expanded.sql: -------------------------------------------------------------------------------- 1 | # Sample level data for Klotho variant rs9536314 for use in the "amazing 2 | # intelligence of PGP participants" data story, specifically joining two 3 | # tables to compare the different encodings. 
4 | SELECT 5 | cgi.sample_id, 6 | chromosome, 7 | locusBegin, 8 | locusEnd, 9 | reference, 10 | allele1Seq, 11 | allele2Seq, 12 | contig_name, 13 | start_pos, 14 | end_pos, 15 | END, 16 | ref, 17 | alt, 18 | gvcf.sample_id, 19 | genotype 20 | FROM 21 | [google.com:biggene:pgp.cgi_variants] AS cgi 22 | left OUTER JOIN 23 | ( 24 | SELECT 25 | contig_name, 26 | start_pos, 27 | end_pos, 28 | END, 29 | ref, 30 | alt, 31 | sample_id, 32 | genotype 33 | FROM 34 | FLATTEN( 35 | SELECT 36 | contig_name, 37 | start_pos, 38 | end_pos, 39 | END, 40 | reference_bases AS ref, 41 | GROUP_CONCAT(alternate_bases) WITHIN RECORD AS alt, 42 | call.callset_name AS sample_id, 43 | GROUP_CONCAT(STRING(call.genotype), 44 | '/') WITHIN call AS genotype, 45 | FROM 46 | [google.com:biggene:test.pgp_gvcf_variants_expanded2] 47 | WHERE 48 | contig_name = '13' 49 | AND start_pos == 33628138 50 | , 51 | call)) AS gvcf 52 | ON 53 | cgi.sample_id = gvcf.sample_id 54 | WHERE 55 | chromosome = "chr13" 56 | AND locusBegin <= 33628137 57 | AND locusEnd >= 33628138 58 | # Skip the genomes we were unable to convert to VCF/gVCF 59 | OMIT RECORD IF 60 | cgi.sample_id = 'huEDF7DA' OR cgi.sample_id = 'hu34D5B9' 61 | ORDER BY 62 | cgi.sample_id 63 | -------------------------------------------------------------------------------- /pgp/sql/schema-comparisons/klotho-gvcf.sql: -------------------------------------------------------------------------------- 1 | # Sample level data for Klotho variant rs9536314 for use in the "amazing 2 | # intelligence of PGP participants" data story, specifically joining two 3 | # tables to compare the different encodings. 
4 | SELECT 5 | cgi.sample_id, 6 | chromosome, 7 | locusBegin, 8 | locusEnd, 9 | reference, 10 | allele1Seq, 11 | allele2Seq, 12 | contig_name, 13 | start_pos, 14 | end_pos, 15 | END, 16 | ref, 17 | alt, 18 | gvcf.sample_id, 19 | genotype 20 | FROM 21 | [google.com:biggene:pgp.cgi_variants] AS cgi 22 | left OUTER JOIN 23 | ( 24 | SELECT 25 | contig_name, 26 | start_pos, 27 | end_pos, 28 | END, 29 | ref, 30 | alt, 31 | sample_id, 32 | genotype 33 | FROM 34 | FLATTEN( 35 | SELECT 36 | contig_name, 37 | start_pos, 38 | end_pos, 39 | END, 40 | reference_bases AS ref, 41 | GROUP_CONCAT(alternate_bases) WITHIN RECORD AS alt, 42 | call.callset_name AS sample_id, 43 | GROUP_CONCAT(STRING(call.genotype), 44 | '/') WITHIN call AS genotype, 45 | FROM 46 | [google.com:biggene:pgp.gvcf_variants] 47 | WHERE 48 | contig_name = '13' 49 | AND start_pos <= 33628138 50 | AND (end_pos >= 33628139 51 | OR END >= 33628139) 52 | , 53 | call)) AS gvcf 54 | ON 55 | cgi.sample_id = gvcf.sample_id 56 | WHERE 57 | chromosome = "chr13" 58 | AND locusBegin <= 33628137 59 | AND locusEnd >= 33628138 60 | # Skip the genomes we were unable to convert to VCF/gVCF 61 | OMIT RECORD IF 62 | cgi.sample_id = 'huEDF7DA' OR cgi.sample_id = 'hu34D5B9' 63 | ORDER BY 64 | cgi.sample_id 65 | -------------------------------------------------------------------------------- /pgp/sql/gvcf_variants/allelic-frequency.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | 3 | # Copyright 2014 Google Inc. All Rights Reserved. 4 | # 5 | # Licensed under the Apache License, Version 2.0 (the "License"); 6 | # you may not use this file except in compliance with the License. 
7 | # You may obtain a copy of the License at 8 | # 9 | # http://www.apache.org/licenses/LICENSE-2.0 10 | # 11 | # Unless required by applicable law or agreed to in writing, software 12 | # distributed under the License is distributed on an "AS IS" BASIS, 13 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | # See the License for the specific language governing permissions and 15 | # limitations under the License. 16 | 17 | """Run allelic frequency upon one chromosome at a time, appending to the result table.""" 18 | 19 | import string, subprocess 20 | 21 | chromosomes = range(1,23) 22 | chromosomes.extend(['X', 'Y', 'M']) 23 | 24 | with open ("./allelic-frequency-chr1.sql", "r") as myfile: 25 | query=myfile.read().replace('"', '\\"') 26 | 27 | for chrom in chromosomes: 28 | q = string.replace(query, "WHERE contig_name = '1'", "WHERE contig_name = '%s'" % chrom) 29 | cmd = [ 30 | 'bq', 31 | '--project_id', 'google.com:biggene', 32 | '--nosync', 33 | 'query', 34 | '--allow_large_results', 35 | '--append_table', 36 | '--destination_table', 'pgp_analysis_results.gvcf_variants_allelic_frequency', 37 | '--batch', '"' + q + '"'] 38 | print " ".join(cmd) 39 | print subprocess.check_output(" ".join(cmd), 40 | stderr=subprocess.STDOUT, 41 | shell=True) 42 | 43 | -------------------------------------------------------------------------------- /pgp/sql/cgi_variants/allelic-frequency.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | 3 | # Copyright 2014 Google Inc. All Rights Reserved. 4 | # 5 | # Licensed under the Apache License, Version 2.0 (the "License"); 6 | # you may not use this file except in compliance with the License. 
7 | # You may obtain a copy of the License at 8 | # 9 | # http://www.apache.org/licenses/LICENSE-2.0 10 | # 11 | # Unless required by applicable law or agreed to in writing, software 12 | # distributed under the License is distributed on an "AS IS" BASIS, 13 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | # See the License for the specific language governing permissions and 15 | # limitations under the License. 16 | 17 | """Run allelic frequency upon one chromosome at a time, appending to the result table.""" 18 | 19 | import string, subprocess 20 | 21 | chromosomes = range(1,23) 22 | chromosomes.extend(['X', 'Y', 'M']) 23 | 24 | with open ("./allelic-frequency-chr1.sql", "r") as myfile: 25 | query=myfile.read().replace('"', '\\"') 26 | 27 | for chrom in chromosomes: 28 | q = string.replace(query, "WHERE chromosome = 'chr1'", "WHERE chromosome = 'chr%s'" % chrom) 29 | cmd = [ 30 | 'bq', 31 | '--project_id', 'google.com:biggene', 32 | '--nosync', 33 | 'query', 34 | '--allow_large_results', 35 | '--append_table', 36 | '--destination_table', 'pgp_analysis_results.cgi_variants_allelic_frequency', 37 | '--batch', '"' + q + '"'] 38 | print " ".join(cmd) 39 | print subprocess.check_output(" ".join(cmd), 40 | stderr=subprocess.STDOUT, 41 | shell=True) 42 | 43 | -------------------------------------------------------------------------------- /1000genomes/data-stories/reproducing-vcfstats/vcfstats-output/stats.shared: -------------------------------------------------------------------------------- 1 | #Shared SNPs Frequency 2 | 0 10 3 | 1 243 4 | 2 103 5 | 3 45 6 | 4 40 7 | 5 27 8 | 6 21 9 | 7 19 10 | 8 24 11 | 9 10 12 | 10 14 13 | 11 17 14 | 12 4 15 | 13 3 16 | 14 8 17 | 15 5 18 | 16 2 19 | 17 5 20 | 18 4 21 | 19 4 22 | 20 3 23 | 21 2 24 | 22 4 25 | 23 1 26 | 24 8 27 | 25 11 28 | 26 5 29 | 27 4 30 | 28 3 31 | 29 4 32 | 30 2 33 | 31 7 34 | 32 7 35 | 33 5 36 | 34 3 37 | 35 1 38 | 39 1 39 | 42 7 40 | 43 2 41 | 44 1 42 | 45 1 43 | 46 1 44 | 47 
1 45 | 48 1 46 | 49 1 47 | 50 2 48 | 51 1 49 | 56 1 50 | 57 2 51 | 60 1 52 | 61 2 53 | 62 2 54 | 73 4 55 | 76 1 56 | 80 3 57 | 81 4 58 | 82 1 59 | 94 1 60 | 95 1 61 | 105 1 62 | 107 1 63 | 121 1 64 | 124 1 65 | 125 1 66 | 141 1 67 | 150 1 68 | 166 1 69 | 168 2 70 | 169 1 71 | 170 1 72 | 174 1 73 | 176 1 74 | 195 1 75 | 197 1 76 | 198 2 77 | 200 2 78 | 207 1 79 | 218 1 80 | 222 1 81 | 252 1 82 | 259 1 83 | 268 1 84 | 269 1 85 | 314 1 86 | 409 1 87 | 415 1 88 | 427 1 89 | 431 1 90 | 464 1 91 | 481 1 92 | 492 1 93 | 498 1 94 | 522 6 95 | 523 1 96 | 527 1 97 | 528 1 98 | 532 2 99 | 534 1 100 | 535 2 101 | 536 6 102 | 537 5 103 | 538 7 104 | 539 3 105 | 540 2 106 | 541 1 107 | 549 1 108 | 555 1 109 | 557 1 110 | 561 1 111 | 562 7 112 | 563 5 113 | 566 2 114 | 569 1 115 | 570 2 116 | 573 1 117 | 574 1 118 | 576 1 119 | 577 2 120 | 578 13 121 | 579 8 122 | 580 4 123 | 581 2 124 | 582 6 125 | 584 1 126 | 611 1 127 | 613 1 128 | 685 1 129 | 698 1 130 | 703 1 131 | 704 1 132 | 718 1 133 | 720 4 134 | 721 1 135 | 728 2 136 | 738 2 137 | 740 1 138 | 741 1 139 | 742 1 140 | 743 2 141 | 745 1 142 | 783 1 143 | 1088 1 144 | 1091 1 145 | -------------------------------------------------------------------------------- /1000genomes/sql/reproducing-allelic-frequencies/reproducing-allelic-frequency.sql: -------------------------------------------------------------------------------- 1 | # The following query computes the allelic frequency for BRCA1 variants in the 2 | # 1,000 Genomes dataset and also includes the pre-computed value from the dataset. 
3 | SELECT 4 | reference_name, 5 | start, 6 | reference_bases, 7 | alternate_bases, 8 | SUM(ref_count)+SUM(alt_count) AS num_sample_alleles, 9 | SUM(ref_count) AS ref_cnt, 10 | SUM(alt_count) AS alt_cnt, 11 | SUM(ref_count)/(SUM(ref_count)+SUM(alt_count)) AS ref_freq, 12 | SUM(alt_count)/(SUM(ref_count)+SUM(alt_count)) AS alt_freq, 13 | alt_freq_from_1KG 14 | FROM ( 15 | SELECT 16 | reference_name, 17 | start, 18 | reference_bases, 19 | alternate_bases, 20 | alt, 21 | SUM(INTEGER(0 = call.genotype)) WITHIN RECORD AS ref_count, 22 | SUM(INTEGER(alt = call.genotype)) WITHIN RECORD AS alt_count, 23 | alt_freq_from_1KG 24 | FROM 25 | FLATTEN( 26 | FLATTEN(( 27 | SELECT 28 | reference_name, 29 | start, 30 | reference_bases, 31 | alternate_bases, 32 | POSITION(alternate_bases) AS alt, 33 | af AS alt_freq_from_1KG, 34 | call.call_set_name, 35 | call.genotype, 36 | FROM 37 | [genomics-public-data:1000_genomes.variants] 38 | WHERE 39 | reference_name = '17' 40 | AND start BETWEEN 41196311 41 | AND 41277499 42 | AND vt='SNP' 43 | ), 44 | call), 45 | alt)) 46 | GROUP BY 47 | reference_name, 48 | start, 49 | reference_bases, 50 | alternate_bases, 51 | alt, 52 | alt_freq_from_1KG 53 | ORDER BY 54 | reference_name, 55 | start, 56 | reference_bases, 57 | alt, 58 | alternate_bases 59 | -------------------------------------------------------------------------------- /1000genomes/sql/allelic-frequency-by-gender.sql: -------------------------------------------------------------------------------- 1 | # The following query computes the allelic frequency for BRCA1 variants in the 2 | # 1,000 Genomes dataset further classified by gender from the phenotypic data. 
3 | SELECT 4 | reference_name, 5 | start, 6 | gender, 7 | reference_bases, 8 | alternate_bases, 9 | alt, # 1-based POSITION of this alternate within alternate_bases (compared against call.genotype below) 10 | SUM(ref_count)+SUM(alt_count) AS num_sample_alleles, 11 | SUM(ref_count) AS ref_cnt, 12 | SUM(alt_count) AS alt_cnt, 13 | SUM(ref_count)/(SUM(ref_count)+SUM(alt_count)) AS ref_freq, 14 | SUM(alt_count)/(SUM(ref_count)+SUM(alt_count)) AS alt_freq, 15 | FROM ( 16 | SELECT 17 | reference_name, 18 | start, 19 | gender, 20 | reference_bases, 21 | alternate_bases, 22 | alt, 23 | SUM(INTEGER(0 = call.genotype)) WITHIN RECORD AS ref_count, 24 | SUM(INTEGER(alt = call.genotype)) WITHIN RECORD AS alt_count 25 | FROM 26 | FLATTEN(FLATTEN(( 27 | SELECT 28 | reference_name, 29 | start, 30 | reference_bases, 31 | alternate_bases, 32 | POSITION(alternate_bases) AS alt, 33 | call.call_set_name, 34 | call.genotype, 35 | FROM 36 | [genomics-public-data:1000_genomes.variants] 37 | WHERE 38 | reference_name = '17' 39 | AND start BETWEEN 41196311 40 | AND 41277499 41 | AND vt='SNP' 42 | ), 43 | call), 44 | alt) AS g 45 | JOIN 46 | [genomics-public-data:1000_genomes.sample_info] p 47 | ON 48 | g.call.call_set_name = p.sample) 49 | GROUP BY 50 | reference_name, 51 | start, 52 | gender, 53 | reference_bases, 54 | alternate_bases, 55 | alt 56 | ORDER BY 57 | reference_name, 58 | start, 59 | gender, 60 | reference_bases, 61 | alt, 62 | alternate_bases 63 | -------------------------------------------------------------------------------- /1000genomes/sql/allelic-frequency-by-ethnicity.sql: -------------------------------------------------------------------------------- 1 | # The following query computes the allelic frequency for BRCA1 variants in the 2 | # 1,000 Genomes dataset further classified by ethnicity from the phenotypic data.
3 | SELECT 4 | reference_name, 5 | start, 6 | population, 7 | reference_bases, 8 | alternate_bases, 9 | alt, # 1-based POSITION of this alternate within alternate_bases (compared against call.genotype below) 10 | SUM(ref_count)+SUM(alt_count) AS num_sample_alleles, 11 | SUM(ref_count) AS ref_cnt, 12 | SUM(alt_count) AS alt_cnt, 13 | SUM(ref_count)/(SUM(ref_count)+SUM(alt_count)) AS ref_freq, 14 | SUM(alt_count)/(SUM(ref_count)+SUM(alt_count)) AS alt_freq, 15 | FROM ( 16 | SELECT 17 | reference_name, 18 | start, 19 | population, 20 | reference_bases, 21 | alternate_bases, 22 | alt, 23 | SUM(INTEGER(0 = call.genotype)) WITHIN RECORD AS ref_count, 24 | SUM(INTEGER(alt = call.genotype)) WITHIN RECORD AS alt_count 25 | FROM 26 | FLATTEN(FLATTEN(( 27 | SELECT 28 | reference_name, 29 | start, 30 | reference_bases, 31 | alternate_bases, 32 | POSITION(alternate_bases) AS alt, 33 | call.call_set_name, 34 | call.genotype, 35 | FROM 36 | [genomics-public-data:1000_genomes.variants] 37 | WHERE 38 | reference_name = '17' 39 | AND start BETWEEN 41196311 40 | AND 41277499 41 | AND vt='SNP' 42 | ), 43 | call), 44 | alt) AS g 45 | JOIN 46 | [genomics-public-data:1000_genomes.sample_info] p 47 | ON 48 | g.call.call_set_name = p.sample) 49 | GROUP BY 50 | reference_name, 51 | start, 52 | population, 53 | reference_bases, 54 | alternate_bases, 55 | alt 56 | ORDER BY 57 | reference_name, 58 | start, 59 | population, 60 | reference_bases, 61 | alt, 62 | alternate_bases 63 | -------------------------------------------------------------------------------- /pgp/provenance/gvcf-expand-mapper.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | 3 | # Copyright 2014 Google Inc. All Rights Reserved. 4 | # 5 | # Licensed under the Apache License, Version 2.0 (the "License"); 6 | # you may not use this file except in compliance with the License.
7 | # You may obtain a copy of the License at 8 | # 9 | # http://www.apache.org/licenses/LICENSE-2.0 10 | # 11 | # Unless required by applicable law or agreed to in writing, software 12 | # distributed under the License is distributed on an "AS IS" BASIS, 13 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | # See the License for the specific language governing permissions and 15 | # limitations under the License. 16 | 17 | """A mapper for expansion of gVCF data. 18 | """ 19 | 20 | import json 21 | import sys 22 | 23 | from gvcf_expander import GvcfExpander 24 | 25 | 26 | def main(): 27 | """Entry point to the script.""" 28 | 29 | # Basic parsing of command line arguments to allow a filename 30 | # to be passed when running this code in the debugger. 31 | file_handle = sys.stdin 32 | if 2 <= len(sys.argv): 33 | file_handle = open(sys.argv[1], "r") 34 | 35 | expander = GvcfExpander() 36 | 37 | line = file_handle.readline() 38 | while line: 39 | line = line.strip() 40 | if not line: 41 | line = file_handle.readline() 42 | continue 43 | 44 | fields = json.loads(line) 45 | 46 | pairs = expander.map(fields=fields) 47 | for pair in pairs: 48 | emit(pair.k, pair.v) 49 | 50 | line = file_handle.readline() 51 | 52 | 53 | def emit(key, fields): 54 | """Emits a key/value pair to stdout. 55 | 56 | Args: 57 | key: (string) 58 | fields: (dictionary) 59 | 60 | Returns: n/a 61 | 62 | Side Effects: 63 | a VCF line is written to stdout 64 | """ 65 | print "%s\t%s" % (key, json.dumps(fields)) 66 | 67 | 68 | if __name__ == "__main__": 69 | main() 70 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | bigquery-examples 2 | ================= 3 | 4 | The data stories and queries in this repository demonstrate working with genomic data via [Google BigQuery](https://cloud.google.com/bigquery/). 
All examples are built upon public datasets. 5 | 6 | Have other data stories you would like to see here? Have any data stories you would like to *share*? Have *corrections to the biology* covered in this material? Have query *simplifications* or *speed improvements*? Let us know by [filing an issue](https://github.com/googlegenomics/bigquery-examples/issues) or [contacting us directly](mailto:google-genomics-contact@googlegroups.com). 7 | 8 | Getting Started 9 | ----------------- 10 | 11 | If you are new to BigQuery, start here instead: [Analyze Variants Using BigQuery](https://cloud.google.com/genomics/v1/analyze-variants). 12 | 13 | Otherwise, navigate through the tree of content in this repository. You will find queries, RMarkdown, rendered analyses, and provenance details. 14 | 15 | Loading your own Variant Data into BigQuery 16 | ------------------------------------------- 17 | 18 | After trying these queries on public data, you can [load your own variant data into BigQuery](https://cloud.google.com/genomics/v1/load-variants). 19 | 20 | For other types of data, such as variant annotations, see [Preparing Data for BigQuery](https://cloud.google.com/bigquery/preparing-data-for-bigquery) and also [BigQuery in Practice : Loading Data Sets That are Terabytes and Beyond](https://cloud.google.com/developers/articles/bigquery-in-practice) for more detail. 21 | 22 | The mailing list 23 | ---------------- 24 | 25 | The [Google Genomics Discuss mailing list](https://groups.google.com/forum/#!forum/google-genomics-discuss) is a good 26 | way to sync up with other people who use googlegenomics including the core developers. You can subscribe 27 | by sending an email to ``google-genomics-discuss+subscribe@googlegroups.com`` or just post using 28 | the [web forum page](https://groups.google.com/forum/#!forum/google-genomics-discuss). 
29 | -------------------------------------------------------------------------------- /pgp/sql/gvcf_variants/allele-count.sql: -------------------------------------------------------------------------------- 1 | # Count the occurrence of each variant allele across all participants in the 2 | # dataset. This returns a large result so be sure to materialize it into a 3 | # table for subsequent use. 4 | # 5 | # Note that the new BigQuery feature of user-defined javascript 6 | # functions is in limited preview. For more info, see 7 | # https://www.youtube.com/watch?v=GrD7ymUPt3M#t=1377 8 | SELECT 9 | contig_name, 10 | start_pos, 11 | # This 'bin' can be used in subsequent interval JOINs 12 | INTEGER(FLOOR(start_pos / 5000)) AS bin, 13 | reference_bases, 14 | alternate_bases, 15 | SUM(alternate_allele_count) AS alternate_allele_count, 16 | FROM ( 17 | SELECT contig_name, start_pos, reference_bases, alternate_bases, alternate_allele_count 18 | FROM js( 19 | [google.com:biggene:pgp.gvcf_variants], 20 | contig_name, start_pos, reference_bases, alternate_bases, call.genotype, 21 | "[{name: 'contig_name', type: 'string'}, 22 | {name: 'start_pos', type: 'integer'}, 23 | {name: 'reference_bases', type: 'string'}, 24 | {name: 'alternate_bases', type: 'string'}, 25 | {name: 'alternate_allele_count', type: 'integer'}]", 26 | "function(r, emit) { 27 | for(var a in r.alternate_bases) { 28 | var alt_gt = parseInt(a, 10) + 1; // for-in keys are strings, so convert before adding 1 29 | var alt_count = 0; 30 | for(var c in r.call) { 31 | for(var g in r.call[c].genotype) { 32 | if(alt_gt == r.call[c].genotype[g]) { 33 | alt_count++; 34 | } 35 | } 36 | } 37 | // Emit one record per alt 38 | emit({ 39 | contig_name: r.contig_name, 40 | start_pos: r.start_pos, 41 | reference_bases: r.reference_bases, 42 | alternate_bases: r.alternate_bases[a], 43 | alternate_allele_count: alt_count 44 | }); 45 | } 46 | }")) 47 | GROUP EACH BY 48 | contig_name, 49 | start_pos, 50 | bin, 51 | reference_bases, 52 | alternate_bases 53 |
-------------------------------------------------------------------------------- /pgp/sql/schema-comparisons/sample-call-counts.sql: -------------------------------------------------------------------------------- 1 | # Sample call counts for the PGP data encoded several different ways. 2 | # NOTE: table pgp.variants was left out of this example since it's more trouble 3 | # than it's worth to parse the GT field into its components. 4 | SELECT 5 | sample_id, 6 | num_records, 7 | num_variant_alleles, 8 | dataset 9 | FROM 10 | ( 11 | SELECT 12 | sample_id, 13 | COUNT(sample_id) AS num_records, 14 | INTEGER(SUM(allele1_is_variant + allele2_is_variant)) AS num_variant_alleles, 15 | 'cgi_variants' AS dataset 16 | FROM ( 17 | SELECT 18 | sample_id, 19 | allele1Seq != reference 20 | AND allele1Seq != '=' 21 | AND allele1Seq != '?' AS allele1_is_variant, 22 | allele2Seq != reference 23 | AND allele2Seq != '=' 24 | AND allele2Seq != '?' AS allele2_is_variant, 25 | FROM 26 | [google.com:biggene:pgp.cgi_variants] 27 | # Skip the genomes we were unable to convert to VCF/gVCF 28 | OMIT 29 | RECORD IF 30 | sample_id = 'huEDF7DA' 31 | OR sample_id = 'hu34D5B9') 32 | GROUP BY 33 | sample_id), 34 | ( 35 | SELECT 36 | sample_id, 37 | COUNT(sample_id) AS num_records, 38 | INTEGER(SUM(num_variant_alleles)) AS num_variant_alleles, 39 | 'gvcf_variants' AS dataset 40 | FROM ( 41 | SELECT 42 | call.callset_name AS sample_id, 43 | SUM(call.genotype > 0) WITHIN call AS num_variant_alleles, 44 | FROM 45 | [google.com:biggene:pgp.gvcf_variants]) 46 | GROUP BY 47 | sample_id), 48 | ( 49 | SELECT 50 | sample_id, 51 | COUNT(sample_id) AS num_records, 52 | INTEGER(SUM(num_variant_alleles)) AS num_variant_alleles, 53 | 'gvcf_variants_expanded' AS dataset 54 | FROM 55 | ( 56 | SELECT 57 | call.callset_name AS sample_id, 58 | SUM(call.genotype > 0) WITHIN call AS num_variant_alleles, 59 | FROM 60 | [google.com:biggene:test.pgp_gvcf_variants_expanded2]) 61 | GROUP BY 62 | sample_id) 63 | ORDER BY 64 |
sample_id, 65 | dataset 66 | -------------------------------------------------------------------------------- /1000genomes/sql/gender-het-hom-ratio.sql: -------------------------------------------------------------------------------- 1 | # The following query uses the homozygous and heterozygous variant counts within 2 | # chromosome X to help determine whether the gender phenotype values are correct 3 | # for the samples. 4 | SELECT 5 | sample_id, 6 | gender, 7 | reference_name, 8 | (hom_AA_count + het_RA_count + hom_RR_count) AS all_callable_sites, 9 | hom_AA_count, 10 | het_RA_count, 11 | hom_RR_count, 12 | (hom_AA_count + het_RA_count) AS all_snvs, 13 | ROUND((het_RA_count/(hom_AA_count + het_RA_count))*1000)/1000 AS perct_het_alt_in_snvs, 14 | ROUND((hom_AA_count/(hom_AA_count + het_RA_count))*1000)/1000 AS perct_hom_alt_in_snvs 15 | FROM 16 | ( 17 | SELECT 18 | reference_name, 19 | sample_id, 20 | SUM(IF(0 = first_allele 21 | AND 0 = second_allele, 22 | 1, 23 | 0)) AS hom_RR_count, 24 | SUM(IF(first_allele = second_allele 25 | AND first_allele > 0, 26 | 1, 27 | 0)) AS hom_AA_count, 28 | SUM(IF((first_allele != second_allele OR second_allele IS NULL) 29 | AND (first_allele > 0 30 | OR second_allele > 0), 31 | 1, 32 | 0)) AS het_RA_count 33 | FROM ( 34 | SELECT 35 | reference_name, 36 | call.call_set_name AS sample_id, 37 | NTH(1, 38 | call.genotype) WITHIN call AS first_allele, 39 | NTH(2, 40 | call.genotype) WITHIN call AS second_allele, 41 | FROM 42 | [genomics-public-data:1000_genomes.variants] 43 | WHERE 44 | reference_name = 'X' 45 | AND vt = 'SNP' 46 | AND start NOT BETWEEN 59999 47 | AND 2699519 48 | AND start NOT BETWEEN 154931042 49 | AND 155260559) 50 | GROUP BY 51 | sample_id, 52 | reference_name 53 | ) AS g 54 | JOIN 55 | [genomics-public-data:1000_genomes.sample_info] p 56 | ON 57 | g.sample_id = p.sample 58 | GROUP BY 59 | sample_id, 60 | gender, 61 | reference_name, 62 | all_callable_sites, 63 | hom_AA_count, 64 | het_RA_count, 65 | 
hom_RR_count, 66 | all_snvs, 67 | perct_het_alt_in_snvs, 68 | perct_hom_alt_in_snvs 69 | ORDER BY 70 | perct_het_alt_in_snvs DESC, 71 | sample_id 72 | 73 | -------------------------------------------------------------------------------- /pgp/provenance/gvcf-expand-reducer.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | 3 | # Copyright 2014 Google Inc. All Rights Reserved. 4 | # 5 | # Licensed under the Apache License, Version 2.0 (the "License"); 6 | # you may not use this file except in compliance with the License. 7 | # You may obtain a copy of the License at 8 | # 9 | # http://www.apache.org/licenses/LICENSE-2.0 10 | # 11 | # Unless required by applicable law or agreed to in writing, software 12 | # distributed under the License is distributed on an "AS IS" BASIS, 13 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | # See the License for the specific language governing permissions and 15 | # limitations under the License. 16 | 17 | """A reducer for expansion of gVCF data. 18 | """ 19 | 20 | import json 21 | import os 22 | import sys 23 | 24 | from gvcf_expander import GvcfExpander 25 | from gvcf_expander import Pair 26 | 27 | FILTER_ENV_KEY = "FILTER_REF_MATCHES" 28 | 29 | 30 | def main(): 31 | """Entry point to the script.""" 32 | 33 | # Basic parsing of command line arguments to allow a filename 34 | # to be passed when running this code in the debugger. 
35 | file_handle = sys.stdin 36 | if 2 <= len(sys.argv): 37 | file_handle = open(sys.argv[1], "r") 38 | 39 | if FILTER_ENV_KEY in os.environ: 40 | expander = GvcfExpander(filter_ref_matches=True) 41 | else: 42 | expander = GvcfExpander() 43 | 44 | line = file_handle.readline() 45 | while line: 46 | line = line.strip() 47 | if not line: 48 | line = file_handle.readline() 49 | continue 50 | 51 | (key, value) = line.split("\t") 52 | fields = json.loads(value) 53 | results = expander.reduce(pair=Pair(key, fields)) 54 | 55 | for result in results: 56 | emit(result) 57 | 58 | line = file_handle.readline() 59 | 60 | results = expander.finalize() 61 | 62 | for result in results: 63 | emit(result) 64 | 65 | 66 | def emit(fields): 67 | """Emits a reduced value to stdout. 68 | 69 | Args: 70 | fields: (dict) 71 | 72 | Returns: n/a 73 | 74 | Side Effects: 75 | a value is written to stdout 76 | """ 77 | print "%s" % (json.dumps(fields)) 78 | 79 | 80 | if __name__ == "__main__": 81 | main() 82 | -------------------------------------------------------------------------------- /1000genomes/sql/reproducing-allelic-frequencies/reproducing-allelic-frequency-by-ethnicity.sql: -------------------------------------------------------------------------------- 1 | # The following query computes the allelic frequency for BRCA1 variants in the 2 | # 1,000 Genomes dataset further classified by ethnicity from the phenotypic data 3 | # and also includes the pre-computed value from the dataset. 
4 | SELECT 5 | reference_name, 6 | start, 7 | super_population, 8 | reference_bases, 9 | alternate_bases, 10 | SUM(ref_count)+SUM(alt_count) AS num_sample_alleles, 11 | SUM(ref_count) AS sample_allele_ref_cnt, 12 | SUM(alt_count) AS sample_allele_alt_cnt, 13 | SUM(ref_count)/(SUM(ref_count)+SUM(alt_count)) AS ref_freq, 14 | SUM(alt_count)/(SUM(ref_count)+SUM(alt_count)) AS alt_freq, 15 | alt_freq_from_1KG 16 | FROM ( 17 | SELECT 18 | reference_name, 19 | start, 20 | super_population, 21 | reference_bases, 22 | alternate_bases, 23 | alt, 24 | SUM(INTEGER(0 = call.genotype)) WITHIN RECORD AS ref_count, 25 | SUM(INTEGER(alt = call.genotype)) WITHIN RECORD AS alt_count, 26 | CASE 27 | WHEN super_population = 'EAS' 28 | THEN asn_af 29 | WHEN super_population= 'EUR' 30 | THEN eur_af 31 | WHEN super_population = 'AFR' 32 | THEN afr_af 33 | WHEN super_population = 'AMR' 34 | THEN amr_af 35 | END AS alt_freq_from_1KG 36 | FROM 37 | FLATTEN(FLATTEN(( 38 | SELECT 39 | reference_name, 40 | start, 41 | reference_bases, 42 | alternate_bases, 43 | POSITION(alternate_bases) AS alt, 44 | call.call_set_name, 45 | call.genotype, 46 | afr_af, 47 | amr_af, 48 | asn_af, 49 | eur_af, 50 | FROM 51 | [genomics-public-data:1000_genomes.variants] 52 | WHERE 53 | reference_name = '17' 54 | AND start BETWEEN 41196311 55 | AND 41277499 56 | AND vt='SNP' 57 | ), 58 | call), 59 | alt) AS g 60 | JOIN 61 | [genomics-public-data:1000_genomes.sample_info] p 62 | ON 63 | g.call.call_set_name = p.sample) 64 | GROUP BY 65 | reference_name, 66 | start, 67 | super_population, 68 | reference_bases, 69 | alternate_bases, 70 | alt_freq_from_1KG 71 | ORDER BY 72 | reference_name, 73 | start, 74 | super_population 75 | -------------------------------------------------------------------------------- /pgp/sql/gvcf_variants/ti-tv-ratio.sql: -------------------------------------------------------------------------------- 1 | # Compute the Ti/Tv ratio for each participant in the PGP dataset. 
A user-defined 2 | # function is used here since its difficult in SQL to join the genotype array in 3 | # each call with alternate_bases at the variant level. 4 | # 5 | # Note that the new BigQuery feature of user-defined javascript 6 | # functions is in limited preview. For more info, see 7 | # https://www.youtube.com/watch?v=GrD7ymUPt3M#t=1377 8 | SELECT 9 | sample_id, 10 | transitions, 11 | transversions, 12 | transitions/transversions AS titv 13 | FROM ( 14 | SELECT 15 | sample_id, 16 | SUM(IF(mutation IN ('A->G', 17 | 'G->A', 18 | 'C->T', 19 | 'T->C'), 20 | 1, 21 | 0)) AS transitions, 22 | SUM(IF(mutation IN ('A->C', 23 | 'C->A', 24 | 'G->T', 25 | 'T->G', 26 | 'A->T', 27 | 'T->A', 28 | 'C->G', 29 | 'G->C'), 30 | 1, 31 | 0)) AS transversions, 32 | FROM ( 33 | SELECT sample_id, mutation 34 | FROM js( 35 | [google.com:biggene:pgp.gvcf_variants], 36 | reference_bases, alternate_bases, call.callset_name, call.genotype, 37 | "[{name: 'sample_id', type: 'string'}, 38 | {name: 'mutation', type: 'string'}]", 39 | "function(r, emit) { 40 | var hasSNP = false; 41 | var isSNP = [false]; 42 | for(var i in r.alternate_bases) { 43 | if(1 == r.alternate_bases[i].length) { 44 | isSNP[isSNP.length] = true; 45 | hasSNP = true; 46 | } 47 | else { 48 | isSNP[isSNP.length] = false; 49 | } 50 | } 51 | if (hasSNP && 1 == r.reference_bases.length) { 52 | for(var i in r.call) { 53 | for(var j in r.call[i].genotype) { 54 | if(0 < r.call[i].genotype[j] && isSNP[r.call[i].genotype[j]]) { 55 | emit({ 56 | sample_id: r.call[i].callset_name, 57 | mutation: r.reference_bases + '->' + r.alternate_bases[r.call[i].genotype[j] - 1] 58 | }); 59 | } 60 | } 61 | } 62 | } 63 | }")) 64 | GROUP BY 65 | sample_id) 66 | ORDER BY 67 | titv DESC 68 | -------------------------------------------------------------------------------- /pgp/sql/gvcf_variants_expanded/ti-tv-ratio.sql: -------------------------------------------------------------------------------- 1 | # Compute the Ti/Tv ratio for each 
participant in the PGP dataset. A user-defined 2 | # function is used here since its difficult in SQL to join the genotype array in 3 | # each call with alternate_bases at the variant level. 4 | # 5 | # Note that the new BigQuery feature of user-defined javascript 6 | # functions is in limited preview. For more info, see 7 | # https://www.youtube.com/watch?v=GrD7ymUPt3M#t=1377 8 | SELECT 9 | sample_id, 10 | transitions, 11 | transversions, 12 | transitions/transversions AS titv 13 | FROM ( 14 | SELECT 15 | sample_id, 16 | SUM(IF(mutation IN ('A->G', 17 | 'G->A', 18 | 'C->T', 19 | 'T->C'), 20 | 1, 21 | 0)) AS transitions, 22 | SUM(IF(mutation IN ('A->C', 23 | 'C->A', 24 | 'G->T', 25 | 'T->G', 26 | 'A->T', 27 | 'T->A', 28 | 'C->G', 29 | 'G->C'), 30 | 1, 31 | 0)) AS transversions, 32 | FROM ( 33 | SELECT sample_id, mutation 34 | FROM js( 35 | [google.com:biggene:test.pgp_gvcf_variants_expanded], 36 | reference_bases, alternate_bases, call.callset_name, call.genotype, 37 | "[{name: 'sample_id', type: 'string'}, 38 | {name: 'mutation', type: 'string'}]", 39 | "function(r, emit) { 40 | var hasSNP = false; 41 | var isSNP = [false]; 42 | for(var i in r.alternate_bases) { 43 | if(1 == r.alternate_bases[i].length) { 44 | isSNP[isSNP.length] = true; 45 | hasSNP = true; 46 | } 47 | else { 48 | isSNP[isSNP.length] = false; 49 | } 50 | } 51 | if (hasSNP && 1 == r.reference_bases.length) { 52 | for(var i in r.call) { 53 | for(var j in r.call[i].genotype) { 54 | if(0 < r.call[i].genotype[j] && isSNP[r.call[i].genotype[j]]) { 55 | emit({ 56 | sample_id: r.call[i].callset_name, 57 | mutation: r.reference_bases + '->' + r.alternate_bases[r.call[i].genotype[j] - 1] 58 | }); 59 | } 60 | } 61 | } 62 | } 63 | }") 64 | ) 65 | GROUP BY 66 | sample_id) 67 | ORDER BY 68 | titv DESC 69 | -------------------------------------------------------------------------------- /pgp/sql/gvcf_variants/allelic-frequency-brca1-no-udf.sql: 
-------------------------------------------------------------------------------- 1 | # The following query computes the allelic frequency for BRCA1 variants in the 2 | # PGP dataset _without_ using a user-defined function. 3 | # 4 | # Since without UDFs we cannot count the other reference calls, just assume 5 | # the total number of alleles is the number of samples times 2 (thereby losing the 6 | # distinction between reference calls and no-calls, unfortunately). 7 | SELECT 8 | contig_name, 9 | start_pos, 10 | reference_bases, 11 | alt AS allele, 12 | (174 * 2) AS num_alleles_called, 13 | ROUND(alt_allele_count / (174 * 2), 14 | 4) AS freq, 15 | FROM ( 16 | SELECT 17 | contig_name, 18 | start_pos, 19 | reference_bases, 20 | alt, 21 | SUM(ref_allele_count) AS ref_allele_count, 22 | SUM(alt_allele_count) AS alt_allele_count, 23 | SUM(other_alt_allele_count) AS other_alt_allele_count, 24 | FROM ( 25 | SELECT 26 | contig_name, 27 | start_pos, 28 | reference_bases, 29 | NTH(1, 30 | alternate_bases) WITHIN RECORD AS alt, 31 | SUM(IF(0 = call.genotype, 32 | 1, 33 | 0)) WITHIN RECORD AS ref_allele_count, 34 | SUM(IF(1 = call.genotype, 35 | 1, 36 | 0)) WITHIN RECORD AS alt_allele_count, 37 | SUM(IF(0 != call.genotype 38 | AND 1 != call.genotype, 39 | 1, 40 | 0)) WITHIN RECORD AS other_alt_allele_count, 41 | FROM 42 | [google.com:biggene:pgp.gvcf_variants] 43 | WHERE 44 | reference_bases != 'N' 45 | AND contig_name = '17' 46 | AND start_pos BETWEEN 41196312 47 | AND 41277500 48 | ), 49 | ( 50 | SELECT 51 | contig_name, 52 | start_pos, 53 | reference_bases, 54 | NTH(2, 55 | alternate_bases) WITHIN RECORD AS alt, 56 | SUM(IF(0 = call.genotype, 57 | 1, 58 | 0)) WITHIN RECORD AS ref_allele_count, 59 | SUM(IF(2 = call.genotype, 60 | 1, 61 | 0)) WITHIN RECORD AS alt_allele_count, 62 | SUM(IF(0 != call.genotype 63 | AND 2 != call.genotype, 64 | 1, 65 | 0)) WITHIN RECORD AS other_alt_allele_count, 66 | FROM 67 | [google.com:biggene:pgp.gvcf_variants] 68 | WHERE 69 | reference_bases !=
'N' 70 | AND contig_name = '17' 71 | AND start_pos BETWEEN 41196312 72 | AND 41277500 73 | ) 74 | WHERE 75 | alt IS NOT NULL 76 | GROUP BY 77 | contig_name, 78 | start_pos, 79 | reference_bases, 80 | alt 81 | ) -------------------------------------------------------------------------------- /1000genomes/sql/shared-variant-counts-by-ethnicity.sql: -------------------------------------------------------------------------------- 1 | #standardSQL 2 | -- 3 | -- We'd like to see how the members of each super population share variation. 4 | -- 5 | -- Let's generate a table where the records indicate: 6 | -- 7 | -- For the variants that appear in a given super-population: 8 | -- how many variants are singletons (not shared)? 9 | -- how many variants are shared by exactly 2 individuals? 10 | -- how many variants are shared by exactly 3 individuals? 11 | -- etc ... 12 | -- how many variants are shared by all members of the super population? 13 | -- 14 | -- The variants and counts are further partitioned by whether the variant is common or rare. 15 | -- 16 | WITH 17 | population_counts AS ( 18 | SELECT 19 | super_population, 20 | COUNT(population) AS super_population_count 21 | FROM 22 | `genomics-public-data.1000_genomes.sample_info` 23 | WHERE 24 | In_Phase1_Integrated_Variant_Set = TRUE 25 | GROUP BY 26 | super_population), 27 | -- 28 | autosome_calls AS ( 29 | SELECT 30 | reference_name, 31 | start, 32 | `end`, 33 | reference_bases, 34 | alternate_bases[ORDINAL(1)] AS alt, -- 1000 Genomes is biallelic. 
35 | vt, 36 | af IS NOT NULL 37 | AND af >= 0.05 AS is_common_variant, 38 | call.call_set_name, 39 | super_population 40 | FROM 41 | `genomics-public-data.1000_genomes.variants` AS v, v.call AS call 42 | JOIN 43 | `genomics-public-data.1000_genomes.sample_info` AS p 44 | ON 45 | call.call_set_name = p.sample 46 | WHERE 47 | reference_name NOT IN ("X", "Y", "MT") 48 | AND EXISTS (SELECT gt FROM UNNEST(call.genotype) gt WHERE gt > 0)), 49 | -- 50 | super_population_autosome_variants AS ( 51 | SELECT 52 | reference_name, 53 | start, 54 | `end`, 55 | reference_bases, 56 | alt, 57 | vt, 58 | super_population, 59 | is_common_variant, 60 | COUNT(call_set_name) AS num_samples 61 | FROM 62 | autosome_calls 63 | GROUP BY 64 | reference_name, 65 | start, 66 | `end`, 67 | reference_bases, 68 | alt, 69 | vt, 70 | super_population, 71 | is_common_variant ) 72 | -- 73 | -- 74 | SELECT 75 | p.super_population AS super_population, 76 | super_population_count, 77 | is_common_variant, 78 | num_samples, 79 | num_samples / super_population_count AS percent_samples, 80 | COUNT(1) AS num_variants_shared_by_this_many_samples 81 | FROM 82 | super_population_autosome_variants AS v 83 | JOIN population_counts AS p 84 | ON 85 | v.super_population = p.super_population 86 | GROUP BY 87 | super_population, 88 | super_population_count, 89 | is_common_variant, 90 | num_samples, 91 | percent_samples 92 | ORDER BY 93 | num_samples, 94 | super_population, 95 | is_common_variant 96 | -------------------------------------------------------------------------------- /pgp/sql/schema-comparisons/missingness-brca1.sql: -------------------------------------------------------------------------------- 1 | # Missingness rate for variants within BRCA1. 
2 | SELECT 3 | vars.contig_name AS contig_name, 4 | vars.start_pos AS start_pos, 5 | reference_bases, 6 | variant_called_count, 7 | SUM(refs.called_count) AS reference_called_count, 8 | variant_called_count + SUM(refs.called_count) AS num_alleles_called_for_position, 9 | 1 - ((variant_called_count + SUM(refs.called_count))/(172*2)) AS missingness_rate # NOTE(review): 172 is presumably the sample count (times 2 alleles) -- confirm 10 | FROM ( 11 | # JOIN our variant sample counts with the corresponding reference-matching blocks 12 | SELECT 13 | vars.contig_name, 14 | vars.start_pos, 15 | refs.start_pos, 16 | vars.end_pos, 17 | refs.END, 18 | reference_bases, 19 | variant_called_count, 20 | refs.called_count 21 | FROM ( 22 | # Constrain the left hand side of the JOIN to reference-matching blocks 23 | SELECT 24 | contig_name, 25 | start_pos, 26 | END, 27 | IF(alternate_bases IS NULL, 28 | FALSE, 29 | TRUE) AS is_variant_call, 30 | SUM(call.genotype >= 0) WITHIN RECORD AS called_count, 31 | FROM 32 | [google.com:biggene:pgp.gvcf_variants] 33 | WHERE 34 | contig_name = '17' 35 | HAVING 36 | is_variant_call = FALSE) AS refs 37 | JOIN ( 38 | # Constrain the right hand side of the JOIN to variants 39 | # Group our variant sample counts together since a single SNP may be in more than 40 | # one row due to 1/2 genotypes 41 | SELECT 42 | contig_name, 43 | start_pos, 44 | end_pos, 45 | reference_bases, 46 | SUM(called_count) AS variant_called_count, 47 | FROM ( 48 | # Limit the query to SNPs on chromosome 17 within BRCA1 49 | SELECT 50 | contig_name, 51 | start_pos, 52 | end_pos, 53 | reference_bases, 54 | LENGTH(reference_bases) AS ref_len, 55 | MIN(LENGTH(alternate_bases)) WITHIN RECORD AS alt_len, 56 | IF(alternate_bases IS NULL, 57 | FALSE, 58 | TRUE) AS is_variant_call, 59 | SUM(call.genotype >= 0) WITHIN RECORD AS called_count, 60 | FROM 61 | [google.com:biggene:pgp.gvcf_variants] 62 | WHERE 63 | contig_name = '17' 64 | AND start_pos BETWEEN 41196312 65 | AND 41277500 66 | HAVING 67 | ref_len = 1 68 | AND alt_len = 1 69 | AND is_variant_call)
70 | GROUP BY 71 | contig_name, 72 | start_pos, 73 | end_pos, 74 | reference_bases) AS vars 75 | # The JOIN criteria is complicated since we are trying to see if a SNP overlaps an interval 76 | ON 77 | vars.contig_name = refs.contig_name 78 | WHERE 79 | refs.start_pos <= vars.start_pos 80 | AND refs.END >= vars.end_pos 81 | ) 82 | GROUP BY 83 | contig_name, 84 | start_pos, 85 | reference_bases, 86 | variant_called_count 87 | -------------------------------------------------------------------------------- /pgp/sql/gvcf_variants_expanded/allelic-frequency.sql: -------------------------------------------------------------------------------- 1 | # This is busted. It over counts ref calls due to the GROUP BY operation. It 2 | # would work if we grouped all of the same variant into the same row prior to 3 | # loading to BigQuery because then we would not need the GROUP BY operation. 4 | # 5 | # Note that the new BigQuery feature of user-defined javascript 6 | # functions is in limited preview.
# For more info, see
# https://www.youtube.com/watch?v=GrD7ymUPt3M#t=1377
#
# Computes, per (contig_name, start_pos, reference_bases, alternate_bases), the
# number of called alleles and the fraction of them matching that alternate.
# The JavaScript UDF emits one row per alternate allele of each input record
# with its ref/alt/other allele counts; the enclosing queries sum those counts
# and turn them into a frequency.
SELECT
  contig_name,
  start_pos,
  reference_bases,
  alternate_bases,
  ref_count + alt_count + other_count AS num_sample_alleles,
  alt_count/(ref_count + alt_count + other_count) AS alt_freq,
FROM (
  SELECT
    contig_name,
    start_pos,
    reference_bases,
    alternate_bases,
    SUM(alt_count) AS alt_count,
    SUM(ref_count) AS ref_count,
    SUM(other_count) AS other_count,
  FROM (
    SELECT contig_name, start_pos, reference_bases, alternate_bases, alt_count, ref_count, other_count
    FROM js(
      [google.com:biggene:test.pgp_gvcf_variants_expanded],
      contig_name, start_pos, reference_bases, alternate_bases, call.genotype,
      "[{name: 'contig_name', type: 'string'},
        {name: 'start_pos', type: 'integer'},
        {name: 'reference_bases', type: 'string'},
        {name: 'alternate_bases', type: 'string'},
        {name: 'alt_count', type: 'integer'},
        {name: 'ref_count', type: 'integer'},
        {name: 'other_count', type: 'integer'}]",
      "function(r, emit) {
        for(var a in r.alternate_bases) {
          // for-in yields string keys, so the index must be parsed before
          // adding 1; otherwise '1' + 1 is the string '11' and no alternate
          // after the first can ever match its genotype number.
          var alt_gt = parseInt(a, 10) + 1;
          var ref_count = 0;
          var alt_count = 0;
          var other_count = 0;
          for(var c in r.call) {
            for(var g in r.call[c].genotype) {
              if(0 > r.call[c].genotype[g]) {
                // Don't count no-calls
                continue;
              } else if (0 == r.call[c].genotype[g]) {
                ref_count++;
              } else if (alt_gt == r.call[c].genotype[g]) {
                alt_count++;
              } else {
                // A call for some other alternate allele of this record
                other_count++;
              }
            }
          }
          // Emit one record per alt
          emit({
            contig_name: r.contig_name,
            start_pos: r.start_pos,
            reference_bases: r.reference_bases,
            alternate_bases: r.alternate_bases[a],
            alt_count: alt_count,
            ref_count: ref_count,
            other_count: other_count
          });
        }
      }"))
  GROUP EACH BY
    contig_name,
    start_pos,
    reference_bases,
    alternate_bases)
-------------------------------------------------------------------------------- /pgp/README.md: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | 10 | 11 | 12 | 13 | 14 | 15 | 16 | 17 | Personal Genomes Project 18 | ================= 19 | 20 | ### Additional Resources 21 | * [Schema](https://bigquery.cloud.google.com/table/google.com:biggene:pgp.variants?pli=1) 22 | * [Provenance](./provenance) 23 | * [Data Stories](./data-stories) such as 24 | * [Comparing PGP to 1000 Genomes](./data-stories/comparing-pgp-to-1000genomes) 25 | * [Issues with the Variant-Centric Approach](./data-stories/issues-with-the-variant-centric-approach) 26 | 27 | 28 | 29 | 30 | **See [PGP Public data](http://googlegenomics.readthedocs.org/en/latest/use_cases/discover_public_data/pgp_public_data.html#bigquery-pgp-tables) for provenance details of the most recent import of the PGP data which has the up-to-date schema.** The other tables you see here comprise a variety of schema experiments. Some of the column names for common data may differ from those of your own variants data exported to BigQuery. 31 | 32 | Here is an initial query joining the variant data with the phenotypic data. See the [phenotypes schema](https://bigquery.cloud.google.com/table/google.com:biggene:pgp.phenotypes?pli=1) for more detail. 33 | 34 | 35 | ``` 36 | # Compute sample count by gender 37 | SELECT 38 | Sex_Gender, 39 | COUNT(1) AS cnt 40 | FROM 41 | ( 42 | SELECT 43 | call.callset_name, 44 | Sex_Gender 45 | FROM 46 | FLATTEN([google.com:biggene:pgp.variants], 47 | call) AS var 48 | JOIN 49 | [google.com:biggene:pgp.phenotypes] AS pheno 50 | ON 51 | pheno.Participant = var.call.callset_name 52 | GROUP BY 53 | call.callset_name, 54 | Sex_Gender) 55 | GROUP BY 56 | Sex_Gender Running query: RUNNING 2.1s 57 | ``` 58 | 59 | 60 | 61 |
| Sex_Gender | cnt |
|---|---|
| Female | 53 |
| Male | 112 |
|  | 6 |
69 |
--------------------------------------------------------------------------------
/pgp/provenance/cgi-header-mapper.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 |
3 | # Copyright 2014 Google Inc. All Rights Reserved.
4 | #
5 | # Licensed under the Apache License, Version 2.0 (the "License");
6 | # you may not use this file except in compliance with the License.
7 | # You may obtain a copy of the License at
8 | #
9 | # http://www.apache.org/licenses/LICENSE-2.0
10 | #
11 | # Unless required by applicable law or agreed to in writing, software
12 | # distributed under the License is distributed on an "AS IS" BASIS,
13 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14 | # See the License for the specific language governing permissions and
15 | # limitations under the License.
16 |
17 | """Count header values found within CGI files.
18 |
19 | Assumptions:
20 | - one sample per input file
21 |
22 | This script can be run standalone:
23 | cat masterVarBeta-GS000010426-ASM.tsv | ./cgi-header-mapper.py
24 |
25 | Or via the debugger:
26 | python -mpdb ./cgi-header-mapper.py masterVarBeta-GS000010426-ASM.tsv
27 |
28 | It can also be run as a Hadoop Streaming job:
29 | hadoop jar /path/to/your/hadoop-streaming-*.jar -input inputpath \
30 | -mapper cgi-header-mapper.py -file cgi-header-mapper.py \
31 | -reducer aggregate -output outputpath
32 |
33 | See also https://cloud.google.com/hadoop/
34 | """
35 |
36 | import os
37 | import re
38 | import sys
39 |
40 | # Constants
41 | INPUT_FILE_KEY = "map_input_file"
42 | DUPLICATE_GENOME = "gs://pgp-harvard-data-public/hu34D5B9/GS000012763-DID/GS000010327-ASM/GS01173-DNA_C07/ASM/masterVarBeta-GS000010327-ASM.tsv.bz2"
43 |
44 |
def generate_long_count_token(value):
  """Builds one output record for the Hadoop Aggregate package.

  The record is the "LongValueSum:" prefix, the value, a tab and the
  count "1".  For more detail, see
  http://hadoop.apache.org/docs/r1.2.1/streaming.html#Hadoop+Aggregate+Package

  Args:
    value: (string) the value to emit

  Returns:
    (string) the formatted value
  """
  pieces = ("LongValueSum:", value, "\t", "1")
  return "".join(pieces)
58 |
59 |
def main():
  """Entry point to the script.

  Reads the input line by line and writes one Hadoop Aggregate token to
  stdout for every header line (a line starting with "#"), with tabs in
  the header replaced by spaces.  Blank lines and non-header lines are
  ignored, and nothing is emitted when the input path equals
  DUPLICATE_GENOME.

  The input is the file named by the first command line argument when one
  is given (debugger usage); otherwise it is stdin, and the input path is
  taken from the INPUT_FILE_KEY environment variable.
  """
  if 2 <= len(sys.argv):
    # Allow a filename to be passed when running this code in the debugger.
    path = sys.argv[1]
    file_handle = open(path, "r")
  else:
    # The variable is absent when the script is fed through a plain pipe
    # (cat file | ./cgi-header-mapper.py), so fall back to an empty path
    # instead of raising KeyError.
    path = os.environ.get(INPUT_FILE_KEY, "")
    file_handle = sys.stdin
    # Diagnostics for the task logs.
    sys.stderr.write(path + "\n")
    sys.stderr.write(str(os.environ) + "\n")

  # hu34D5B9 was sequenced twice, skip the older genome.  The path does not
  # change while reading, so decide once rather than per line.
  is_duplicate = (DUPLICATE_GENOME == path)

  try:
    # The input is still read to the end for the duplicate genome, as before.
    for raw_line in iter(file_handle.readline, ""):
      if is_duplicate:
        continue
      line = raw_line.rstrip("\n")
      # startswith() is False for a blank line, so blank lines are skipped.
      if line.startswith("#"):
        # This is a header line, count it
        token = generate_long_count_token(re.sub("\t", " ", line))
        sys.stdout.write(token + "\n")
  finally:
    # Close only what this function opened; stdin belongs to the caller.
    if file_handle is not sys.stdin:
      file_handle.close()


if __name__ == "__main__":
  main()
92 |
--------------------------------------------------------------------------------
/pgp/README.Rmd:
--------------------------------------------------------------------------------
1 |
2 |
3 |
4 |
5 |
6 |
7 |
8 |
9 |
10 |
11 |
12 |
13 |
14 |
15 |
16 |
17 | Personal Genomes Project
18 | =================
19 |
20 | ### Additional Resources
21 | * [Schema](https://bigquery.cloud.google.com/table/google.com:biggene:pgp.variants?pli=1)
22 | * [Provenance](./provenance)
23 | * [Data Stories](./data-stories) such as
24 | * [Comparing PGP to 1000 Genomes](./data-stories/comparing-pgp-to-1000genomes)
25 | * [Issues with the Variant-Centric Approach](./data-stories/issues-with-the-variant-centric-approach)
26 |
27 |
28 | ```{r init, echo=FALSE, message=FALSE, warning=FALSE, comment=NA}
29 | require(bigrquery)
30 | require(ggplot2)
31 | require(dplyr)
32 | require(xtable)
33 | require(testthat)
34 | project <- "google.com:biggene" # put your projectID here
# Read the SQL text stored at queryUri (up to 1e6 characters), echo it to the
# output, then run it with query_exec() under `project` and return the result.
DisplayAndDispatchQuery <- function(queryUri) {
  sqlText <- readChar(queryUri, nchars=1e6)
  cat(sqlText)
  queryResult <- query_exec(sqlText, project)
  queryResult
}
40 | ```
41 |
42 | **See [PGP Public data](http://googlegenomics.readthedocs.org/en/latest/use_cases/discover_public_data/pgp_public_data.html#bigquery-pgp-tables) for provenance details of the most recent import of the PGP data which has the up-to-date schema.** The other tables you see here comprise a variety of schema experiments. Some of the column names for common data may differ from those of your own variants data exported to BigQuery.
43 |
44 | Here is an initial query joining the variant data with the phenotypic data. See the [phenotypes schema](https://bigquery.cloud.google.com/table/google.com:biggene:pgp.phenotypes?pli=1) for more detail.
45 |
46 | ```{r echo=FALSE, message=FALSE, warning=FALSE, error=FALSE, comment=NA}
47 | result <- DisplayAndDispatchQuery("./sql/gender-count.sql")
48 | ```
49 |
50 | ```{r echo=FALSE, message=FALSE, warning=FALSE, comment=NA, results="asis"}
51 | print(xtable(head(result)), type="html", include.rownames=F)
52 | ```
53 |
54 | ```{r gender, echo=FALSE, message=FALSE, warning=FALSE, comment=NA, fig.align="center", fig.width=6, fig.height=4}
# Label rows whose gender is missing, then draw the per-gender counts as a
# single stacked bar wrapped into polar coordinates.
result$Sex_Gender[is.na(result$Sex_Gender)] <- "Unknown"
ggplot(result, aes(x="", y=cnt, fill=Sex_Gender)) +
  geom_bar(width=1, stat="identity") +
  coord_polar(theta="y", start=pi / 3) +
  labs(x="", y="Gender Count")
60 | ```
61 |
--------------------------------------------------------------------------------
/1000genomes/data-stories/reproducing-vcfstats/vcfstats-output/stats.private:
--------------------------------------------------------------------------------
1 | #Private SNPs Sample
2 | 1 HG00106
3 | 1 HG00109
4 | 1 HG00143
5 | 3 HG00152
6 | 1 HG00160
7 | 1 HG00186
8 | 1 HG00231
9 | 1 HG00234
10 | 2 HG00235
11 | 1 HG00236
12 | 2 HG00237
13 | 1 HG00244
14 | 1 HG00246
15 | 1 HG00247
16 | 1 HG00249
17 | 2 HG00329
18 | 1 HG00342
19 | 1 HG00355
20 | 1 HG00367
21 | 2 HG00384
22 | 1 HG00422
23 | 1 HG00442
24 | 1 HG00452
25 | 1 HG00475
26 | 1 HG00534
27 | 1 HG00556
28 | 1 HG00560
29 | 2 HG00611
30 | 1 HG00641
31 | 1 HG00654
32 | 1 HG00656
33 | 1 HG00671
34 | 1 HG00693
35 | 1 HG00699
36 | 1 HG00701
37 | 1 HG00707
38 | 1 HG00708
39 | 3 HG00737
40 | 1 HG00740
41 | 1 HG01048
42 | 1 HG01060
43 | 1 HG01069
44 | 2 HG01108
45 | 2 HG01124
46 | 1 HG01148
47 | 1 HG01149
48 | 1 HG01171
49 | 1 HG01191
50 | 1 HG01272
51 | 1 HG01356
52 | 1 HG01375
53 | 1 HG01378
54 | 3 HG01390
55 | 1 HG01456
56 | 1 HG01462
57 | 1 HG01465
58 | 1 HG01488
59 | 1 HG01489
60 | 1 HG01491
61 | 1 HG01495
62 | 8 HG01551
63 | 1 HG01624
64 | 1 NA07051
65 | 1 NA12342
66 | 2 NA12383
67 | 2 NA12400
68 | 1 NA18507
69 | 2 NA18510
70 | 1 NA18519
71 | 1 NA18523
72 | 1 NA18527
73 | 1 NA18528
74 | 1 NA18532
75 | 1 NA18534
76 | 2 NA18535
77 | 1 NA18536
78 | 1 NA18539
79 | 1 NA18548
80 | 1 NA18557
81 | 2 NA18562
82 | 2 NA18573
83 | 1 NA18596
84 | 1 NA18605
85 | 1 NA18606
86 | 3 NA18616
87 | 1 NA18622
88 | 1 NA18628
89 | 1 NA18631
90 | 1 NA18634
91 | 1 NA18638
92 | 1 NA18641
93 | 1 NA18856
94 | 3 NA18868
95 | 1 NA18907
96 | 1 NA18924
97 | 2 NA18939
98 | 3 NA18956
99 | 1 NA18957
100 | 2 NA18962
101 | 1 NA18976
102 | 1 NA18990
103 | 1 NA18992
104 | 1 NA18995
105 | 1 NA19002
106 | 1 NA19005
107 | 1 NA19020
108 | 1 NA19046
109 | 1 NA19056
110 | 1 NA19059
111 | 2 NA19063
112 | 1 NA19068
113 | 1 NA19074
114 | 2 NA19077
115 | 1 NA19084
116 | 2 NA19087
117 | 1 NA19093
118 | 14 NA19096
119 | 1 NA19099
120 | 1 NA19131
121 | 1 NA19147
122 | 1 NA19149
123 | 1 NA19150
124 | 1 NA19197
125 | 1 NA19236
126 | 1 NA19248
127 | 1 NA19316
128 | 1 NA19318
129 | 1 NA19319
130 | 1 NA19324
131 | 1 NA19332
132 | 2 NA19346
133 | 1 NA19351
134 | 1 NA19360
135 | 1 NA19372
136 | 1 NA19395
137 | 1 NA19398
138 | 1 NA19401
139 | 1 NA19437
140 | 1 NA19439
141 | 1 NA19440
142 | 1 NA19457
143 | 1 NA19463
144 | 1 NA19467
145 | 1 NA19474
146 | 1 NA19661
147 | 1 NA19701
148 | 2 NA19704
149 | 1 NA19716
150 | 1 NA19717
151 | 1 NA19719
152 | 1 NA19734
153 | 1 NA19740
154 | 1 NA19749
155 | 1 NA19752
156 | 1 NA19755
157 | 1 NA19758
158 | 1 NA19761
159 | 1 NA19762
160 | 1 NA19774
161 | 2 NA19780
162 | 1 NA19782
163 | 1 NA19819
164 | 1 NA19901
165 | 1 NA19904
166 | 1 NA19921
167 | 1 NA20294
168 | 1 NA20296
169 | 2 NA20322
170 | 1 NA20342
171 | 1 NA20344
172 | 1 NA20351
173 | 1 NA20505
174 | 1 NA20506
175 | 2 NA20521
176 | 1 NA20581
177 | 1 NA20582
178 | 1 NA20589
179 | 1 NA20756
180 | 1 NA20760
181 | 1 NA20768
182 | 1 NA20792
183 | 1 NA20796
184 | 1 NA20803
185 | 1 NA20805
186 | 1 NA20809
187 | 2 NA20819
188 | 1 NA20826
189 |
--------------------------------------------------------------------------------
/CONTRIBUTING.rst:
--------------------------------------------------------------------------------
1 | How to contribute
2 | ===================================
3 |
4 | First of all, thank you for contributing!
5 |
6 | The mailing list
7 | ----------------
8 |
9 | For general questions or if you are having trouble getting started, try the
10 | `Google Genomics Discuss mailing list