Example Notebook

This notebook is an example that shows how to use VariantSpark with the Hail v0.2 library and compares the results with PCA and logistic regression.

For demonstration purposes this notebook uses the sample dataset available in ViGWAS.

We're always looking for suggestions and feedback. Please click here for a 1 minute survey

User Block

In [1]:
S3_Path='s3://example-bucket-13vqm0cs3xkcy/notebook/'

## Some configs
numCPU = 32
memory = '60G'
numPartitions = numCPU*4

Environment initialization

In [2]:
## Environment init

import os
from pyspark import SparkContext
sc = SparkContext()

import hail as hl
import varspark.hail as vshl
vshl.init(sc=sc)
using variant-spark jar at '/home/hadoop/biospark/lib/python3.6/site-packages/varspark/jars/variant-spark_2.11-0.3.0-SNAPSHOT-all.jar'
using hail jar at '/home/hadoop/biospark/lib/python3.6/site-packages/hail/hail-all-spark.jar'
using hail jar at /home/hadoop/biospark/lib/python3.6/site-packages/hail/hail-all-spark.jar
Running on Apache Spark version 2.4.4
SparkUI available at http://ip-10-0-0-95.ap-southeast-2.compute.internal:4040
Welcome to
     __  __     <>__
    / /_/ /__  __/ /
   / __  / _ `/ / /
  /_/ /_/\_,_/_/_/   version 0.2.16-6da0d3571629
LOGGING: writing to /home/hadoop/hail-20200304-0226-0.2.16-6da0d3571629.log
In [3]:
from bokeh.io import show, output_notebook
from bokeh.plotting import figure
from bokeh.models import ColumnDataSource, FactorRange, LabelSet, Label
from bokeh.transform import factor_cmap
from bokeh.palettes import d3
from bokeh.core.properties import value
from bokeh.embed import file_html
from bokeh.resources import CDN
from bokeh.layouts import gridplot
from bokeh.models.mappers import CategoricalColorMapper

from pprint import pprint
output_notebook()

import re
import numpy as np
import math as math
import sys
import operator
from collections import OrderedDict
import subprocess
from itertools import cycle
import shutil
Loading BokehJS ...

Load VCF files

In [4]:
mt = hl.import_vcf(path=S3_Path+'sample_input/V1.vcf.bgz',
                   skip_invalid_loci=True,
                   min_partitions=int(numPartitions))

Sample Annotation Data Analysis

In [5]:
Annot = hl.import_table(S3_Path+'sample_input/hipster.csv',
                        impute=True, delimiter=',').key_by('Sample')
2020-03-04 02:26:38 Hail: INFO: Reading table to impute column types
2020-03-04 02:26:40 Hail: INFO: Finished type imputation
  Loading column 'Sample' as type 'str' (imputed)
  Loading column 'Population' as type 'str' (imputed)
  Loading column 'isFemale' as type 'bool' (imputed)
  Loading column 'isCase' as type 'bool' (imputed)
  Loading column 'Hipster' as type 'int32' (imputed)

Annotate dataset with sample annotation

In [6]:
mt = mt.annotate_cols(pheno = Annot[mt.s])

PCA analysis

In [7]:
eigenvalues, pcs, loadings = hl.hwe_normalized_pca(mt.GT, k=2)
mt = mt.annotate_cols(pcs = pcs[mt.s].scores)
2020-03-04 02:26:42 Hail: INFO: Coerced sorted dataset
2020-03-04 02:26:45 Hail: INFO: hwe_normalized_pca: running PCA using 9967 variants.
2020-03-04 02:26:48 Hail: INFO: pca: running PCA with 2 components...
In [8]:
p = hl.plot.scatter(pcs.scores[0], pcs.scores[1],
                    label=mt.cols()[pcs.s].pheno.Hipster,
                    title='PCA Case/Control', xlabel='PC1', ylabel='PC2', collect_all=True)
show(p)
2020-03-04 02:26:57 Hail: WARN: cols(): Resulting column table is sorted by 'col_key'.
    To preserve matrix table column order, first unkey columns with 'key_cols_by()'
2020-03-04 02:26:57 Hail: INFO: Coerced sorted dataset
2020-03-04 02:26:58 Hail: INFO: Coerced sorted dataset

Logistic Regression

In [9]:
covariates = [mt.pheno.isFemale, mt.pcs[0], mt.pcs[1]]

result = hl.logistic_regression_rows(test ='wald', 
                                          y=mt.pheno.isCase,
                                          x=mt.GT.n_alt_alleles(),
                                          covariates=covariates)

mt = mt.annotate_rows( logreg = result[mt.locus, mt.alleles])
2020-03-04 02:26:59 Hail: WARN: logistic_regression_rows: model appears to have no intercept covariate.
    To include an intercept, add 1.0 to the list of covariates.
2020-03-04 02:27:00 Hail: INFO: logistic_regression_rows: running wald on 2504 samples for response variable y,
    with input variable x, and 3 additional covariates...
In [10]:
p = hl.plot.manhattan(result.p_value)
show(p)

Variant-Spark RandomForest

In [11]:
rf_model = vshl.random_forest_model(y=mt.pheno.isCase, x=mt.GT.n_alt_alleles(),
                                    seed = 13, mtry_fraction = 0.1,
                                    min_node_size = 10, max_depth = 15)

rf_model.fit_trees(n_trees=100, batch_size=25)

impTable = rf_model.variable_importance()

mt = mt.annotate_rows(vs_score = impTable[mt.locus, mt.alleles].importance)
2020-03-04 02:27:11 Hail: INFO: Loaded 9994 variables
2020-03-04 02:27:44 Hail: INFO: Coerced sorted dataset
In [12]:
mt = mt.annotate_rows(vs_stats = mt.aggregate_rows(hl.agg.stats(mt['vs_score'])))
mt = mt.annotate_rows(z_score = (mt['vs_score'] - mt.vs_stats.mean)/mt.vs_stats.stdev)
mt = mt.annotate_rows(vs_score_converted = 10** -mt.z_score)
title = 'Variant-Spark Manhattan plot'
hover_fields = {'rsid': mt.rsid, 'vs_score': mt.vs_score}
p = hl.plot.manhattan(pvals=mt.vs_score_converted, hover_fields=hover_fields, title=title)
p.yaxis.axis_label = 'Z score of importantce score by VS'
show(p)

Describe matrix Table

In [13]:
mt.describe()
----------------------------------------
Global fields:
    None
----------------------------------------
Column fields:
    's': str
    'pheno': struct {
        Population: str, 
        isFemale: bool, 
        isCase: bool, 
        Hipster: int32
    }
    'pcs': array<float64>
----------------------------------------
Row fields:
    'locus': locus<GRCh37>
    'alleles': array<str>
    'rsid': str
    'qual': float64
    'filters': set<str>
    'info': struct {
        CIEND: array<int32>, 
        CIPOS: array<int32>, 
        CS: str, 
        END: int32, 
        IMPRECISE: bool, 
        MC: array<str>, 
        MEINFO: array<str>, 
        MEND: int32, 
        MLEN: int32, 
        MSTART: int32, 
        SVLEN: array<int32>, 
        SVTYPE: str, 
        TSD: str, 
        AC: array<int32>, 
        AF: array<float64>, 
        NS: int32, 
        AN: int32, 
        EAS_AF: array<float64>, 
        EUR_AF: array<float64>, 
        AFR_AF: array<float64>, 
        AMR_AF: array<float64>, 
        SAS_AF: array<float64>, 
        DP: int32, 
        AA: str, 
        VT: array<str>, 
        EX_TARGET: bool, 
        MULTI_ALLELIC: bool
    }
    'logreg': struct {
        beta: float64, 
        standard_error: float64, 
        z_stat: float64, 
        p_value: float64, 
        fit: struct {
            n_iterations: int32, 
            converged: bool, 
            exploded: bool
        }
    }
    'vs_score': float64
    'vs_stats': struct {
        mean: float64, 
        stdev: float64, 
        min: float64, 
        max: float64, 
        n: int32, 
        sum: float64
    }
    'z_score': float64
    'vs_score_converted': float64
----------------------------------------
Entry fields:
    'GT': call
    'AD': array<int32>
    'DP': int32
    'GQ': int32
    'PL': array<int32>
----------------------------------------
Column key: ['s']
Row key: ['locus', 'alleles']
----------------------------------------

Write MT to S3

In [14]:
mt.write(S3_Path+'sample_output/dataset.mt',overwrite=True)
2020-03-04 02:27:55 Hail: INFO: wrote matrix table with 9994 rows and 2504 columns in 128 partitions to s3://example-bucket-13vqm0cs3xkcy/notebook/sample_output/dataset.mt