S3_Path='s3://example-bucket-13vqm0cs3xkcy/notebook/'
## Some configs
numCPU = 32
memory = '60G'
numPartitions = numCPU*4
## Environment init
import os
from pyspark import SparkContext
sc = SparkContext()
import hail as hl
import varspark.hail as vshl
vshl.init(sc=sc)
from bokeh.io import show, output_notebook
from bokeh.plotting import figure
from bokeh.models import ColumnDataSource, FactorRange, LabelSet, Label
from bokeh.transform import factor_cmap
from bokeh.palettes import d3
from bokeh.core.properties import value
from bokeh.embed import file_html
from bokeh.resources import CDN
from bokeh.layouts import gridplot
from bokeh.models.mappers import CategoricalColorMapper
from pprint import pprint
output_notebook()
import re
import numpy as np
import math as math
import sys
import operator
from collections import OrderedDict
import subprocess
from itertools import cycle
import shutil
mt = hl.import_vcf(path=S3_Path+'sample_input/V1.vcf.bgz',
skip_invalid_loci=True,
min_partitions=int(numPartitions))
Annot = hl.import_table(S3_Path+'sample_input/hipster.csv',
impute=True, delimiter=',').key_by('Sample')
mt = mt.annotate_cols(pheno = Annot[mt.s])
eigenvalues, pcs, loadings = hl.hwe_normalized_pca(mt.GT, k=2)
mt = mt.annotate_cols(pcs = pcs[mt.s].scores)
p = hl.plot.scatter(pcs.scores[0], pcs.scores[1],
label=mt.cols()[pcs.s].pheno.Hipster,
title='PCA Case/Control', xlabel='PC1', ylabel='PC2', collect_all=True)
show(p)
covariates = [mt.pheno.isFemale, mt.pcs[0], mt.pcs[1]]
result = hl.logistic_regression_rows(test ='wald',
y=mt.pheno.isCase,
x=mt.GT.n_alt_alleles(),
covariates=covariates)
mt = mt.annotate_rows( logreg = result[mt.locus, mt.alleles])
p = hl.plot.manhattan(result.p_value)
show(p)
rf_model = vshl.random_forest_model(y=mt.pheno.isCase, x=mt.GT.n_alt_alleles(),
seed = 13, mtry_fraction = 0.1,
min_node_size = 10, max_depth = 15)
rf_model.fit_trees(n_trees=100, batch_size=25)
impTable = rf_model.variable_importance()
mt = mt.annotate_rows(vs_score = impTable[mt.locus, mt.alleles].importance)
mt = mt.annotate_rows(vs_stats = mt.aggregate_rows(hl.agg.stats(mt['vs_score'])))
mt = mt.annotate_rows(z_score = (mt['vs_score'] - mt.vs_stats.mean)/mt.vs_stats.stdev)
mt = mt.annotate_rows(vs_score_converted = 10** -mt.z_score)
title = 'Variant-Spark Manhattan plot'
hover_fields = {'rsid': mt.rsid, 'vs_score': mt.vs_score}
p = hl.plot.manhattan(pvals=mt.vs_score_converted, hover_fields=hover_fields, title=title)
p.yaxis.axis_label = 'Z score of importantce score by VS'
show(p)
mt.describe()
mt.write(S3_Path+'sample_output/dataset.mt',overwrite=True)