## Load Hail
import hail as hl
hl.init(log = 'hail_20210717.log')
Running on Apache Spark version 3.1.1 SparkUI available at http://ubuntu-a7:4040 Welcome to __ __ <>__ / /_/ /__ __/ / / __ / _ `/ / / /_/ /_/\_,_/_/_/ version 0.2.71-f3a54b530979 LOGGING: writing to hail_20210717.log
## Download 1kg data
hl.utils.get_1kg('data/')
2021-07-26 00:22:51 Hail: INFO: 1KG files found
## Import VCF & Write Matrix Table
hl.import_vcf('data/1kg.vcf.bgz').write('data/1kg.mt', overwrite=True)
2021-07-26 00:22:54 Hail: INFO: Coerced sorted dataset 2021-07-26 00:22:59 Hail: INFO: wrote matrix table with 10879 rows and 284 columns in 1 partition to data/1kg.mt Total size: 16.09 MiB * Rows/entries: 16.09 MiB * Columns: 1.31 KiB * Globals: 11.00 B * Smallest partition: 10879 rows (16.09 MiB) * Largest partition: 10879 rows (16.09 MiB)
## Read Matrix Table
mt = hl.read_matrix_table('data/1kg.mt')
mt.describe()
---------------------------------------- Global fields: None ---------------------------------------- Column fields: 's': str ---------------------------------------- Row fields: 'locus': locus<GRCh37> 'alleles': array<str> 'rsid': str 'qual': float64 'filters': set<str> 'info': struct { AC: array<int32>, AF: array<float64>, AN: int32, BaseQRankSum: float64, ClippingRankSum: float64, DP: int32, DS: bool, FS: float64, HaplotypeScore: float64, InbreedingCoeff: float64, MLEAC: array<int32>, MLEAF: array<float64>, MQ: float64, MQ0: int32, MQRankSum: float64, QD: float64, ReadPosRankSum: float64, set: str } ---------------------------------------- Entry fields: 'GT': call 'AD': array<int32> 'DP': int32 'GQ': int32 'PL': array<int32> ---------------------------------------- Column key: ['s'] Row key: ['locus', 'alleles'] ----------------------------------------
mt.rows().describe()
---------------------------------------- Global fields: None ---------------------------------------- Row fields: 'locus': locus<GRCh37> 'alleles': array<str> 'rsid': str 'qual': float64 'filters': set<str> 'info': struct { AC: array<int32>, AF: array<float64>, AN: int32, BaseQRankSum: float64, ClippingRankSum: float64, DP: int32, DS: bool, FS: float64, HaplotypeScore: float64, InbreedingCoeff: float64, MLEAC: array<int32>, MLEAF: array<float64>, MQ: float64, MQ0: int32, MQRankSum: float64, QD: float64, ReadPosRankSum: float64, set: str } ---------------------------------------- Key: ['locus', 'alleles'] ----------------------------------------
## Import bokeh module
from bokeh.io import show, output_notebook
from bokeh.layouts import gridplot
output_notebook()
mt = mt.filter_rows(mt.info.AF[0] > 0.05)
p1 = hl.plot.histogram(mt.info.AF[0], range=(0,1), title='MAF')
mt = mt.filter_entries(mt.GQ >= 25)
p2 = hl.plot.histogram(mt.GQ, range=(0, 100), title='GQ')
show(gridplot([p1, p2], ncols=2, plot_width=400, plot_height=400))
mt = mt.annotate_entries(n_alt = mt.GT.n_alt_alleles())
mt.n_alt.show()
showing top 10 rows
showing the first 3 of 284 columns
## Load Table
table = (hl.import_table('data/1kg_annotations.txt', impute=True)
.key_by('Sample'))
table.describe()
2021-07-26 00:23:08 Hail: INFO: Reading table to impute column types
---------------------------------------- Global fields: None ---------------------------------------- Row fields: 'Sample': str 'Population': str 'SuperPopulation': str 'isFemale': bool 'PurpleHair': bool 'CaffeineConsumption': int32 ---------------------------------------- Key: ['Sample'] ----------------------------------------
2021-07-26 00:23:09 Hail: INFO: Finished type imputation Loading field 'Sample' as type str (imputed) Loading field 'Population' as type str (imputed) Loading field 'SuperPopulation' as type str (imputed) Loading field 'isFemale' as type bool (imputed) Loading field 'PurpleHair' as type bool (imputed) Loading field 'CaffeineConsumption' as type int32 (imputed)
table.show()
showing top 10 rows
mt.annotate_cols(isFemale = table[mt.s].isFemale).describe()
---------------------------------------- Global fields: None ---------------------------------------- Column fields: 's': str 'isFemale': bool ---------------------------------------- Row fields: 'locus': locus<GRCh37> 'alleles': array<str> 'rsid': str 'qual': float64 'filters': set<str> 'info': struct { AC: array<int32>, AF: array<float64>, AN: int32, BaseQRankSum: float64, ClippingRankSum: float64, DP: int32, DS: bool, FS: float64, HaplotypeScore: float64, InbreedingCoeff: float64, MLEAC: array<int32>, MLEAF: array<float64>, MQ: float64, MQ0: int32, MQRankSum: float64, QD: float64, ReadPosRankSum: float64, set: str } ---------------------------------------- Entry fields: 'GT': call 'AD': array<int32> 'DP': int32 'GQ': int32 'PL': array<int32> 'n_alt': int32 ---------------------------------------- Column key: ['s'] Row key: ['locus', 'alleles'] ----------------------------------------
mt.annotate_cols(**table[mt.s]).describe()
---------------------------------------- Global fields: None ---------------------------------------- Column fields: 's': str 'Population': str 'SuperPopulation': str 'isFemale': bool 'PurpleHair': bool 'CaffeineConsumption': int32 ---------------------------------------- Row fields: 'locus': locus<GRCh37> 'alleles': array<str> 'rsid': str 'qual': float64 'filters': set<str> 'info': struct { AC: array<int32>, AF: array<float64>, AN: int32, BaseQRankSum: float64, ClippingRankSum: float64, DP: int32, DS: bool, FS: float64, HaplotypeScore: float64, InbreedingCoeff: float64, MLEAC: array<int32>, MLEAF: array<float64>, MQ: float64, MQ0: int32, MQRankSum: float64, QD: float64, ReadPosRankSum: float64, set: str } ---------------------------------------- Entry fields: 'GT': call 'AD': array<int32> 'DP': int32 'GQ': int32 'PL': array<int32> 'n_alt': int32 ---------------------------------------- Column key: ['s'] Row key: ['locus', 'alleles'] ----------------------------------------
mt = mt.annotate_cols(pheno = table[mt.s])
mt.describe()
---------------------------------------- Global fields: None ---------------------------------------- Column fields: 's': str 'pheno': struct { Population: str, SuperPopulation: str, isFemale: bool, PurpleHair: bool, CaffeineConsumption: int32 } ---------------------------------------- Row fields: 'locus': locus<GRCh37> 'alleles': array<str> 'rsid': str 'qual': float64 'filters': set<str> 'info': struct { AC: array<int32>, AF: array<float64>, AN: int32, BaseQRankSum: float64, ClippingRankSum: float64, DP: int32, DS: bool, FS: float64, HaplotypeScore: float64, InbreedingCoeff: float64, MLEAC: array<int32>, MLEAF: array<float64>, MQ: float64, MQ0: int32, MQRankSum: float64, QD: float64, ReadPosRankSum: float64, set: str } ---------------------------------------- Entry fields: 'GT': call 'AD': array<int32> 'DP': int32 'GQ': int32 'PL': array<int32> 'n_alt': int32 ---------------------------------------- Column key: ['s'] Row key: ['locus', 'alleles'] ----------------------------------------
mt.select_entries(mt.GT).describe()
---------------------------------------- Global fields: None ---------------------------------------- Column fields: 's': str 'pheno': struct { Population: str, SuperPopulation: str, isFemale: bool, PurpleHair: bool, CaffeineConsumption: int32 } ---------------------------------------- Row fields: 'locus': locus<GRCh37> 'alleles': array<str> 'rsid': str 'qual': float64 'filters': set<str> 'info': struct { AC: array<int32>, AF: array<float64>, AN: int32, BaseQRankSum: float64, ClippingRankSum: float64, DP: int32, DS: bool, FS: float64, HaplotypeScore: float64, InbreedingCoeff: float64, MLEAC: array<int32>, MLEAF: array<float64>, MQ: float64, MQ0: int32, MQRankSum: float64, QD: float64, ReadPosRankSum: float64, set: str } ---------------------------------------- Entry fields: 'GT': call ---------------------------------------- Column key: ['s'] Row key: ['locus', 'alleles'] ----------------------------------------
mt.select_rows().describe()
---------------------------------------- Global fields: None ---------------------------------------- Column fields: 's': str 'pheno': struct { Population: str, SuperPopulation: str, isFemale: bool, PurpleHair: bool, CaffeineConsumption: int32 } ---------------------------------------- Row fields: 'locus': locus<GRCh37> 'alleles': array<str> ---------------------------------------- Entry fields: 'GT': call 'AD': array<int32> 'DP': int32 'GQ': int32 'PL': array<int32> 'n_alt': int32 ---------------------------------------- Column key: ['s'] Row key: ['locus', 'alleles'] ----------------------------------------
mt.select_cols(pop = mt.pheno.Population).describe()
---------------------------------------- Global fields: None ---------------------------------------- Column fields: 's': str 'pop': str ---------------------------------------- Row fields: 'locus': locus<GRCh37> 'alleles': array<str> 'rsid': str 'qual': float64 'filters': set<str> 'info': struct { AC: array<int32>, AF: array<float64>, AN: int32, BaseQRankSum: float64, ClippingRankSum: float64, DP: int32, DS: bool, FS: float64, HaplotypeScore: float64, InbreedingCoeff: float64, MLEAC: array<int32>, MLEAF: array<float64>, MQ: float64, MQ0: int32, MQRankSum: float64, QD: float64, ReadPosRankSum: float64, set: str } ---------------------------------------- Entry fields: 'GT': call 'AD': array<int32> 'DP': int32 'GQ': int32 'PL': array<int32> 'n_alt': int32 ---------------------------------------- Column key: ['s'] Row key: ['locus', 'alleles'] ----------------------------------------
mt.select_cols(**mt.pheno).describe()
---------------------------------------- Global fields: None ---------------------------------------- Column fields: 's': str 'Population': str 'SuperPopulation': str 'isFemale': bool 'PurpleHair': bool 'CaffeineConsumption': int32 ---------------------------------------- Row fields: 'locus': locus<GRCh37> 'alleles': array<str> 'rsid': str 'qual': float64 'filters': set<str> 'info': struct { AC: array<int32>, AF: array<float64>, AN: int32, BaseQRankSum: float64, ClippingRankSum: float64, DP: int32, DS: bool, FS: float64, HaplotypeScore: float64, InbreedingCoeff: float64, MLEAC: array<int32>, MLEAF: array<float64>, MQ: float64, MQ0: int32, MQRankSum: float64, QD: float64, ReadPosRankSum: float64, set: str } ---------------------------------------- Entry fields: 'GT': call 'AD': array<int32> 'DP': int32 'GQ': int32 'PL': array<int32> 'n_alt': int32 ---------------------------------------- Column key: ['s'] Row key: ['locus', 'alleles'] ----------------------------------------
Cols = mt.cols()
Cols.flatten().describe()
---------------------------------------- Global fields: None ---------------------------------------- Row fields: 's': str 'pheno.Population': str 'pheno.SuperPopulation': str 'pheno.isFemale': bool 'pheno.PurpleHair': bool 'pheno.CaffeineConsumption': int32 ---------------------------------------- Key: [] ----------------------------------------
2021-07-26 00:23:11 Hail: WARN: cols(): Resulting column table is sorted by 'col_key'. To preserve matrix table column order, first unkey columns with 'key_cols_by()'
## Export PLINK file
hl.export_plink(mt, 'data/1kg',
ind_id = mt.s,
is_female = mt.pheno.isFemale,
pheno = mt.pheno.PurpleHair)
2021-07-26 00:23:23 Hail: INFO: merging 2 files totalling 597.7K... 2021-07-26 00:23:23 Hail: INFO: while writing: data/1kg.bed merge time: 38.794ms 2021-07-26 00:23:23 Hail: INFO: merging 1 files totalling 297.1K... 2021-07-26 00:23:23 Hail: INFO: while writing: data/1kg.bim merge time: 24.467ms 2021-07-26 00:23:26 Hail: INFO: merging 40 files totalling 5.0K... 2021-07-26 00:23:26 Hail: INFO: while writing: data/1kg.fam merge time: 78.735ms 2021-07-26 00:23:26 Hail: INFO: wrote 8620 variants and 284 samples to 'data/1kg'