In [1]:
## Load Hail
import hail as hl
hl.init(log = 'hail_20210717.log')
Running on Apache Spark version 3.1.1
SparkUI available at http://ubuntu-a7:4040
Welcome to
     __  __     <>__
    / /_/ /__  __/ /
   / __  / _ `/ / /
  /_/ /_/\_,_/_/_/   version 0.2.71-f3a54b530979
LOGGING: writing to hail_20210717.log
In [2]:
## Download 1kg data
hl.utils.get_1kg('data/')
2021-07-26 00:22:51 Hail: INFO: 1KG files found
In [3]:
## Import VCF & Write Matrix Table
hl.import_vcf('data/1kg.vcf.bgz').write('data/1kg.mt', overwrite=True)
2021-07-26 00:22:54 Hail: INFO: Coerced sorted dataset
2021-07-26 00:22:59 Hail: INFO: wrote matrix table with 10879 rows and 284 columns in 1 partition to data/1kg.mt
    Total size: 16.09 MiB
    * Rows/entries: 16.09 MiB
    * Columns: 1.31 KiB
    * Globals: 11.00 B
    * Smallest partition: 10879 rows (16.09 MiB)
    * Largest partition:  10879 rows (16.09 MiB)
In [4]:
## Read Matrix Table
mt = hl.read_matrix_table('data/1kg.mt')
In [5]:
mt.describe()
----------------------------------------
Global fields:
    None
----------------------------------------
Column fields:
    's': str
----------------------------------------
Row fields:
    'locus': locus<GRCh37>
    'alleles': array<str>
    'rsid': str
    'qual': float64
    'filters': set<str>
    'info': struct {
        AC: array<int32>, 
        AF: array<float64>, 
        AN: int32, 
        BaseQRankSum: float64, 
        ClippingRankSum: float64, 
        DP: int32, 
        DS: bool, 
        FS: float64, 
        HaplotypeScore: float64, 
        InbreedingCoeff: float64, 
        MLEAC: array<int32>, 
        MLEAF: array<float64>, 
        MQ: float64, 
        MQ0: int32, 
        MQRankSum: float64, 
        QD: float64, 
        ReadPosRankSum: float64, 
        set: str
    }
----------------------------------------
Entry fields:
    'GT': call
    'AD': array<int32>
    'DP': int32
    'GQ': int32
    'PL': array<int32>
----------------------------------------
Column key: ['s']
Row key: ['locus', 'alleles']
----------------------------------------
In [6]:
mt.rows().describe()
----------------------------------------
Global fields:
    None
----------------------------------------
Row fields:
    'locus': locus<GRCh37> 
    'alleles': array<str> 
    'rsid': str 
    'qual': float64 
    'filters': set<str> 
    'info': struct {
        AC: array<int32>, 
        AF: array<float64>, 
        AN: int32, 
        BaseQRankSum: float64, 
        ClippingRankSum: float64, 
        DP: int32, 
        DS: bool, 
        FS: float64, 
        HaplotypeScore: float64, 
        InbreedingCoeff: float64, 
        MLEAC: array<int32>, 
        MLEAF: array<float64>, 
        MQ: float64, 
        MQ0: int32, 
        MQRankSum: float64, 
        QD: float64, 
        ReadPosRankSum: float64, 
        set: str
    } 
----------------------------------------
Key: ['locus', 'alleles']
----------------------------------------
In [7]:
## Import bokeh module
from bokeh.io import show, output_notebook
from bokeh.layouts import gridplot
output_notebook()
Loading BokehJS ...
In [8]:
mt = mt.filter_rows(mt.info.AF[0] > 0.05)
p1 = hl.plot.histogram(mt.info.AF[0], range=(0,1), title='MAF')

mt = mt.filter_entries(mt.GQ >= 25)
p2 = hl.plot.histogram(mt.GQ, range=(0, 100), title='GQ')

show(gridplot([p1, p2], ncols=2, plot_width=400, plot_height=400))
In [9]:
mt = mt.annotate_entries(n_alt = mt.GT.n_alt_alleles())
mt.n_alt.show()
'HG00096'
'HG00099'
'HG00105'
locus
alleles
n_alt
n_alt
n_alt
locus<GRCh37>array<str>int32int32int32
1:904165["G","A"]NANANA
1:1707740["T","G"]111
1:2284195["T","C"]NA11
1:2779043["T","C"]11NA
1:2944527["G","A"]NA11
1:3803755["T","C"]NA21
1:4121584["A","G"]1NA0
1:4170048["C","T"]NANA1
1:4180842["C","T"]11NA
1:6053630["T","G"]NANANA

showing top 10 rows

showing the first 3 of 284 columns

In [10]:
## Load Table
table = (hl.import_table('data/1kg_annotations.txt', impute=True)
           .key_by('Sample'))
table.describe()
2021-07-26 00:23:08 Hail: INFO: Reading table to impute column types
----------------------------------------
Global fields:
    None
----------------------------------------
Row fields:
    'Sample': str 
    'Population': str 
    'SuperPopulation': str 
    'isFemale': bool 
    'PurpleHair': bool 
    'CaffeineConsumption': int32 
----------------------------------------
Key: ['Sample']
----------------------------------------
2021-07-26 00:23:09 Hail: INFO: Finished type imputation
  Loading field 'Sample' as type str (imputed)
  Loading field 'Population' as type str (imputed)
  Loading field 'SuperPopulation' as type str (imputed)
  Loading field 'isFemale' as type bool (imputed)
  Loading field 'PurpleHair' as type bool (imputed)
  Loading field 'CaffeineConsumption' as type int32 (imputed)
In [11]:
table.show()
Sample
Population
SuperPopulation
isFemale
PurpleHair
CaffeineConsumption
strstrstrboolboolint32
"HG00096""GBR""EUR"FalseFalse4
"HG00097""GBR""EUR"TrueTrue4
"HG00098""GBR""EUR"FalseFalse5
"HG00099""GBR""EUR"TrueFalse4
"HG00100""GBR""EUR"TrueFalse5
"HG00101""GBR""EUR"FalseTrue1
"HG00102""GBR""EUR"TrueTrue6
"HG00103""GBR""EUR"FalseTrue5
"HG00104""GBR""EUR"TrueFalse5
"HG00105""GBR""EUR"FalseFalse4

showing top 10 rows

In [12]:
mt.annotate_cols(isFemale = table[mt.s].isFemale).describe()
----------------------------------------
Global fields:
    None
----------------------------------------
Column fields:
    's': str
    'isFemale': bool
----------------------------------------
Row fields:
    'locus': locus<GRCh37>
    'alleles': array<str>
    'rsid': str
    'qual': float64
    'filters': set<str>
    'info': struct {
        AC: array<int32>, 
        AF: array<float64>, 
        AN: int32, 
        BaseQRankSum: float64, 
        ClippingRankSum: float64, 
        DP: int32, 
        DS: bool, 
        FS: float64, 
        HaplotypeScore: float64, 
        InbreedingCoeff: float64, 
        MLEAC: array<int32>, 
        MLEAF: array<float64>, 
        MQ: float64, 
        MQ0: int32, 
        MQRankSum: float64, 
        QD: float64, 
        ReadPosRankSum: float64, 
        set: str
    }
----------------------------------------
Entry fields:
    'GT': call
    'AD': array<int32>
    'DP': int32
    'GQ': int32
    'PL': array<int32>
    'n_alt': int32
----------------------------------------
Column key: ['s']
Row key: ['locus', 'alleles']
----------------------------------------
In [13]:
mt.annotate_cols(**table[mt.s]).describe()
----------------------------------------
Global fields:
    None
----------------------------------------
Column fields:
    's': str
    'Population': str
    'SuperPopulation': str
    'isFemale': bool
    'PurpleHair': bool
    'CaffeineConsumption': int32
----------------------------------------
Row fields:
    'locus': locus<GRCh37>
    'alleles': array<str>
    'rsid': str
    'qual': float64
    'filters': set<str>
    'info': struct {
        AC: array<int32>, 
        AF: array<float64>, 
        AN: int32, 
        BaseQRankSum: float64, 
        ClippingRankSum: float64, 
        DP: int32, 
        DS: bool, 
        FS: float64, 
        HaplotypeScore: float64, 
        InbreedingCoeff: float64, 
        MLEAC: array<int32>, 
        MLEAF: array<float64>, 
        MQ: float64, 
        MQ0: int32, 
        MQRankSum: float64, 
        QD: float64, 
        ReadPosRankSum: float64, 
        set: str
    }
----------------------------------------
Entry fields:
    'GT': call
    'AD': array<int32>
    'DP': int32
    'GQ': int32
    'PL': array<int32>
    'n_alt': int32
----------------------------------------
Column key: ['s']
Row key: ['locus', 'alleles']
----------------------------------------
In [14]:
mt = mt.annotate_cols(pheno = table[mt.s])
mt.describe()
----------------------------------------
Global fields:
    None
----------------------------------------
Column fields:
    's': str
    'pheno': struct {
        Population: str, 
        SuperPopulation: str, 
        isFemale: bool, 
        PurpleHair: bool, 
        CaffeineConsumption: int32
    }
----------------------------------------
Row fields:
    'locus': locus<GRCh37>
    'alleles': array<str>
    'rsid': str
    'qual': float64
    'filters': set<str>
    'info': struct {
        AC: array<int32>, 
        AF: array<float64>, 
        AN: int32, 
        BaseQRankSum: float64, 
        ClippingRankSum: float64, 
        DP: int32, 
        DS: bool, 
        FS: float64, 
        HaplotypeScore: float64, 
        InbreedingCoeff: float64, 
        MLEAC: array<int32>, 
        MLEAF: array<float64>, 
        MQ: float64, 
        MQ0: int32, 
        MQRankSum: float64, 
        QD: float64, 
        ReadPosRankSum: float64, 
        set: str
    }
----------------------------------------
Entry fields:
    'GT': call
    'AD': array<int32>
    'DP': int32
    'GQ': int32
    'PL': array<int32>
    'n_alt': int32
----------------------------------------
Column key: ['s']
Row key: ['locus', 'alleles']
----------------------------------------
In [15]:
mt.select_entries(mt.GT).describe()
----------------------------------------
Global fields:
    None
----------------------------------------
Column fields:
    's': str
    'pheno': struct {
        Population: str, 
        SuperPopulation: str, 
        isFemale: bool, 
        PurpleHair: bool, 
        CaffeineConsumption: int32
    }
----------------------------------------
Row fields:
    'locus': locus<GRCh37>
    'alleles': array<str>
    'rsid': str
    'qual': float64
    'filters': set<str>
    'info': struct {
        AC: array<int32>, 
        AF: array<float64>, 
        AN: int32, 
        BaseQRankSum: float64, 
        ClippingRankSum: float64, 
        DP: int32, 
        DS: bool, 
        FS: float64, 
        HaplotypeScore: float64, 
        InbreedingCoeff: float64, 
        MLEAC: array<int32>, 
        MLEAF: array<float64>, 
        MQ: float64, 
        MQ0: int32, 
        MQRankSum: float64, 
        QD: float64, 
        ReadPosRankSum: float64, 
        set: str
    }
----------------------------------------
Entry fields:
    'GT': call
----------------------------------------
Column key: ['s']
Row key: ['locus', 'alleles']
----------------------------------------
In [16]:
mt.select_rows().describe()
----------------------------------------
Global fields:
    None
----------------------------------------
Column fields:
    's': str
    'pheno': struct {
        Population: str, 
        SuperPopulation: str, 
        isFemale: bool, 
        PurpleHair: bool, 
        CaffeineConsumption: int32
    }
----------------------------------------
Row fields:
    'locus': locus<GRCh37>
    'alleles': array<str>
----------------------------------------
Entry fields:
    'GT': call
    'AD': array<int32>
    'DP': int32
    'GQ': int32
    'PL': array<int32>
    'n_alt': int32
----------------------------------------
Column key: ['s']
Row key: ['locus', 'alleles']
----------------------------------------
In [17]:
mt.select_cols(pop = mt.pheno.Population).describe()
----------------------------------------
Global fields:
    None
----------------------------------------
Column fields:
    's': str
    'pop': str
----------------------------------------
Row fields:
    'locus': locus<GRCh37>
    'alleles': array<str>
    'rsid': str
    'qual': float64
    'filters': set<str>
    'info': struct {
        AC: array<int32>, 
        AF: array<float64>, 
        AN: int32, 
        BaseQRankSum: float64, 
        ClippingRankSum: float64, 
        DP: int32, 
        DS: bool, 
        FS: float64, 
        HaplotypeScore: float64, 
        InbreedingCoeff: float64, 
        MLEAC: array<int32>, 
        MLEAF: array<float64>, 
        MQ: float64, 
        MQ0: int32, 
        MQRankSum: float64, 
        QD: float64, 
        ReadPosRankSum: float64, 
        set: str
    }
----------------------------------------
Entry fields:
    'GT': call
    'AD': array<int32>
    'DP': int32
    'GQ': int32
    'PL': array<int32>
    'n_alt': int32
----------------------------------------
Column key: ['s']
Row key: ['locus', 'alleles']
----------------------------------------
In [18]:
mt.select_cols(**mt.pheno).describe()
----------------------------------------
Global fields:
    None
----------------------------------------
Column fields:
    's': str
    'Population': str
    'SuperPopulation': str
    'isFemale': bool
    'PurpleHair': bool
    'CaffeineConsumption': int32
----------------------------------------
Row fields:
    'locus': locus<GRCh37>
    'alleles': array<str>
    'rsid': str
    'qual': float64
    'filters': set<str>
    'info': struct {
        AC: array<int32>, 
        AF: array<float64>, 
        AN: int32, 
        BaseQRankSum: float64, 
        ClippingRankSum: float64, 
        DP: int32, 
        DS: bool, 
        FS: float64, 
        HaplotypeScore: float64, 
        InbreedingCoeff: float64, 
        MLEAC: array<int32>, 
        MLEAF: array<float64>, 
        MQ: float64, 
        MQ0: int32, 
        MQRankSum: float64, 
        QD: float64, 
        ReadPosRankSum: float64, 
        set: str
    }
----------------------------------------
Entry fields:
    'GT': call
    'AD': array<int32>
    'DP': int32
    'GQ': int32
    'PL': array<int32>
    'n_alt': int32
----------------------------------------
Column key: ['s']
Row key: ['locus', 'alleles']
----------------------------------------
In [19]:
Cols = mt.cols()
Cols.flatten().describe()
----------------------------------------
Global fields:
    None
----------------------------------------
Row fields:
    's': str 
    'pheno.Population': str 
    'pheno.SuperPopulation': str 
    'pheno.isFemale': bool 
    'pheno.PurpleHair': bool 
    'pheno.CaffeineConsumption': int32 
----------------------------------------
Key: []
----------------------------------------
2021-07-26 00:23:11 Hail: WARN: cols(): Resulting column table is sorted by 'col_key'.
    To preserve matrix table column order, first unkey columns with 'key_cols_by()'
In [20]:
## Export PLINK file
hl.export_plink(mt, 'data/1kg',
                ind_id = mt.s,
                is_female = mt.pheno.isFemale,
                pheno = mt.pheno.PurpleHair)
2021-07-26 00:23:23 Hail: INFO: merging 2 files totalling 597.7K...
2021-07-26 00:23:23 Hail: INFO: while writing:
    data/1kg.bed
  merge time: 38.794ms
2021-07-26 00:23:23 Hail: INFO: merging 1 files totalling 297.1K...
2021-07-26 00:23:23 Hail: INFO: while writing:
    data/1kg.bim
  merge time: 24.467ms
2021-07-26 00:23:26 Hail: INFO: merging 40 files totalling 5.0K...
2021-07-26 00:23:26 Hail: INFO: while writing:
    data/1kg.fam
  merge time: 78.735ms
2021-07-26 00:23:26 Hail: INFO: wrote 8620 variants and 284 samples to 'data/1kg'