## Load Hail
import hail as hl
hl.init(log = 'hail_20210717.log')

Running on Apache Spark version 3.1.1
SparkUI available at http://ubuntu-a7:4040
Welcome to
     __  __     <>__
    / /_/ /__  __/ /
   / __  / _ `/ / /
  /_/ /_/\_,_/_/_/   version 0.2.71-f3a54b530979
LOGGING: writing to hail_20210717.log


## Download 1kg data
hl.utils.get_1kg('data/')

2021-07-26 00:22:51 Hail: INFO: 1KG files found


## Import VCF & Write Matrix Table
hl.import_vcf('data/1kg.vcf.bgz').write('data/1kg.mt', overwrite=True)

2021-07-26 00:22:54 Hail: INFO: Coerced sorted dataset
2021-07-26 00:22:59 Hail: INFO: wrote matrix table with 10879 rows and 284 columns in 1 partition to data/1kg.mt
    Total size: 16.09 MiB
    * Rows/entries: 16.09 MiB
    * Columns: 1.31 KiB
    * Globals: 11.00 B
    * Smallest partition: 10879 rows (16.09 MiB)
    * Largest partition:  10879 rows (16.09 MiB)


## Read Matrix Table
mt = hl.read_matrix_table('data/1kg.mt')


mt.describe()

----------------------------------------
Global fields:
    None
----------------------------------------
Column fields:
    's': str
----------------------------------------
Row fields:
    'locus': locus<GRCh37>
    'alleles': array<str>
    'rsid': str
    'qual': float64
    'filters': set<str>
    'info': struct {
        AC: array<int32>, 
        AF: array<float64>, 
        AN: int32, 
        BaseQRankSum: float64, 
        ClippingRankSum: float64, 
        DP: int32, 
        DS: bool, 
        FS: float64, 
        HaplotypeScore: float64, 
        InbreedingCoeff: float64, 
        MLEAC: array<int32>, 
        MLEAF: array<float64>, 
        MQ: float64, 
        MQ0: int32, 
        MQRankSum: float64, 
        QD: float64, 
        ReadPosRankSum: float64, 
        set: str
    }
----------------------------------------
Entry fields:
    'GT': call
    'AD': array<int32>
    'DP': int32
    'GQ': int32
    'PL': array<int32>
----------------------------------------
Column key: ['s']
Row key: ['locus', 'alleles']
----------------------------------------


mt.rows().describe()

----------------------------------------
Global fields:
    None
----------------------------------------
Row fields:
    'locus': locus<GRCh37> 
    'alleles': array<str> 
    'rsid': str 
    'qual': float64 
    'filters': set<str> 
    'info': struct {
        AC: array<int32>, 
        AF: array<float64>, 
        AN: int32, 
        BaseQRankSum: float64, 
        ClippingRankSum: float64, 
        DP: int32, 
        DS: bool, 
        FS: float64, 
        HaplotypeScore: float64, 
        InbreedingCoeff: float64, 
        MLEAC: array<int32>, 
        MLEAF: array<float64>, 
        MQ: float64, 
        MQ0: int32, 
        MQRankSum: float64, 
        QD: float64, 
        ReadPosRankSum: float64, 
        set: str
    } 
----------------------------------------
Key: ['locus', 'alleles']
----------------------------------------


## Import bokeh module
from bokeh.io import show, output_notebook
from bokeh.layouts import gridplot
output_notebook()


mt = mt.filter_rows(mt.info.AF[0] > 0.05)
p1 = hl.plot.histogram(mt.info.AF[0], range=(0,1), title='MAF')

mt = mt.filter_entries(mt.GQ >= 25)
p2 = hl.plot.histogram(mt.GQ, range=(0, 100), title='GQ')

show(gridplot([p1, p2], ncols=2, plot_width=400, plot_height=400))


mt = mt.annotate_entries(n_alt = mt.GT.n_alt_alleles())
mt.n_alt.show()


## Load Table
table = (hl.import_table('data/1kg_annotations.txt', impute=True)
           .key_by('Sample'))
table.describe()

2021-07-26 00:23:08 Hail: INFO: Reading table to impute column types

----------------------------------------
Global fields:
    None
----------------------------------------
Row fields:
    'Sample': str 
    'Population': str 
    'SuperPopulation': str 
    'isFemale': bool 
    'PurpleHair': bool 
    'CaffeineConsumption': int32 
----------------------------------------
Key: ['Sample']
----------------------------------------

2021-07-26 00:23:09 Hail: INFO: Finished type imputation
  Loading field 'Sample' as type str (imputed)
  Loading field 'Population' as type str (imputed)
  Loading field 'SuperPopulation' as type str (imputed)
  Loading field 'isFemale' as type bool (imputed)
  Loading field 'PurpleHair' as type bool (imputed)
  Loading field 'CaffeineConsumption' as type int32 (imputed)


table.show()


mt.annotate_cols(isFemale = table[mt.s].isFemale).describe()

----------------------------------------
Global fields:
    None
----------------------------------------
Column fields:
    's': str
    'isFemale': bool
----------------------------------------
Row fields:
    'locus': locus<GRCh37>
    'alleles': array<str>
    'rsid': str
    'qual': float64
    'filters': set<str>
    'info': struct {
        AC: array<int32>, 
        AF: array<float64>, 
        AN: int32, 
        BaseQRankSum: float64, 
        ClippingRankSum: float64, 
        DP: int32, 
        DS: bool, 
        FS: float64, 
        HaplotypeScore: float64, 
        InbreedingCoeff: float64, 
        MLEAC: array<int32>, 
        MLEAF: array<float64>, 
        MQ: float64, 
        MQ0: int32, 
        MQRankSum: float64, 
        QD: float64, 
        ReadPosRankSum: float64, 
        set: str
    }
----------------------------------------
Entry fields:
    'GT': call
    'AD': array<int32>
    'DP': int32
    'GQ': int32
    'PL': array<int32>
    'n_alt': int32
----------------------------------------
Column key: ['s']
Row key: ['locus', 'alleles']
----------------------------------------


mt.annotate_cols(**table[mt.s]).describe()

----------------------------------------
Global fields:
    None
----------------------------------------
Column fields:
    's': str
    'Population': str
    'SuperPopulation': str
    'isFemale': bool
    'PurpleHair': bool
    'CaffeineConsumption': int32
----------------------------------------
Row fields:
    'locus': locus<GRCh37>
    'alleles': array<str>
    'rsid': str
    'qual': float64
    'filters': set<str>
    'info': struct {
        AC: array<int32>, 
        AF: array<float64>, 
        AN: int32, 
        BaseQRankSum: float64, 
        ClippingRankSum: float64, 
        DP: int32, 
        DS: bool, 
        FS: float64, 
        HaplotypeScore: float64, 
        InbreedingCoeff: float64, 
        MLEAC: array<int32>, 
        MLEAF: array<float64>, 
        MQ: float64, 
        MQ0: int32, 
        MQRankSum: float64, 
        QD: float64, 
        ReadPosRankSum: float64, 
        set: str
    }
----------------------------------------
Entry fields:
    'GT': call
    'AD': array<int32>
    'DP': int32
    'GQ': int32
    'PL': array<int32>
    'n_alt': int32
----------------------------------------
Column key: ['s']
Row key: ['locus', 'alleles']
----------------------------------------


mt = mt.annotate_cols(pheno = table[mt.s])
mt.describe()

----------------------------------------
Global fields:
    None
----------------------------------------
Column fields:
    's': str
    'pheno': struct {
        Population: str, 
        SuperPopulation: str, 
        isFemale: bool, 
        PurpleHair: bool, 
        CaffeineConsumption: int32
    }
----------------------------------------
Row fields:
    'locus': locus<GRCh37>
    'alleles': array<str>
    'rsid': str
    'qual': float64
    'filters': set<str>
    'info': struct {
        AC: array<int32>, 
        AF: array<float64>, 
        AN: int32, 
        BaseQRankSum: float64, 
        ClippingRankSum: float64, 
        DP: int32, 
        DS: bool, 
        FS: float64, 
        HaplotypeScore: float64, 
        InbreedingCoeff: float64, 
        MLEAC: array<int32>, 
        MLEAF: array<float64>, 
        MQ: float64, 
        MQ0: int32, 
        MQRankSum: float64, 
        QD: float64, 
        ReadPosRankSum: float64, 
        set: str
    }
----------------------------------------
Entry fields:
    'GT': call
    'AD': array<int32>
    'DP': int32
    'GQ': int32
    'PL': array<int32>
    'n_alt': int32
----------------------------------------
Column key: ['s']
Row key: ['locus', 'alleles']
----------------------------------------


mt.select_entries(mt.GT).describe()

----------------------------------------
Global fields:
    None
----------------------------------------
Column fields:
    's': str
    'pheno': struct {
        Population: str, 
        SuperPopulation: str, 
        isFemale: bool, 
        PurpleHair: bool, 
        CaffeineConsumption: int32
    }
----------------------------------------
Row fields:
    'locus': locus<GRCh37>
    'alleles': array<str>
    'rsid': str
    'qual': float64
    'filters': set<str>
    'info': struct {
        AC: array<int32>, 
        AF: array<float64>, 
        AN: int32, 
        BaseQRankSum: float64, 
        ClippingRankSum: float64, 
        DP: int32, 
        DS: bool, 
        FS: float64, 
        HaplotypeScore: float64, 
        InbreedingCoeff: float64, 
        MLEAC: array<int32>, 
        MLEAF: array<float64>, 
        MQ: float64, 
        MQ0: int32, 
        MQRankSum: float64, 
        QD: float64, 
        ReadPosRankSum: float64, 
        set: str
    }
----------------------------------------
Entry fields:
    'GT': call
----------------------------------------
Column key: ['s']
Row key: ['locus', 'alleles']
----------------------------------------


mt.select_rows().describe()

----------------------------------------
Global fields:
    None
----------------------------------------
Column fields:
    's': str
    'pheno': struct {
        Population: str, 
        SuperPopulation: str, 
        isFemale: bool, 
        PurpleHair: bool, 
        CaffeineConsumption: int32
    }
----------------------------------------
Row fields:
    'locus': locus<GRCh37>
    'alleles': array<str>
----------------------------------------
Entry fields:
    'GT': call
    'AD': array<int32>
    'DP': int32
    'GQ': int32
    'PL': array<int32>
    'n_alt': int32
----------------------------------------
Column key: ['s']
Row key: ['locus', 'alleles']
----------------------------------------


mt.select_cols(pop = mt.pheno.Population).describe()

----------------------------------------
Global fields:
    None
----------------------------------------
Column fields:
    's': str
    'pop': str
----------------------------------------
Row fields:
    'locus': locus<GRCh37>
    'alleles': array<str>
    'rsid': str
    'qual': float64
    'filters': set<str>
    'info': struct {
        AC: array<int32>, 
        AF: array<float64>, 
        AN: int32, 
        BaseQRankSum: float64, 
        ClippingRankSum: float64, 
        DP: int32, 
        DS: bool, 
        FS: float64, 
        HaplotypeScore: float64, 
        InbreedingCoeff: float64, 
        MLEAC: array<int32>, 
        MLEAF: array<float64>, 
        MQ: float64, 
        MQ0: int32, 
        MQRankSum: float64, 
        QD: float64, 
        ReadPosRankSum: float64, 
        set: str
    }
----------------------------------------
Entry fields:
    'GT': call
    'AD': array<int32>
    'DP': int32
    'GQ': int32
    'PL': array<int32>
    'n_alt': int32
----------------------------------------
Column key: ['s']
Row key: ['locus', 'alleles']
----------------------------------------


mt.select_cols(**mt.pheno).describe()

----------------------------------------
Global fields:
    None
----------------------------------------
Column fields:
    's': str
    'Population': str
    'SuperPopulation': str
    'isFemale': bool
    'PurpleHair': bool
    'CaffeineConsumption': int32
----------------------------------------
Row fields:
    'locus': locus<GRCh37>
    'alleles': array<str>
    'rsid': str
    'qual': float64
    'filters': set<str>
    'info': struct {
        AC: array<int32>, 
        AF: array<float64>, 
        AN: int32, 
        BaseQRankSum: float64, 
        ClippingRankSum: float64, 
        DP: int32, 
        DS: bool, 
        FS: float64, 
        HaplotypeScore: float64, 
        InbreedingCoeff: float64, 
        MLEAC: array<int32>, 
        MLEAF: array<float64>, 
        MQ: float64, 
        MQ0: int32, 
        MQRankSum: float64, 
        QD: float64, 
        ReadPosRankSum: float64, 
        set: str
    }
----------------------------------------
Entry fields:
    'GT': call
    'AD': array<int32>
    'DP': int32
    'GQ': int32
    'PL': array<int32>
    'n_alt': int32
----------------------------------------
Column key: ['s']
Row key: ['locus', 'alleles']
----------------------------------------


Cols = mt.cols()
Cols.flatten().describe()

----------------------------------------
Global fields:
    None
----------------------------------------
Row fields:
    's': str 
    'pheno.Population': str 
    'pheno.SuperPopulation': str 
    'pheno.isFemale': bool 
    'pheno.PurpleHair': bool 
    'pheno.CaffeineConsumption': int32 
----------------------------------------
Key: []
----------------------------------------

2021-07-26 00:23:11 Hail: WARN: cols(): Resulting column table is sorted by 'col_key'.
    To preserve matrix table column order, first unkey columns with 'key_cols_by()'


## Export PLINK file
hl.export_plink(mt, 'data/1kg',
                ind_id = mt.s,
                is_female = mt.pheno.isFemale,
                pheno = mt.pheno.PurpleHair)

2021-07-26 00:23:23 Hail: INFO: merging 2 files totalling 597.7K...
2021-07-26 00:23:23 Hail: INFO: while writing:
    data/1kg.bed
  merge time: 38.794ms
2021-07-26 00:23:23 Hail: INFO: merging 1 files totalling 297.1K...
2021-07-26 00:23:23 Hail: INFO: while writing:
    data/1kg.bim
  merge time: 24.467ms
2021-07-26 00:23:26 Hail: INFO: merging 40 files totalling 5.0K...
2021-07-26 00:23:26 Hail: INFO: while writing:
    data/1kg.fam
  merge time: 78.735ms
2021-07-26 00:23:26 Hail: INFO: wrote 8620 variants and 284 samples to 'data/1kg'


		'HG00096'	'HG00099'	'HG00105'
locus	alleles	n_alt	n_alt	n_alt
locus<GRCh37>	array<str>	int32	int32	int32
1:904165	["G","A"]	NA	NA	NA
1:1707740	["T","G"]	1	1	1
1:2284195	["T","C"]	NA	1	1
1:2779043	["T","C"]	1	1	NA
1:2944527	["G","A"]	NA	1	1
1:3803755	["T","C"]	NA	2	1
1:4121584	["A","G"]	1	NA	0
1:4170048	["C","T"]	NA	NA	1
1:4180842	["C","T"]	1	1	NA
1:6053630	["T","G"]	NA	NA	NA


Sample	Population	SuperPopulation	isFemale	PurpleHair	CaffeineConsumption
str	str	str	bool	bool	int32
"HG00096"	"GBR"	"EUR"	False	False	4
"HG00097"	"GBR"	"EUR"	True	True	4
"HG00098"	"GBR"	"EUR"	False	False	5
"HG00099"	"GBR"	"EUR"	True	False	4
"HG00100"	"GBR"	"EUR"	True	False	5
"HG00101"	"GBR"	"EUR"	False	True	1
"HG00102"	"GBR"	"EUR"	True	True	6
"HG00103"	"GBR"	"EUR"	False	True	5
"HG00104"	"GBR"	"EUR"	True	False	5
"HG00105"	"GBR"	"EUR"	False	False	4