## Load Hail
import hail as hl

# Start the Hail session and send its log to a fixed, named file.
hail_log_path = 'hail_20210710.log'
hl.init(log=hail_log_path)
Running on Apache Spark version 3.1.1
SparkUI available at http://ubuntu:4040
Welcome to
     __  __     <>__
    / /_/ /__  __/ /
   / __  / _ `/ / /
  /_/ /_/\_,_/_/_/   version 0.2.68-13190f0b6103
LOGGING: writing to hail_20210710.log
## Import Tables
# Read the two tab-separated inputs in order (expression values first, then
# gene locations); impute=True lets Hail infer each column's type.
gene_expr, gene_loc = (
    hl.import_table(tsv_path, impute=True)
    for tsv_path in ('gene_expr_join.tsv', 'gene_loc_join.tsv')
)
2021-07-10 20:11:14 Hail: INFO: Reading table to impute column types 2021-07-10 20:11:17 Hail: INFO: Finished type imputation Loading field 'id' as type str (imputed) Loading field 'Sam_01' as type float64 (imputed) Loading field 'Sam_02' as type float64 (imputed) Loading field 'Sam_03' as type float64 (imputed) Loading field 'Sam_04' as type float64 (imputed) Loading field 'Sam_05' as type float64 (imputed) 2021-07-10 20:11:17 Hail: INFO: Reading table to impute column types 2021-07-10 20:11:17 Hail: INFO: Finished type imputation Loading field 'geneid' as type str (imputed) Loading field 'chr' as type str (imputed) Loading field 'start' as type int32 (imputed) Loading field 'end' as type int32 (imputed)
# Preview each freshly imported table.
for imported_table in (gene_expr, gene_loc):
    imported_table.show()

# Key the expression table by its 'id' column, then print the schema.
gene_expr = gene_expr.key_by(gene_expr.id)
gene_expr.describe()
---------------------------------------- Global fields: None ---------------------------------------- Row fields: 'id': str 'Sam_01': float64 'Sam_02': float64 'Sam_03': float64 'Sam_04': float64 'Sam_05': float64 ---------------------------------------- Key: ['id'] ----------------------------------------
# Key the location table by its 'geneid' column, then print the schema.
gene_loc = gene_loc.key_by('geneid')
gene_loc.describe()
---------------------------------------- Global fields: None ---------------------------------------- Row fields: 'geneid': str 'chr': str 'start': int32 'end': int32 ---------------------------------------- Key: ['geneid'] ----------------------------------------
## Join the Tables (Inner Join)
# Join on the table keys ('id' on the left, 'geneid' on the right).
# how='inner' is join()'s default; it is spelled out to match the heading.
join_table = gene_expr.join(gene_loc, how='inner')
join_table.describe()
---------------------------------------- Global fields: None ---------------------------------------- Row fields: 'id': str 'Sam_01': float64 'Sam_02': float64 'Sam_03': float64 'Sam_04': float64 'Sam_05': float64 'chr': str 'start': int32 'end': int32 ---------------------------------------- Key: ['id'] ----------------------------------------
## Annotate a Field (chr) from gene_loc Table
# Drop the key from gene_expr, look up each row's 'id' in the keyed gene_loc
# table, and copy the matching 'chr' value into a new field.
# NOTE(review): indexing gene_loc by gene_expr.id should also work while
# gene_expr is still keyed by 'id', so the unkeying looks optional. Confirm
# before removing it, since that would change the key of annot_table.
gene_expr = gene_expr.key_by()
matching_loc = gene_loc[gene_expr.id]
annot_table = gene_expr.annotate(chr=matching_loc.chr)
annot_table.describe()
---------------------------------------- Global fields: None ---------------------------------------- Row fields: 'id': str 'Sam_01': float64 'Sam_02': float64 'Sam_03': float64 'Sam_04': float64 'Sam_05': float64 'chr': str ---------------------------------------- Key: [] ----------------------------------------
## Downsample gene_loc Table
# Downsample gene_loc with p = 0.5, keeping roughly half of its rows.
# A fixed seed is passed so that re-running the notebook is intended to draw
# the same subset. Without it each run samples differently, and the tables
# filtered against filtered_gene_loc cannot be compared across runs.
# NOTE(review): reproducibility presumably also requires the same input
# partitioning. Confirm against the Hail docs for random functions.
filtered_gene_loc = gene_loc.sample(0.5, seed=42)
filtered_gene_loc.show()
2021-07-10 20:59:51 Hail: INFO: Coerced sorted dataset
## Filter gene_expr Table through key in filtered_gene_loc Table
# Key gene_expr by 'id' so its key lines up with filtered_gene_loc's key,
# then keep only the rows whose key is also present in the sampled table.
gene_expr = gene_expr.key_by('id')
filtered_gene_expr = gene_expr.semi_join(filtered_gene_loc)
filtered_gene_expr.show()
2021-07-11 01:04:21 Hail: INFO: Coerced sorted dataset 2021-07-11 01:04:22 Hail: INFO: Coerced sorted dataset
# Alternative to semi_join: unkey gene_expr, look up each row's 'id' in
# filtered_gene_loc, and keep the rows for which that lookup is defined.
gene_expr = gene_expr.key_by()
sampled_loc_match = filtered_gene_loc[gene_expr.id]
filtered_gene_expr = gene_expr.filter(hl.is_defined(sampled_loc_match))
filtered_gene_expr.show()
2021-07-11 01:01:48 Hail: INFO: Coerced sorted dataset 2021-07-11 01:01:48 Hail: INFO: Coerced sorted dataset
## Export the Table
# Write the joined table out to a .tsv file.
export_path = 'join_table.tsv'
join_table.export(export_path)
2021-07-11 01:05:24 Hail: INFO: Coerced sorted dataset 2021-07-11 01:05:24 Hail: INFO: Coerced sorted dataset 2021-07-11 01:05:25 Hail: INFO: merging 1 files totalling 874... 2021-07-11 01:05:25 Hail: INFO: while writing: join_table.tsv merge time: 55.756ms