In [1]:
## Load Hail
import hail as hl
In [2]:
# Start Hail, writing its log to a fixed file name.
hl.init(log="hail_20210710.log")
Running on Apache Spark version 3.1.1
SparkUI available at http://ubuntu:4040
Welcome to
     __  __     <>__
    / /_/ /__  __/ /
   / __  / _ `/ / /
  /_/ /_/\_,_/_/_/   version 0.2.68-13190f0b6103
LOGGING: writing to hail_20210710.log
In [3]:
## Import Tables
# Read both tab-separated inputs (expression first, then locations),
# letting Hail impute each column's type.
gene_expr, gene_loc = (
    hl.import_table(tsv_path, impute=True)
    for tsv_path in ('gene_expr_join.tsv', 'gene_loc_join.tsv')
)
2021-07-10 20:11:14 Hail: INFO: Reading table to impute column types
2021-07-10 20:11:17 Hail: INFO: Finished type imputation
  Loading field 'id' as type str (imputed)
  Loading field 'Sam_01' as type float64 (imputed)
  Loading field 'Sam_02' as type float64 (imputed)
  Loading field 'Sam_03' as type float64 (imputed)
  Loading field 'Sam_04' as type float64 (imputed)
  Loading field 'Sam_05' as type float64 (imputed)
2021-07-10 20:11:17 Hail: INFO: Reading table to impute column types
2021-07-10 20:11:17 Hail: INFO: Finished type imputation
  Loading field 'geneid' as type str (imputed)
  Loading field 'chr' as type str (imputed)
  Loading field 'start' as type int32 (imputed)
  Loading field 'end' as type int32 (imputed)
In [12]:
# Preview the first rows of each imported table, expression table first.
for table in (gene_expr, gene_loc):
    table.show()
id
Sam_01
Sam_02
Sam_03
Sam_04
Sam_05
strfloat64float64float64float64float64
"Gene_01"4.91e+004.63e+005.18e+005.07e+005.74e+00
"Gene_02"1.38e+011.31e+011.32e+011.30e+011.29e+01
"Gene_03"1.21e+011.23e+011.31e+011.37e+011.39e+01
"Gene_04"1.16e+011.19e+011.27e+011.27e+011.32e+01
"Gene_05"1.47e+011.47e+011.46e+011.59e+011.55e+01
"Gene_06"1.23e+011.22e+011.25e+011.32e+011.26e+01
"Gene_07"1.26e+011.27e+011.25e+011.34e+011.36e+01
"Gene_08"1.23e+011.26e+011.26e+011.30e+011.29e+01
"Gene_09"9.82e+009.29e+008.95e+008.18e+008.11e+00
"Gene_10"1.42e+011.45e+011.46e+011.37e+011.35e+01
geneid
chr
start
end
strstrint32int32
"Gene_01""chr1"721289731289
"Gene_02""chr1"752565762565
"Gene_03""chr1"777121787121
"Gene_04""chr1"785988795988
"Gene_05""chr1"792479802479
"Gene_06""chr1"798958808958
"Gene_07""chr1"888658898658
"Gene_08""chr1"918572928572
"Gene_09""chr1"926430936430
"Gene_10""chr1"10000001010000
In [20]:
# Key the expression table by its `id` field, then print the schema.
gene_expr = gene_expr.key_by(gene_expr.id)
gene_expr.describe()
----------------------------------------
Global fields:
    None
----------------------------------------
Row fields:
    'id': str 
    'Sam_01': float64 
    'Sam_02': float64 
    'Sam_03': float64 
    'Sam_04': float64 
    'Sam_05': float64 
----------------------------------------
Key: ['id']
----------------------------------------
In [21]:
# Key the location table by its `geneid` field, then print the schema.
gene_loc = gene_loc.key_by('geneid')
gene_loc.describe()
----------------------------------------
Global fields:
    None
----------------------------------------
Row fields:
    'geneid': str 
    'chr': str 
    'start': int32 
    'end': int32 
----------------------------------------
Key: ['geneid']
----------------------------------------
In [61]:
## Join the Tables (Inner Join)
# Both tables are joined on their keys; 'inner' is spelled out to make
# the join type explicit.
join_table = gene_expr.join(gene_loc, how='inner')
In [62]:
# Print the schema of the joined table (its row fields and key).
join_table.describe()
----------------------------------------
Global fields:
    None
----------------------------------------
Row fields:
    'id': str 
    'Sam_01': float64 
    'Sam_02': float64 
    'Sam_03': float64 
    'Sam_04': float64 
    'Sam_05': float64 
    'chr': str 
    'start': int32 
    'end': int32 
----------------------------------------
Key: ['id']
----------------------------------------
In [58]:
## Annotate a Field (chr) from gene_loc Table
# Bind the unkeyed copy to its own name instead of overwriting `gene_expr`,
# so the keyed table stays intact for other cells regardless of the order
# in which cells are run.
gene_expr_unkeyed = gene_expr.key_by()
# Look up each row's `id` in gene_loc and copy the matching `chr` value.
annot_table = gene_expr_unkeyed.annotate(
    chr=gene_loc[gene_expr_unkeyed.id].chr
)
annot_table.describe()
----------------------------------------
Global fields:
    None
----------------------------------------
Row fields:
    'id': str 
    'Sam_01': float64 
    'Sam_02': float64 
    'Sam_03': float64 
    'Sam_04': float64 
    'Sam_05': float64 
    'chr': str 
----------------------------------------
Key: []
----------------------------------------
In [36]:
## Downsample gene_loc Table
# Fix the sampling seed so the same subset of rows is selected on every
# run; without it, this table and everything derived from it changes each
# time the notebook is re-executed.
SAMPLE_SEED = 42
# Keep each row with probability 0.5.
filtered_gene_loc = gene_loc.sample(0.5, seed=SAMPLE_SEED)
filtered_gene_loc.show()
2021-07-10 20:59:51 Hail: INFO: Coerced sorted dataset
geneid
chr
start
end
strstrint32int32
"Gene_01""chr1"721289731289
"Gene_02""chr1"752565762565
"Gene_06""chr1"798958808958
"Gene_07""chr1"888658898658
In [59]:
## Filter gene_expr Table through key in filtered_gene_loc Table
# semi_join matches on table keys, so key gene_expr by `id` first.
gene_expr = gene_expr.key_by('id')
# Keep only the rows whose key is present in filtered_gene_loc.
filtered_gene_expr = gene_expr.semi_join(filtered_gene_loc)
filtered_gene_expr.show()
2021-07-11 01:04:21 Hail: INFO: Coerced sorted dataset
2021-07-11 01:04:22 Hail: INFO: Coerced sorted dataset
id
Sam_01
Sam_02
Sam_03
Sam_04
Sam_05
strfloat64float64float64float64float64
"Gene_01"4.91e+004.63e+005.18e+005.07e+005.74e+00
"Gene_02"1.38e+011.31e+011.32e+011.30e+011.29e+01
"Gene_06"1.23e+011.22e+011.25e+011.32e+011.26e+01
"Gene_07"1.26e+011.27e+011.25e+011.34e+011.36e+01
In [56]:
# Same filter as a semi join, written as an explicit lookup.
# Bind the unkeyed copy to its own name instead of overwriting `gene_expr`,
# so re-running cells in a different order does not change what
# `gene_expr` refers to.
gene_expr_unkeyed = gene_expr.key_by()
# Keep a row only if its `id` is found in filtered_gene_loc.
filtered_gene_expr = gene_expr_unkeyed.filter(
    hl.is_defined(filtered_gene_loc[gene_expr_unkeyed.id])
)
filtered_gene_expr.show()
2021-07-11 01:01:48 Hail: INFO: Coerced sorted dataset
2021-07-11 01:01:48 Hail: INFO: Coerced sorted dataset
id
Sam_01
Sam_02
Sam_03
Sam_04
Sam_05
strfloat64float64float64float64float64
"Gene_01"4.91e+004.63e+005.18e+005.07e+005.74e+00
"Gene_02"1.38e+011.31e+011.32e+011.30e+011.29e+01
"Gene_06"1.23e+011.22e+011.25e+011.32e+011.26e+01
"Gene_07"1.26e+011.27e+011.25e+011.34e+011.36e+01
In [63]:
# Export the Table
# Write the joined table to a file in the working directory.
join_table.export(output='join_table.tsv')
2021-07-11 01:05:24 Hail: INFO: Coerced sorted dataset
2021-07-11 01:05:24 Hail: INFO: Coerced sorted dataset
2021-07-11 01:05:25 Hail: INFO: merging 1 files totalling 874...
2021-07-11 01:05:25 Hail: INFO: while writing:
    join_table.tsv
  merge time: 55.756ms