In [1]:
import pandas as pd
import glob
import numpy as np
import re
In [2]:
import matplotlib
matplotlib.rcParams["svg.fonttype"] = "none"
matplotlib.rcParams["text.usetex"] = False
import matplotlib.pyplot as plt
Scenario 1: return a list most similar region sets and their similarity score to a query metadata label (l2r)¶
In [3]:
# Path to the pre-calculated distance file between label embedding and region set embeddings
path_simfile = './distance_l2r.csv'
distance = pd.read_csv(path_simfile)
distance.file_label = distance.file_label.str.lower()
distance.search_term = distance.search_term.str.lower()
distance = distance.drop_duplicates()
In [4]:
# Print the search terms (labels)
print(distance.search_term.unique())
['h3k4me3' 'h3k27me3' 'h3k27ac' 'h3k4me1' 'h3k9me3' 'h3k4me2' 'h3k9ac' 'h3k79me2' 'h4k20me1' 'h3k9me2' 'h3k9me1']
In [5]:
def S1(searchterm, distance):
nof = len(distance[distance.file_label.str.contains(searchterm)])
df = distance[distance.search_term == searchterm].sort_values(by=['score'], ascending = False)[0:10]
df = df.sort_values(by=['score'], ascending=True)
df['color']='gray'
df.loc[df.file_label.str.contains(searchterm), 'color'] = 'green'
if(len(df[df.color == 'green']) == nof):
df.loc[(df.color!='green'), 'color'] = 'gray'
plt= df.plot.barh(x='filename', y='score', figsize=(10,7), fontsize=16, color=list(df['color']))
plt.set_xlabel('Search term:' + searchterm, fontsize=15)
plt.axis(xmin=0.5, xmax=1.01)
In [6]:
S1('h3k4me2', distance)
Scenario 2: return a list of most similar labels and their similarity score to a query region set (r2l)¶
In [7]:
# Print a sample of filenames
print(list(set(distance.filename))[0:4])
['gse124683/gsm3540312_k562_h3k4me2_rep1_20181202_icell8_21.bed.gz', 'gse175750/gsm5345532_kdm4a-oe.h3k9ac.ls.rep2.bed.gz', 'gse161624/gsm4911338_etv6_ncoa2_tm_hcd34_h3k27ac_vs_total_peaks.narrowpeak.gz', 'gse124690/gsm3540920_k562_h3k4me2_rep2_20181202_icell8_215.bed.gz']
In [8]:
def S2(file, distance):
df = distance[distance.filename == file].sort_values(by=['score'], ascending = False)[0:10]
df= df.sort_values(by=['score'], ascending = True)
df['color']='green'
plt= df.plot.barh(x='search_term', y='score', figsize=(8,5), fontsize=16, color=list(df['color']))
plt.set_xticks(np.arange(0.5,1.1, 0.1))
plt.set_ylabel('Similarity', fontsize=15)
plt.set_xlabel(file, fontsize=15)
In [9]:
S2('gse156613/gsm4743940_t52-h3k27ac_peaks.bed.gz', distance)
Scenario 3: return a list of most similar region sets and their similarity scores to a query region set (r2r)¶
In [11]:
file_name = './distance_r2r.csv'
distance_s3 = pd.read_csv(file_name)
distance_s3.score = 1 - distance_s3.score
In [12]:
# print sample query region set
list(set(distance_s3.test_file))[0:10]
Out[12]:
['ENCFF538OAF.bed.gz,h3k4me1', 'ENCFF494ASP.bed.gz,h3k27me3', 'ENCFF292YSJ.bed.gz,h3k4me3', 'ENCFF001WUM.bed.gz,h3k4me3', 'ENCFF701ENX.bed.gz,h3k27ac', 'ENCFF727GSV.bed.gz,h3k27ac', 'ENCFF787BOJ.bed.gz,h3k27me3', 'ENCFF526BJR.bed.gz,h3k27ac', 'ENCFF908TKC.bed.gz,h3k9me3', 'ENCFF099WAJ.bed.gz,h4k20me1']
In [13]:
def S3(query_file, distance_s3):
df = distance_s3[distance_s3.test_file==query_file].sort_values(by ='score', ascending = False)[['test_file', 'train_file', 'score']]
df['label_test'] = df.test_file.str.split(',', expand = True)[1]
df['label_train'] = df.train_file.str.split(',', expand = True)[1]
nof = len(df[df.label_test==df.label_train])
df = df[0:10]
df=df.sort_values(by=['score'])
df['color']='gray'
df.loc[df.label_test==df.label_train, 'color'] = 'green'
if(len(df[df.color=='green']) ==nof):
df.loc[(df.color!='green'), 'color'] = 'gray'
plt= df.plot.barh(x='train_file', y='score', figsize=(10,7), fontsize=16, color=list(df['color']))
plt.axis(xmin=0.7,xmax=1.01)
plt.set_ylabel('Similarity', fontsize=15)
plt.set_xlabel(query_file, fontsize=15)
In [16]:
S3('ENCFF168GCU.bed.gz,h3k4me1', distance_s3)