import pandas as pd
import utilities as UT
import numpy as np
import seaborn as sns
import os
# Load the tab-separated protein-group table and pass it through the project
# helpers. NOTE(review): clean_df/mod_df are defined in `utilities`; what
# `score=5` filters on is not visible from this file -- confirm there.
df = pd.read_csv('proteinGroups.txt', sep='\t')
df = UT.mod_df(UT.clean_df(df, score=5))
df.head()

# One iBAQ column per sample, ZJ_1 through ZJ_8.
col = ['iBAQ ZJ_{id}'.format(id=n) for n in range(1, 9)]
selection = df[col]
print(selection.shape)
selection.head()

# Zeros are treated as missing values; rows with no value in any of the
# eight samples are discarded.
selection = selection.replace(0, np.nan).dropna(how='all')
import missingno as msno
import numpy as np
import matplotlib.pyplot as plt
# Missing-data overview: zeros count as missing, and the resulting bar chart
# is written to 'missing.png' before being shown.
missing_view = selection.replace(0, np.nan)
ax = msno.bar(missing_view, figsize=(6, 6))
# plt.title acts on pyplot's *current* axes, which is not necessarily the
# axes object returned by msno.bar, so the call is deliberately left as-is.
plt.title('Missing Data Analysis', size=12)
ax.set_ylabel('Fraction of data points', size=12)
plt.tight_layout()
plt.savefig('missing.png')
plt.show()
# Colour setup: every one of the eight samples is drawn in the same blue.
palette = ['b'] * 8
palette_g = ['b']
color_dictionary = {'b': '-'}
plt.style.use('ggplot')

# Zeros are masked as NaN before the log transform so they do not turn into
# -inf; the transformed table is computed once and shared by both panels.
log_intensity = np.log10(selection.replace(0, np.nan))

fig, axes = plt.subplots(ncols=2, nrows=1, figsize=(14, 4))

# Left panel: one kernel-density curve per sample.
log_intensity.plot(kind='kde', color=palette, alpha=0.5, ax=axes[0])
axes[0].legend(loc='center left', bbox_to_anchor=(0.7, 0.5))
axes[0].set_title('Value Distribution')
axes[0].set_xlabel('Log10 Intensity')

# Right panel: one box per sample (outliers hidden). Wide-form input gives
# vertical boxes, so the intensity scale is the y-axis -- label that axis,
# and address the panel explicitly instead of relying on pyplot's notion of
# the current axes.
sns.boxplot(data=log_intensity, showfliers=False, palette=palette, ax=axes[1])
axes[1].set_title('Value Distribution')
axes[1].set_ylabel('Log10 Intensity')
plt.show()
# Samples compared against the control sample ZJ_5.
comparisons = [1, 3, 7]
control_col = 'iBAQ ZJ_5'


def _fill_missing_with_min(intensity):
    """Return *intensity* with zeros/NaN replaced by its smallest non-zero value."""
    observed = intensity.replace(0, np.nan)
    return observed.fillna(observed.min())


# The control column is the same for every comparison, so impute it once.
control = _fill_missing_with_min(df[control_col])

for n in comparisons:
    bait_col = 'iBAQ ZJ_{id}'.format(id=n)
    bait = _fill_missing_with_min(df[bait_col])
    # log10 of the summed intensities of bait and control.
    df['log10sum_{}'.format(n)] = np.log10(bait + control)
    # log2 fold change of bait over control.
    df['log2fc_{}'.format(n)] = np.log2(bait / control)
df.head()
# Spot check of the row with index label 132.
df.loc[132]

# Write the trailing eight columns of the rows kept in `selection` to
# 'indata.csv', with a fresh 0..n-1 index, -inf mapped to 0 and values rounded
# to two decimals. The chain is parenthesised so that it may span lines.
# NOTE(review): only six log10sum_*/log2fc_* columns are appended above, so
# `-8:` also picks up the two columns that precede them -- confirm intended.
(
    df.loc[selection.index.values]
    .iloc[:, -8:]
    .reset_index(drop=True)
    .replace(-np.inf, 0)
    .round(2)
    .to_csv('indata.csv')
)
# Font settings for plot text: 7 pt Arial, in a regular and a bold variant.
fontProperties = dict(family='Arial', weight='normal', size=7)
fontProperties_names = dict(family='Arial', weight='bold', size=7)

# Combined label: description, a single space, then the gene id.
df['desc2'] = df['desc'] + ' ' + df['Gene_id']
import plotly
import plotly.express as px

# One interactive scatter (log2 fold change vs. log10 summed intensity) per
# comparison, each exported to its own HTML file.
# Columns: x column, y column, plot title, output file.
scatter_specs = [
    ('log2fc_1', 'log10sum_1', '1 vs 5', '1vs5.html'),
    ('log2fc_3', 'log10sum_3', '3 vs 5', '3vs5.html'),
    ('log2fc_7', 'log10sum_7', '7 vs 5', '7vs5.html'),
]
for fc_column, sum_column, plot_title, html_name in scatter_specs:
    fig = px.scatter(
        df,
        x=fc_column,
        y=sum_column,
        hover_name='Gene_id',
        hover_data=['desc'],
        title=plot_title,
    )
    plotly.offline.plot(fig, filename=html_name)
from IPython.display import IFrame

# Inline frames pointing at the three exported scatter pages.
# NOTE(review): these are bare expressions; presumably each one was the last
# statement of its own notebook cell so that it gets rendered -- confirm
# against the notebook layout.
IFrame('1vs5.html', width=700, height=600)
IFrame('3vs5.html', width=700, height=600)
IFrame('7vs5.html', width=700, height=600)
plt.style.use('default')
fig, axes = plt.subplots(nrows=1, ncols=1, figsize=(4, 8))
ax = axes

# Rows to annotate in the 1-vs-5 comparison: log10 summed intensity above 5
# and log2 fold change above 6.
above_intensity_cutoff = df['log10sum_1'] > 5
above_fold_change_cutoff = df['log2fc_1'] > 6
ids = df[above_intensity_cutoff & above_fold_change_cutoff]
_ids = ids.index.values
_names = ids['Gene_id'].values
print(len(_ids))
print(len(_names))

# NOTE(review): make_vulcano is defined in `utilities`; the meaning of each
# keyword below should be checked against that helper.
UT.make_vulcano(
    df,
    ax,
    x='log2fc_1',
    y='log10sum_1',
    annot_index=_ids,
    annot_names=_names,
    title='',
    # fc_limit=70,
    fc_col='log2fc_1',
    alpha_main=0.3,
    point_size_selection=5,
    point_size_all=5,
    text_size=7,
    fontdict=fontProperties_names,
)
!ipython nbconvert --to HTML make_dataset.ipynb