import pymc3 as pm
import matplotlib.pyplot as plt
import numpy as np
import seaborn as sns
import pandas as pd
import arviz as az
import scipy.stats as stats
# Render inline matplotlib figures at retina (2x) resolution in Jupyter.
%config InlineBackend.figure_format='retina'
# Display the PyMC3 version this notebook was written against (output shows 3.8).
pm.__version__
'3.8'
# Load the survey data, keep only female respondents, and trim height
# outliers by dropping values outside the central 94% (3rd-97th percentile).
dataX = pd.read_csv('data/MAT110Survey/110Survey_fixed.csv', index_col=0)
dataX.columns = ['sex', "height", "weight", 'shoe', 'tv', "soda", 'bmi']

data = dataX[dataX.sex == "F"]
low_cut = data.height.quantile(0.03)
high_cut = data.height.quantile(0.97)
acceptable_heights = data[(data.height >= low_cut) & (data.height <= high_cut)].height.unique()
data = data[data.height.isin(acceptable_heights)]

# Quick look at the weight/height relationship before modelling.
plt.figure(figsize=(13,6))
plt.scatter(data.weight, data.height)
plt.show()
# Simple linear model of height on weight:
#   height ~ Normal(slope * weight + intercept, sd = observed height s.d.)
# Priors: slope centered at 0 (sd 0.5), intercept centered at 60 (sd 5).
with pm.Model() as Males3:
    slope = pm.Normal('slope', mu=0, sd=0.5)
    intercept = pm.Normal('intercept', mu=60, sd=5)
    mu_height = slope * data.weight + intercept
    heights_obs = pm.Normal("height", mu=mu_height, sd=data.height.std(), observed=data.height)
    trace = pm.sample(2000)
Auto-assigning NUTS sampler... Initializing NUTS using jitter+adapt_diag... Multiprocess sampling (4 chains in 4 jobs) NUTS: [intercept, slope] Sampling 4 chains, 0 divergences: 100%|██████████| 10000/10000 [00:17<00:00, 566.54draws/s] The acceptance probability does not match the target. It is 0.8927158597767962, but should be close to 0.8. Try to increase the number of tuning steps. The acceptance probability does not match the target. It is 0.9017837942322964, but should be close to 0.8. Try to increase the number of tuning steps. The acceptance probability does not match the target. It is 0.8820517695948147, but should be close to 0.8. Try to increase the number of tuning steps. The number of effective samples is smaller than 25% for some parameters.
# Diagnostics inside the model context: per-parameter trace plots and the
# ArviZ summary table (means, HDIs, effective sample sizes, R-hat).
with Males3:
    az.plot_trace(trace, figsize=(26,12))
    print(az.summary(trace))
mean sd hdi_3% hdi_97% mcse_mean mcse_sd ess_mean \ slope 0.026 0.003 0.020 0.032 0.000 0.000 1485.0 intercept 60.853 0.490 59.964 61.788 0.013 0.009 1471.0 ess_sd ess_bulk ess_tail r_hat slope 1485.0 1470.0 1804.0 1.0 intercept 1468.0 1461.0 1764.0 1.0
# Overlay 50 randomly chosen posterior regression lines on the data,
# plus the line built from the posterior means of slope and intercept.
df_trace_males3 = pm.trace_to_dataframe(trace)
samples = np.random.choice(len(df_trace_males3), 50, replace=False)

plt.figure(figsize=(13,6))
plt.scatter(data.weight, data.height, alpha=0.4, label='points')
for idx in samples:
    draw = df_trace_males3.iloc[idx]
    plt.plot(data.weight, draw.intercept + draw.slope * data.weight, alpha=0.1, c='black')
mean_line = df_trace_males3.intercept.mean() + df_trace_males3.slope.mean() * data.weight
plt.plot(data.weight, mean_line, c='r', label='projected regression')
plt.legend()
plt.show()
How does this model sample? Let us look at what the priors alone imply.
We know that each sampled line has the form $y = mx + b$.
# Prior predictive check: draw 100 (intercept, slope) pairs straight from
# the priors and plot the lines they imply over weights 100-300.
# The red horizontal lines at 107 and 22 are reference bounds — presumably
# extreme plausible human heights in inches (verify against course notes).
N = 100
b = stats.norm.rvs(60, 5, N)
m = stats.norm.rvs(0, 0.5, N)
x = np.linspace(100, 300, 250)

plt.figure(figsize=(13,6))
for bi, mi in zip(b, m):
    plt.plot(x, bi + mi * x, alpha=0.3, c='black')
plt.hlines(107, 100, 300, 'red')
plt.hlines(22, 100, 300, 'red')
plt.ylim(0, 130)
plt.show()
We can restrict the slopes to a better range by using a lognormal prior, which only produces positive values.
# Same prior predictive check, but with a lognormal slope prior so every
# sampled slope is positive (heavier people should not be predicted shorter).
N = 100
b = stats.norm.rvs(60, 5, N)
m = stats.lognorm.rvs(s=1, scale=0.05, size=N)
x = np.linspace(100, 300, 250)

plt.figure(figsize=(13,6))
for bi, mi in zip(b, m):
    plt.plot(x, bi + mi * x, alpha=0.3, c='black')
plt.hlines(107, 100, 300, 'red')
plt.hlines(22, 100, 300, 'red')
plt.ylim(0, 130)
plt.show()
# Refit the linear model with a Lognormal prior on the slope, restricting
# it to positive values; intercept prior unchanged.
with pm.Model() as Males3:
    slope = pm.Lognormal('slope', mu=0, sd=0.5)
    intercept = pm.Normal('intercept', mu=60, sd=5)
    mu_height = slope * data.weight + intercept
    heights_obs = pm.Normal("height", mu=mu_height, sd=data.height.std(), observed=data.height)
    trace = pm.sample(2000)

az.plot_trace(trace, figsize=(26,12))
print(az.summary(trace))
Auto-assigning NUTS sampler... Initializing NUTS using jitter+adapt_diag... Multiprocess sampling (4 chains in 4 jobs) NUTS: [intercept, slope] Sampling 4 chains, 0 divergences: 100%|██████████| 10000/10000 [00:13<00:00, 746.51draws/s] The acceptance probability does not match the target. It is 0.9091941357405479, but should be close to 0.8. Try to increase the number of tuning steps. The acceptance probability does not match the target. It is 0.8845536326953043, but should be close to 0.8. Try to increase the number of tuning steps. The acceptance probability does not match the target. It is 0.891706913071925, but should be close to 0.8. Try to increase the number of tuning steps. The number of effective samples is smaller than 25% for some parameters.
mean sd hdi_3% hdi_97% mcse_mean mcse_sd ess_mean \ intercept 60.157 0.458 59.342 61.077 0.012 0.009 1367.0 slope 0.031 0.003 0.025 0.037 0.000 0.000 1366.0 ess_sd ess_bulk ess_tail r_hat intercept 1367.0 1379.0 1287.0 1.0 slope 1342.0 1375.0 1278.0 1.0
# Visualize posterior uncertainty again: 50 sampled regression lines in
# black, the posterior-mean line in red, over the raw data points.
df_trace_males3 = pm.trace_to_dataframe(trace)
samples = np.random.choice(len(df_trace_males3), 50, replace=False)

plt.figure(figsize=(13,6))
plt.scatter(data.weight, data.height, alpha=0.4, label='points')
for idx in samples:
    draw = df_trace_males3.iloc[idx]
    plt.plot(data.weight, draw.intercept + draw.slope * data.weight, alpha=0.1, c='black')
mean_line = df_trace_males3.intercept.mean() + df_trace_males3.slope.mean() * data.weight
plt.plot(data.weight, mean_line, c='r', label='projected regression')
plt.legend()
plt.show()
# Draw 50 posterior-predictive samples of heights from the fitted model.
# NOTE(review): 50 < nchains*ndraws, hence the PyMC3 warning that some
# chains/draws are unrepresented. (Variable name keeps the original typo
# "predicitive" because later cells reference it.)
height_predicitive= pm.sample_posterior_predictive(trace,50, Males3)
/opt/anaconda3/envs/pymc3_3_6/lib/python3.6/site-packages/pymc3/sampling.py:1247: UserWarning: samples parameter is smaller than nchains times ndraws, some draws and/or chains may not be represented in the returned posterior predictive sample "samples parameter is smaller than nchains times ndraws, some draws " 100%|██████████| 50/50 [00:00<00:00, 218.28it/s]
# Highest-density interval of the predicted heights at each observation
# (ArviZ default 94% HDI).
height_predicitive_hdi = az.hdi(height_predicitive['height'])
/opt/anaconda3/envs/pymc3_3_6/lib/python3.6/site-packages/arviz/stats/stats.py:487: FutureWarning: hdi currently interprets 2d data as (draw, shape) but this will change in a future release to (chain, draw) for coherence with other functions FutureWarning,
# Shade the posterior-predictive HDI band over the raw weight/height data.
az.plot_hdi(data.weight,height_predicitive['height'], figsize=(13,6))
plt.scatter(data.weight,data.height, alpha=0.4)
<matplotlib.collections.PathCollection at 0x7f9cd09c60f0>
# Combine the posterior-predictive HDI band with the sampled regression
# lines and the posterior-mean line.
df_trace_males3 = pm.trace_to_dataframe(trace)
samples = np.random.choice(len(df_trace_males3), 50, replace=False)

az.plot_hdi(data.weight, height_predicitive['height'], figsize=(13,6))
plt.scatter(data.weight, data.height, alpha=0.4, label='points')
for idx in samples:
    draw = df_trace_males3.iloc[idx]
    plt.plot(data.weight, draw.intercept + draw.slope * data.weight, alpha=0.1, c='black')
mean_line = df_trace_males3.intercept.mean() + df_trace_males3.slope.mean() * data.weight
plt.plot(data.weight, mean_line, c='r', label='projected regression')
plt.legend()
plt.show()
/opt/anaconda3/envs/pymc3_3_6/lib/python3.6/site-packages/arviz/stats/stats.py:487: FutureWarning: hdi currently interprets 2d data as (draw, shape) but this will change in a future release to (chain, draw) for coherence with other functions FutureWarning,
# Reload the full dataset (both sexes) and add standardized weight columns
# for the quadratic models below.
dataX = pd.read_csv('data/MAT110Survey/110Survey_fixed.csv', index_col=0)
dataX.columns = ['sex', "height", "weight", 'shoe', 'tv', "soda", 'bmi']
# z-score of weight. Vectorized column arithmetic replaces the original
# row-wise `apply` — same values, one C-level pass instead of a Python
# call per row.
dataX['w_score'] = (dataX.weight - dataX.weight.mean()) / dataX.weight.std()
# Squared z-score, the quadratic term for the polynomial regression.
dataX['w_score2'] = dataX.w_score**2
dataX.head()
sex | height | weight | shoe | tv | soda | bmi | w_score | w_score2 | |
---|---|---|---|---|---|---|---|---|---|
0 | M | 71.0 | 162.0 | 10.0 | 35.0 | 7.0 | 22.591946 | -0.060349 | 0.003642 |
1 | M | 67.0 | 140.0 | 11.0 | 7.0 | 28.0 | 21.924705 | -0.551975 | 0.304676 |
2 | M | 75.0 | 177.0 | 12.0 | 3.0 | 3.0 | 22.121067 | 0.274850 | 0.075542 |
3 | M | 72.0 | 210.0 | 11.0 | 11.0 | 14.0 | 28.478009 | 1.012288 | 1.024726 |
4 | M | 72.0 | 180.0 | 11.0 | 20.0 | 15.0 | 24.409722 | 0.341889 | 0.116888 |
# Height vs standardized weight, split by sex (males plotted first so the
# default color cycle matches the other figures).
plt.figure(figsize=(13,6))
for code, lbl in (('M', 'Males'), ('F', 'Females')):
    subset = dataX[dataX.sex == code]
    plt.scatter(subset.w_score, subset.height, alpha=0.24, label=lbl)
plt.legend()
plt.show()
# Height vs squared standardized weight, split by sex; x-axis clipped to
# [0, 5] to hide the extreme squared scores.
plt.figure(figsize=(13,6))
for code, lbl in (('M', 'Males'), ('F', 'Females')):
    subset = dataX[dataX.sex == code]
    plt.scatter(subset.w_score2, subset.height, alpha=0.24, label=lbl)
plt.xlim(0,5)
plt.legend()
plt.show()
# Quadratic regression of male height on standardized weight:
#   height ~ Normal(b2*z^2 + slope*z + intercept, sd = observed height s.d.)
data = dataX[dataX.sex == 'M']
with pm.Model() as Males4:
    slope = pm.Lognormal('slope', mu=4, sd=3)
    intercept = pm.Normal('intercept', mu=60, sd=5)
    b2 = pm.Normal('b2', mu=4, sd=3)
    mu_height = b2 * data.w_score2 + slope * data.w_score + intercept
    heights_obs = pm.Normal("height", mu=mu_height, sd=data.height.std(), observed=data.height)
    trace4 = pm.sample(2000)

az.plot_trace(trace4, figsize=(26,12))
print(az.summary(trace4))
Auto-assigning NUTS sampler... Initializing NUTS using jitter+adapt_diag... Multiprocess sampling (4 chains in 4 jobs) NUTS: [b2, intercept, slope] Sampling 4 chains, 0 divergences: 100%|██████████| 10000/10000 [00:10<00:00, 954.56draws/s] The acceptance probability does not match the target. It is 0.8816433351190381, but should be close to 0.8. Try to increase the number of tuning steps.
mean sd hdi_3% hdi_97% mcse_mean mcse_sd ess_mean \ intercept 69.819 0.168 69.512 70.138 0.003 0.002 4478.0 b2 -0.588 0.108 -0.782 -0.377 0.002 0.001 3765.0 slope 3.022 0.246 2.529 3.459 0.004 0.003 3619.0 ess_sd ess_bulk ess_tail r_hat intercept 4478.0 4483.0 4342.0 1.0 b2 3765.0 3766.0 3754.0 1.0 slope 3619.0 3638.0 3287.0 1.0
# Posterior-predictive HDI band plus 50 sampled quadratic curves (dots,
# since w_score is unsorted) and the posterior-mean curve in red.
height_predicitive = pm.sample_posterior_predictive(trace4, 50, Males4)
df_trace_males4 = pm.trace_to_dataframe(trace4)
samples = np.random.choice(len(df_trace_males4), 50, replace=False)

az.plot_hdi(data.w_score, height_predicitive['height'], figsize=(13,6))
plt.scatter(data.w_score, data.height, alpha=0.4, label='points')
for idx in samples:
    draw = df_trace_males4.iloc[idx]
    curve = draw.intercept + draw.slope * data.w_score + draw.b2 * data.w_score2
    plt.plot(data.w_score, curve, '.', alpha=0.1, c='black')
mean_curve = (df_trace_males4.intercept.mean()
              + df_trace_males4.slope.mean() * data.w_score
              + df_trace_males4.b2.mean() * data.w_score**2)
plt.plot(data.w_score, mean_curve, '.', c='r', label='projected regression')
plt.legend()
plt.show()
/opt/anaconda3/envs/pymc3_3_6/lib/python3.6/site-packages/pymc3/sampling.py:1247: UserWarning: samples parameter is smaller than nchains times ndraws, some draws and/or chains may not be represented in the returned posterior predictive sample "samples parameter is smaller than nchains times ndraws, some draws " 100%|██████████| 50/50 [00:00<00:00, 376.14it/s] /opt/anaconda3/envs/pymc3_3_6/lib/python3.6/site-packages/arviz/stats/stats.py:487: FutureWarning: hdi currently interprets 2d data as (draw, shape) but this will change in a future release to (chain, draw) for coherence with other functions FutureWarning,
# Refit the same quadratic model on the female respondents.
# NOTE(review): the model is still named `Males4` even though it fits the
# female subset — kept to avoid breaking any later references.
data = dataX[dataX.sex == 'F']
with pm.Model() as Males4:
    slope = pm.Lognormal('slope', mu=4, sd=3)
    intercept = pm.Normal('intercept', mu=60, sd=5)
    b2 = pm.Normal('b2', mu=4, sd=3)
    mu_height = b2 * data.w_score2 + slope * data.w_score + intercept
    heights_obs = pm.Normal("height", mu=mu_height, sd=data.height.std(), observed=data.height)
    trace4 = pm.sample(2000)

az.plot_trace(trace4, figsize=(26,12))
print(az.summary(trace4))
plt.show()

# Posterior-predictive band and sampled curves over the female data.
height_predicitive = pm.sample_posterior_predictive(trace4, 50, Males4)
df_trace_males4 = pm.trace_to_dataframe(trace4)
samples = np.random.choice(len(df_trace_males4), 50, replace=False)

az.plot_hdi(data.w_score, height_predicitive['height'], figsize=(13,6))
plt.scatter(data.w_score, data.height, alpha=0.4, label='points')
for idx in samples:
    draw = df_trace_males4.iloc[idx]
    curve = draw.intercept + draw.slope * data.w_score + draw.b2 * data.w_score2
    plt.plot(data.w_score, curve, '.', alpha=0.1, c='black')
mean_curve = (df_trace_males4.intercept.mean()
              + df_trace_males4.slope.mean() * data.w_score
              + df_trace_males4.b2.mean() * data.w_score**2)
plt.plot(data.w_score, mean_curve, '.', c='r', label='projected regression')
plt.legend()
plt.show()
Auto-assigning NUTS sampler... Initializing NUTS using jitter+adapt_diag... Multiprocess sampling (4 chains in 4 jobs) NUTS: [b2, intercept, slope] Sampling 4 chains, 0 divergences: 100%|██████████| 10000/10000 [00:08<00:00, 1182.60draws/s] The acceptance probability does not match the target. It is 0.880619118080398, but should be close to 0.8. Try to increase the number of tuning steps.
mean sd hdi_3% hdi_97% mcse_mean mcse_sd ess_mean \ intercept 65.797 0.189 65.462 66.175 0.003 0.002 2998.0 b2 -0.403 0.066 -0.530 -0.280 0.001 0.001 2979.0 slope 1.935 0.198 1.550 2.292 0.003 0.002 3292.0 ess_sd ess_bulk ess_tail r_hat intercept 2998.0 3001.0 4081.0 1.0 b2 2979.0 2971.0 4289.0 1.0 slope 3292.0 3288.0 4144.0 1.0
/opt/anaconda3/envs/pymc3_3_6/lib/python3.6/site-packages/pymc3/sampling.py:1247: UserWarning: samples parameter is smaller than nchains times ndraws, some draws and/or chains may not be represented in the returned posterior predictive sample "samples parameter is smaller than nchains times ndraws, some draws " 100%|██████████| 50/50 [00:00<00:00, 248.66it/s] /opt/anaconda3/envs/pymc3_3_6/lib/python3.6/site-packages/arviz/stats/stats.py:487: FutureWarning: hdi currently interprets 2d data as (draw, shape) but this will change in a future release to (chain, draw) for coherence with other functions FutureWarning,
# Re-inspect the first rows (w_score / w_score2 columns now present).
dataX.head()
sex | height | weight | shoe | tv | soda | bmi | w_score | w_score2 | |
---|---|---|---|---|---|---|---|---|---|
0 | M | 71.0 | 162.0 | 10.0 | 35.0 | 7.0 | 22.591946 | -0.060349 | 0.003642 |
1 | M | 67.0 | 140.0 | 11.0 | 7.0 | 28.0 | 21.924705 | -0.551975 | 0.304676 |
2 | M | 75.0 | 177.0 | 12.0 | 3.0 | 3.0 | 22.121067 | 0.274850 | 0.075542 |
3 | M | 72.0 | 210.0 | 11.0 | 11.0 | 14.0 | 28.478009 | 1.012288 | 1.024726 |
4 | M | 72.0 | 180.0 | 11.0 | 20.0 | 15.0 | 24.409722 | 0.341889 | 0.116888 |
# BMI vs standardized weight, split by sex.
plt.figure(figsize=(13,6))
for code, lbl in (('M', 'Males'), ('F', 'Females')):
    subset = dataX[dataX.sex == code]
    plt.scatter(subset.w_score, subset.bmi, alpha=0.24, label=lbl)
plt.legend()
plt.show()
# BMI vs squared standardized weight, split by sex; x-axis clipped to [0, 5].
plt.figure(figsize=(13,6))
for code, lbl in (('M', 'Males'), ('F', 'Females')):
    subset = dataX[dataX.sex == code]
    plt.scatter(subset.w_score2, subset.bmi, alpha=0.24, label=lbl)
plt.xlim(0,5)
plt.legend()
plt.show()
# Another look at the dataframe before adding the binary sex column.
dataX.head()
sex | height | weight | shoe | tv | soda | bmi | w_score | w_score2 | |
---|---|---|---|---|---|---|---|---|---|
0 | M | 71.0 | 162.0 | 10.0 | 35.0 | 7.0 | 22.591946 | -0.060349 | 0.003642 |
1 | M | 67.0 | 140.0 | 11.0 | 7.0 | 28.0 | 21.924705 | -0.551975 | 0.304676 |
2 | M | 75.0 | 177.0 | 12.0 | 3.0 | 3.0 | 22.121067 | 0.274850 | 0.075542 |
3 | M | 72.0 | 210.0 | 11.0 | 11.0 | 14.0 | 28.478009 | 1.012288 | 1.024726 |
4 | M | 72.0 | 180.0 | 11.0 | 20.0 | 15.0 | 24.409722 | 0.341889 | 0.116888 |
# Binary-encode sex for the logistic regression: 1 = male, 0 = female.
# Vectorized comparison replaces the original per-row lambda `apply` —
# identical values, one C-level pass.
dataX['Mnum'] = (dataX.sex == "M").astype(int)
dataX.head(7)
sex | height | weight | shoe | tv | soda | bmi | w_score | w_score2 | Mnum | |
---|---|---|---|---|---|---|---|---|---|---|
0 | M | 71.0 | 162.0 | 10.0 | 35.0 | 7.0 | 22.591946 | -0.060349 | 0.003642 | 1 |
1 | M | 67.0 | 140.0 | 11.0 | 7.0 | 28.0 | 21.924705 | -0.551975 | 0.304676 | 1 |
2 | M | 75.0 | 177.0 | 12.0 | 3.0 | 3.0 | 22.121067 | 0.274850 | 0.075542 | 1 |
3 | M | 72.0 | 210.0 | 11.0 | 11.0 | 14.0 | 28.478009 | 1.012288 | 1.024726 | 1 |
4 | M | 72.0 | 180.0 | 11.0 | 20.0 | 15.0 | 24.409722 | 0.341889 | 0.116888 | 1 |
5 | F | 69.0 | 136.0 | 9.0 | 63.0 | 21.0 | 20.081495 | -0.641361 | 0.411344 | 0 |
6 | F | 70.0 | 162.0 | 10.0 | 1.0 | 0.0 | 23.242041 | -0.060349 | 0.003642 | 0 |
# Bayesian logistic regression: P(male) as a function of height and
# standardized weight, built via PyMC3's GLM formula interface.
with pm.Model() as modelX:
    pm.glm.GLM.from_formula('Mnum ~ height + w_score', dataX, family=pm.glm.families.Binomial())
    traceX = pm.sample(2000, tune=1000, init='adapt_diag')
Auto-assigning NUTS sampler... Initializing NUTS using adapt_diag... Multiprocess sampling (4 chains in 4 jobs) NUTS: [w_score, height, Intercept] Sampling 4 chains, 0 divergences: 100%|██████████| 12000/12000 [03:17<00:00, 60.84draws/s] The number of effective samples is smaller than 25% for some parameters.
# Trace plots and summary table for the logistic-regression posterior.
az.plot_trace(traceX, figsize=(26,12))
print(az.summary(traceX))
/opt/anaconda3/envs/pymc3_3_6/lib/python3.6/site-packages/arviz/data/io_pymc3.py:91: FutureWarning: Using `from_pymc3` without the model will be deprecated in a future release. Not using the model will return less accurate and less useful results. Make sure you use the model argument or call from_pymc3 within a model context. FutureWarning, /opt/anaconda3/envs/pymc3_3_6/lib/python3.6/site-packages/arviz/data/io_pymc3.py:91: FutureWarning: Using `from_pymc3` without the model will be deprecated in a future release. Not using the model will return less accurate and less useful results. Make sure you use the model argument or call from_pymc3 within a model context. FutureWarning,
mean sd hdi_3% hdi_97% mcse_mean mcse_sd ess_mean \ Intercept -25.022 1.617 -28.124 -22.087 0.038 0.027 1855.0 height 0.371 0.024 0.326 0.415 0.001 0.000 1858.0 w_score 1.086 0.108 0.890 1.293 0.002 0.001 2894.0 ess_sd ess_bulk ess_tail r_hat Intercept 1855.0 1850.0 2299.0 1.0 height 1858.0 1853.0 2355.0 1.0 w_score 2870.0 2896.0 2985.0 1.0
# Joint posterior density of the height and w_score coefficients
# (visualizes their correlation).
az.plot_kde(traceX['height'],traceX['w_score'], figsize=(13,6))
<AxesSubplot:>
# Same joint posterior as a hex-bin plot with marginal histograms.
sns.jointplot(traceX['height'],traceX['w_score'], kind='hex')
/opt/anaconda3/envs/pymc3_3_6/lib/python3.6/site-packages/seaborn/_decorators.py:43: FutureWarning: Pass the following variables as keyword args: x, y. From version 0.12, the only valid positional argument will be `data`, and passing other arguments without an explicit keyword will result in an error or misinterpretation. FutureWarning
<seaborn.axisgrid.JointGrid at 0x7f9c9227e550>
# Posterior mean of the height coefficient (log-odds per unit of height).
traceX.height.mean()
0.37101288563621493
# Point-estimate log-odds of being male for height 70 and w_score 2,
# then the inverse-logit to convert log-odds into a probability.
bob = traceX.height.mean()*70+traceX.w_score.mean()*2+traceX.Intercept.mean()
np.exp(bob)/(1+np.exp(bob))
0.9577086144859366
def predict_sex(w, h):
    """Posterior-mean probability that a respondent is male.

    w: raw weight, in the same units as dataX.weight (standardized internally).
    h: height, in the same units as dataX.height.
    Returns the inverse-logit of the posterior-mean linear predictor from
    the logistic-regression trace `traceX`.
    """
    z_weight = (w - dataX.weight.mean()) / dataX.weight.std()
    log_odds = (traceX.height.mean() * h
                + traceX.w_score.mean() * z_weight
                + traceX.Intercept.mean())
    odds = np.exp(log_odds)
    return odds / (1 + odds)
# Attach the model's P(male) to every row, computed from raw weight and height.
dataX['prediction'] = dataX.apply(lambda x: predict_sex(x.weight,x.height), axis =1)
# Decision boundary: evaluate P(male) on a 200x200 grid over
# (w_score, height) and draw the 0.5 contour over the data.
xs = np.linspace(dataX.w_score.min(), dataX.w_score.max(), 200)
ys = np.linspace(dataX.height.min(), dataX.height.max(), 200)
X, Y = np.meshgrid(xs, ys)
# Posterior-mean log-odds on the grid, then inverse-logit. The original
# evaluated the full-grid exp() expression twice; computing it once halves
# the work and reads more clearly — same values.
logits = traceX.height.mean() * Y + traceX.w_score.mean() * X + traceX.Intercept.mean()
odds = np.exp(logits)
Z = odds / (1 + odds)

plt.figure(figsize=(13,6))
plt.contour(X, Y, Z, levels=[0.5])
plt.scatter(dataX[dataX.sex=='M'].w_score, dataX[dataX.sex=='M'].height, alpha=0.24, label='Males')
plt.scatter(dataX[dataX.sex=='F'].w_score, dataX[dataX.sex=='F'].height, alpha=0.24, label='Females')
plt.xlim(-1.5, 2)
plt.legend()
plt.show()
# Classify everyone with a 0.5 threshold and highlight misclassified points.
ps = predict_sex(dataX.weight, dataX.height)
# Explicit boolean comparisons so `errs` has bool dtype. The original
# mixed a bool series with the int 0/1 `Mnum` column via `&`/`|`, which
# yields an int series — indexing with it is not boolean masking, so the
# red markers could land on the wrong rows. The error count is unchanged.
errs = ((ps < 0.5) & (dataX.Mnum == 1)) | ((ps >= 0.5) & (dataX.Mnum == 0))

plt.figure(figsize=(13,6))
plt.contour(X, Y, Z, levels=[0.5])
# Misclassified respondents drawn as large translucent red markers.
plt.scatter(dataX.w_score[errs], dataX.height[errs], facecolors='red', s=120, alpha=0.4)
plt.scatter(dataX[dataX.sex=='M'].w_score, dataX[dataX.sex=='M'].height, alpha=0.24, label='Males')
plt.scatter(dataX[dataX.sex=='F'].w_score, dataX[dataX.sex=='F'].height, alpha=0.24, label='Females')
#plt.xlim(-1.5,2)
plt.legend()
plt.show()
# Misclassification rate: fraction of respondents on the wrong side of
# the 0.5 decision boundary.
errs.sum()/len(errs)
0.15151515151515152
# Example prediction: P(male) for a respondent weighing 200 and 72 tall.
predict_sex(200,72)
0.9273834767034302