import pymc3 as pm
import matplotlib.pyplot as plt
import numpy as np
import seaborn as sns
import pandas as pd
import arviz as az
import scipy.stats as stats
# Render inline matplotlib figures at retina (2x) resolution in Jupyter.
%config InlineBackend.figure_format='retina'
# Display the PyMC3 version this notebook was written against (output shows 3.8).
pm.__version__
'3.8'
# Load the survey data, keep only female respondents, and trim height
# outliers by dropping values outside the central 94% (3rd-97th percentile).
dataX = pd.read_csv('data/MAT110Survey/110Survey_fixed.csv', index_col=0)
dataX.columns = ['sex', "height", "weight", 'shoe', 'tv', "soda", 'bmi']

data = dataX[dataX.sex == "F"]
low_cut = data.height.quantile(0.03)
high_cut = data.height.quantile(0.97)
acceptable_heights = data[(data.height >= low_cut) & (data.height <= high_cut)].height.unique()
data = data[data.height.isin(acceptable_heights)]

# Quick look at the weight/height relationship before modelling.
plt.figure(figsize=(13,6))
plt.scatter(data.weight, data.height)
plt.show()
# Simple linear model of height on weight:
#   height ~ Normal(slope * weight + intercept, sd = observed height s.d.)
# Priors: slope centered at 0 (sd 0.5), intercept centered at 60 (sd 5).
with pm.Model() as Males3:
    slope = pm.Normal('slope', mu=0, sd=0.5)
    intercept = pm.Normal('intercept', mu=60, sd=5)
    mu_height = slope * data.weight + intercept
    heights_obs = pm.Normal("height", mu=mu_height, sd=data.height.std(), observed=data.height)
    trace = pm.sample(2000)
Auto-assigning NUTS sampler... Initializing NUTS using jitter+adapt_diag... Multiprocess sampling (4 chains in 4 jobs) NUTS: [intercept, slope] Sampling 4 chains, 0 divergences: 100%|██████████| 10000/10000 [00:17<00:00, 566.54draws/s] The acceptance probability does not match the target. It is 0.8927158597767962, but should be close to 0.8. Try to increase the number of tuning steps. The acceptance probability does not match the target. It is 0.9017837942322964, but should be close to 0.8. Try to increase the number of tuning steps. The acceptance probability does not match the target. It is 0.8820517695948147, but should be close to 0.8. Try to increase the number of tuning steps. The number of effective samples is smaller than 25% for some parameters.
# Diagnostics inside the model context: per-parameter trace plots and the
# ArviZ summary table (means, HDIs, effective sample sizes, R-hat).
with Males3:
    az.plot_trace(trace, figsize=(26,12))
    print(az.summary(trace))
mean sd hdi_3% hdi_97% mcse_mean mcse_sd ess_mean \ slope 0.026 0.003 0.020 0.032 0.000 0.000 1485.0 intercept 60.853 0.490 59.964 61.788 0.013 0.009 1471.0 ess_sd ess_bulk ess_tail r_hat slope 1485.0 1470.0 1804.0 1.0 intercept 1468.0 1461.0 1764.0 1.0
# Overlay 50 randomly chosen posterior regression lines on the data,
# plus the line built from the posterior means of slope and intercept.
df_trace_males3 = pm.trace_to_dataframe(trace)
samples = np.random.choice(len(df_trace_males3), 50, replace=False)

plt.figure(figsize=(13,6))
plt.scatter(data.weight, data.height, alpha=0.4, label='points')
for idx in samples:
    draw = df_trace_males3.iloc[idx]
    plt.plot(data.weight, draw.intercept + draw.slope * data.weight, alpha=0.1, c='black')
mean_line = df_trace_males3.intercept.mean() + df_trace_males3.slope.mean() * data.weight
plt.plot(data.weight, mean_line, c='r', label='projected regression')
plt.legend()
plt.show()
How does this model sample? Let us look at what the priors alone imply.
We know that each sampled line has the form $y = mx + b$.
# Prior predictive check: draw 100 (intercept, slope) pairs straight from
# the priors and plot the lines they imply over weights 100-300.
# The red horizontal lines at 107 and 22 are reference bounds — presumably
# extreme plausible human heights in inches (verify against course notes).
N = 100
b = stats.norm.rvs(60, 5, N)
m = stats.norm.rvs(0, 0.5, N)
x = np.linspace(100, 300, 250)

plt.figure(figsize=(13,6))
for bi, mi in zip(b, m):
    plt.plot(x, bi + mi * x, alpha=0.3, c='black')
plt.hlines(107, 100, 300, 'red')
plt.hlines(22, 100, 300, 'red')
plt.ylim(0, 130)
plt.show()
We can restrict the slopes to a better range by using a lognormal prior, which only produces positive values.
# Same prior predictive check, but with a lognormal slope prior so every
# sampled slope is positive (heavier people should not be predicted shorter).
N = 100
b = stats.norm.rvs(60, 5, N)
m = stats.lognorm.rvs(s=1, scale=0.05, size=N)
x = np.linspace(100, 300, 250)

plt.figure(figsize=(13,6))
for bi, mi in zip(b, m):
    plt.plot(x, bi + mi * x, alpha=0.3, c='black')
plt.hlines(107, 100, 300, 'red')
plt.hlines(22, 100, 300, 'red')
plt.ylim(0, 130)
plt.show()
# Refit the linear model with a Lognormal prior on the slope, restricting
# it to positive values; intercept prior unchanged.
with pm.Model() as Males3:
    slope = pm.Lognormal('slope', mu=0, sd=0.5)
    intercept = pm.Normal('intercept', mu=60, sd=5)
    mu_height = slope * data.weight + intercept
    heights_obs = pm.Normal("height", mu=mu_height, sd=data.height.std(), observed=data.height)
    trace = pm.sample(2000)

az.plot_trace(trace, figsize=(26,12))
print(az.summary(trace))
Auto-assigning NUTS sampler... Initializing NUTS using jitter+adapt_diag... Multiprocess sampling (4 chains in 4 jobs) NUTS: [intercept, slope] Sampling 4 chains, 0 divergences: 100%|██████████| 10000/10000 [00:13<00:00, 746.51draws/s] The acceptance probability does not match the target. It is 0.9091941357405479, but should be close to 0.8. Try to increase the number of tuning steps. The acceptance probability does not match the target. It is 0.8845536326953043, but should be close to 0.8. Try to increase the number of tuning steps. The acceptance probability does not match the target. It is 0.891706913071925, but should be close to 0.8. Try to increase the number of tuning steps. The number of effective samples is smaller than 25% for some parameters.
mean sd hdi_3% hdi_97% mcse_mean mcse_sd ess_mean \ intercept 60.157 0.458 59.342 61.077 0.012 0.009 1367.0 slope 0.031 0.003 0.025 0.037 0.000 0.000 1366.0 ess_sd ess_bulk ess_tail r_hat intercept 1367.0 1379.0 1287.0 1.0 slope 1342.0 1375.0 1278.0 1.0
# Visualize posterior uncertainty again: 50 sampled regression lines in
# black, the posterior-mean line in red, over the raw data points.
df_trace_males3 = pm.trace_to_dataframe(trace)
samples = np.random.choice(len(df_trace_males3), 50, replace=False)

plt.figure(figsize=(13,6))
plt.scatter(data.weight, data.height, alpha=0.4, label='points')
for idx in samples:
    draw = df_trace_males3.iloc[idx]
    plt.plot(data.weight, draw.intercept + draw.slope * data.weight, alpha=0.1, c='black')
mean_line = df_trace_males3.intercept.mean() + df_trace_males3.slope.mean() * data.weight
plt.plot(data.weight, mean_line, c='r', label='projected regression')
plt.legend()
plt.show()
# Draw 50 posterior-predictive samples of heights from the fitted model.
# NOTE(review): 50 < nchains*ndraws, hence the PyMC3 warning that some
# chains/draws are unrepresented. (Variable name keeps the original typo
# "predicitive" because later cells reference it.)
height_predicitive= pm.sample_posterior_predictive(trace,50, Males3)
/opt/anaconda3/envs/pymc3_3_6/lib/python3.6/site-packages/pymc3/sampling.py:1247: UserWarning: samples parameter is smaller than nchains times ndraws, some draws and/or chains may not be represented in the returned posterior predictive sample "samples parameter is smaller than nchains times ndraws, some draws " 100%|██████████| 50/50 [00:00<00:00, 218.28it/s]
# Highest-density interval of the predicted heights at each observation
# (ArviZ default 94% HDI).
height_predicitive_hdi = az.hdi(height_predicitive['height'])
/opt/anaconda3/envs/pymc3_3_6/lib/python3.6/site-packages/arviz/stats/stats.py:487: FutureWarning: hdi currently interprets 2d data as (draw, shape) but this will change in a future release to (chain, draw) for coherence with other functions FutureWarning,
# Shade the posterior-predictive HDI band over the raw weight/height data.
az.plot_hdi(data.weight,height_predicitive['height'], figsize=(13,6))
plt.scatter(data.weight,data.height, alpha=0.4)
<matplotlib.collections.PathCollection at 0x7f9cd09c60f0>
# Combine the posterior-predictive HDI band with the sampled regression
# lines and the posterior-mean line.
df_trace_males3 = pm.trace_to_dataframe(trace)
samples = np.random.choice(len(df_trace_males3), 50, replace=False)

az.plot_hdi(data.weight, height_predicitive['height'], figsize=(13,6))
plt.scatter(data.weight, data.height, alpha=0.4, label='points')
for idx in samples:
    draw = df_trace_males3.iloc[idx]
    plt.plot(data.weight, draw.intercept + draw.slope * data.weight, alpha=0.1, c='black')
mean_line = df_trace_males3.intercept.mean() + df_trace_males3.slope.mean() * data.weight
plt.plot(data.weight, mean_line, c='r', label='projected regression')
plt.legend()
plt.show()
/opt/anaconda3/envs/pymc3_3_6/lib/python3.6/site-packages/arviz/stats/stats.py:487: FutureWarning: hdi currently interprets 2d data as (draw, shape) but this will change in a future release to (chain, draw) for coherence with other functions FutureWarning,
# Reload the full dataset (both sexes) and add standardized weight columns
# for the quadratic models below.
dataX = pd.read_csv('data/MAT110Survey/110Survey_fixed.csv', index_col=0)
dataX.columns = ['sex', "height", "weight", 'shoe', 'tv', "soda", 'bmi']
# z-score of weight. Vectorized column arithmetic replaces the original
# row-wise `apply` — same values, one C-level pass instead of a Python
# call per row.
dataX['w_score'] = (dataX.weight - dataX.weight.mean()) / dataX.weight.std()
# Squared z-score, the quadratic term for the polynomial regression.
dataX['w_score2'] = dataX.w_score**2
dataX.head()
sex | height | weight | shoe | tv | soda | bmi | w_score | w_score2 | |
---|---|---|---|---|---|---|---|---|---|
0 | M | 71.0 | 162.0 | 10.0 | 35.0 | 7.0 | 22.591946 | -0.060349 | 0.003642 |
1 | M | 67.0 | 140.0 | 11.0 | 7.0 | 28.0 | 21.924705 | -0.551975 | 0.304676 |
2 | M | 75.0 | 177.0 | 12.0 | 3.0 | 3.0 | 22.121067 | 0.274850 | 0.075542 |
3 | M | 72.0 | 210.0 | 11.0 | 11.0 | 14.0 | 28.478009 | 1.012288 | 1.024726 |
4 | M | 72.0 | 180.0 | 11.0 | 20.0 | 15.0 | 24.409722 | 0.341889 | 0.116888 |
# Height vs standardized weight, split by sex (males plotted first so the
# default color cycle matches the other figures).
plt.figure(figsize=(13,6))
for code, lbl in (('M', 'Males'), ('F', 'Females')):
    subset = dataX[dataX.sex == code]
    plt.scatter(subset.w_score, subset.height, alpha=0.24, label=lbl)
plt.legend()
plt.show()
# Height vs squared standardized weight, split by sex; x-axis clipped to
# [0, 5] to hide the extreme squared scores.
plt.figure(figsize=(13,6))
for code, lbl in (('M', 'Males'), ('F', 'Females')):
    subset = dataX[dataX.sex == code]
    plt.scatter(subset.w_score2, subset.height, alpha=0.24, label=lbl)
plt.xlim(0,5)
plt.legend()
plt.show()
# Quadratic regression of male height on standardized weight:
#   height ~ Normal(b2*z^2 + slope*z + intercept, sd = observed height s.d.)
data = dataX[dataX.sex == 'M']
with pm.Model() as Males4:
    slope = pm.Lognormal('slope', mu=4, sd=3)
    intercept = pm.Normal('intercept', mu=60, sd=5)
    b2 = pm.Normal('b2', mu=4, sd=3)
    mu_height = b2 * data.w_score2 + slope * data.w_score + intercept
    heights_obs = pm.Normal("height", mu=mu_height, sd=data.height.std(), observed=data.height)
    trace4 = pm.sample(2000)

az.plot_trace(trace4, figsize=(26,12))
print(az.summary(trace4))
Auto-assigning NUTS sampler... Initializing NUTS using jitter+adapt_diag... Multiprocess sampling (4 chains in 4 jobs) NUTS: [b2, intercept, slope] Sampling 4 chains, 0 divergences: 100%|██████████| 10000/10000 [00:10<00:00, 954.56draws/s] The acceptance probability does not match the target. It is 0.8816433351190381, but should be close to 0.8. Try to increase the number of tuning steps.
mean sd hdi_3% hdi_97% mcse_mean mcse_sd ess_mean \ intercept 69.819 0.168 69.512 70.138 0.003 0.002 4478.0 b2 -0.588 0.108 -0.782 -0.377 0.002 0.001 3765.0 slope 3.022 0.246 2.529 3.459 0.004 0.003 3619.0 ess_sd ess_bulk ess_tail r_hat intercept 4478.0 4483.0 4342.0 1.0 b2 3765.0 3766.0 3754.0 1.0 slope 3619.0 3638.0 3287.0 1.0
# Posterior-predictive HDI band plus 50 sampled quadratic curves (dots,
# since w_score is unsorted) and the posterior-mean curve in red.
height_predicitive = pm.sample_posterior_predictive(trace4, 50, Males4)
df_trace_males4 = pm.trace_to_dataframe(trace4)
samples = np.random.choice(len(df_trace_males4), 50, replace=False)

az.plot_hdi(data.w_score, height_predicitive['height'], figsize=(13,6))
plt.scatter(data.w_score, data.height, alpha=0.4, label='points')
for idx in samples:
    draw = df_trace_males4.iloc[idx]
    curve = draw.intercept + draw.slope * data.w_score + draw.b2 * data.w_score2
    plt.plot(data.w_score, curve, '.', alpha=0.1, c='black')
mean_curve = (df_trace_males4.intercept.mean()
              + df_trace_males4.slope.mean() * data.w_score
              + df_trace_males4.b2.mean() * data.w_score**2)
plt.plot(data.w_score, mean_curve, '.', c='r', label='projected regression')
plt.legend()
plt.show()
/opt/anaconda3/envs/pymc3_3_6/lib/python3.6/site-packages/pymc3/sampling.py:1247: UserWarning: samples parameter is smaller than nchains times ndraws, some draws and/or chains may not be represented in the returned posterior predictive sample "samples parameter is smaller than nchains times ndraws, some draws " 100%|██████████| 50/50 [00:00<00:00, 376.14it/s] /opt/anaconda3/envs/pymc3_3_6/lib/python3.6/site-packages/arviz/stats/stats.py:487: FutureWarning: hdi currently interprets 2d data as (draw, shape) but this will change in a future release to (chain, draw) for coherence with other functions FutureWarning,
# Refit the same quadratic model on the female respondents.
# NOTE(review): the model is still named `Males4` even though it fits the
# female subset — kept to avoid breaking any later references.
data = dataX[dataX.sex == 'F']
with pm.Model() as Males4:
    slope = pm.Lognormal('slope', mu=4, sd=3)
    intercept = pm.Normal('intercept', mu=60, sd=5)
    b2 = pm.Normal('b2', mu=4, sd=3)
    mu_height = b2 * data.w_score2 + slope * data.w_score + intercept
    heights_obs = pm.Normal("height", mu=mu_height, sd=data.height.std(), observed=data.height)
    trace4 = pm.sample(2000)

az.plot_trace(trace4, figsize=(26,12))
print(az.summary(trace4))
plt.show()

# Posterior-predictive band and sampled curves over the female data.
height_predicitive = pm.sample_posterior_predictive(trace4, 50, Males4)
df_trace_males4 = pm.trace_to_dataframe(trace4)
samples = np.random.choice(len(df_trace_males4), 50, replace=False)

az.plot_hdi(data.w_score, height_predicitive['height'], figsize=(13,6))
plt.scatter(data.w_score, data.height, alpha=0.4, label='points')
for idx in samples:
    draw = df_trace_males4.iloc[idx]
    curve = draw.intercept + draw.slope * data.w_score + draw.b2 * data.w_score2
    plt.plot(data.w_score, curve, '.', alpha=0.1, c='black')
mean_curve = (df_trace_males4.intercept.mean()
              + df_trace_males4.slope.mean() * data.w_score
              + df_trace_males4.b2.mean() * data.w_score**2)
plt.plot(data.w_score, mean_curve, '.', c='r', label='projected regression')
plt.legend()
plt.show()
Auto-assigning NUTS sampler... Initializing NUTS using jitter+adapt_diag... Multiprocess sampling (4 chains in 4 jobs) NUTS: [b2, intercept, slope] Sampling 4 chains, 0 divergences: 100%|██████████| 10000/10000 [00:08<00:00, 1182.60draws/s] The acceptance probability does not match the target. It is 0.880619118080398, but should be close to 0.8. Try to increase the number of tuning steps.
mean sd hdi_3% hdi_97% mcse_mean mcse_sd ess_mean \ intercept 65.797 0.189 65.462 66.175 0.003 0.002 2998.0 b2 -0.403 0.066 -0.530 -0.280 0.001 0.001 2979.0 slope 1.935 0.198 1.550 2.292 0.003 0.002 3292.0 ess_sd ess_bulk ess_tail r_hat intercept 2998.0 3001.0 4081.0 1.0 b2 2979.0 2971.0 4289.0 1.0 slope 3292.0 3288.0 4144.0 1.0
/opt/anaconda3/envs/pymc3_3_6/lib/python3.6/site-packages/pymc3/sampling.py:1247: UserWarning: samples parameter is smaller than nchains times ndraws, some draws and/or chains may not be represented in the returned posterior predictive sample "samples parameter is smaller than nchains times ndraws, some draws " 100%|██████████| 50/50 [00:00<00:00, 248.66it/s] /opt/anaconda3/envs/pymc3_3_6/lib/python3.6/site-packages/arviz/stats/stats.py:487: FutureWarning: hdi currently interprets 2d data as (draw, shape) but this will change in a future release to (chain, draw) for coherence with other functions FutureWarning,
# Re-inspect the first rows (w_score / w_score2 columns now present).
dataX.head()
sex | height | weight | shoe | tv | soda | bmi | w_score | w_score2 | |
---|---|---|---|---|---|---|---|---|---|
0 | M | 71.0 | 162.0 | 10.0 | 35.0 | 7.0 | 22.591946 | -0.060349 | 0.003642 |
1 | M | 67.0 | 140.0 | 11.0 | 7.0 | 28.0 | 21.924705 | -0.551975 | 0.304676 |
2 | M | 75.0 | 177.0 | 12.0 | 3.0 | 3.0 | 22.121067 | 0.274850 | 0.075542 |
3 | M | 72.0 | 210.0 | 11.0 | 11.0 | 14.0 | 28.478009 | 1.012288 | 1.024726 |
4 | M | 72.0 | 180.0 | 11.0 | 20.0 | 15.0 | 24.409722 | 0.341889 | 0.116888 |
# BMI vs standardized weight, split by sex.
plt.figure(figsize=(13,6))
for code, lbl in (('M', 'Males'), ('F', 'Females')):
    subset = dataX[dataX.sex == code]
    plt.scatter(subset.w_score, subset.bmi, alpha=0.24, label=lbl)
plt.legend()
plt.show()
# BMI vs squared standardized weight, split by sex; x-axis clipped to [0, 5].
plt.figure(figsize=(13,6))
for code, lbl in (('M', 'Males'), ('F', 'Females')):
    subset = dataX[dataX.sex == code]
    plt.scatter(subset.w_score2, subset.bmi, alpha=0.24, label=lbl)
plt.xlim(0,5)
plt.legend()
plt.show()
# Another look at the dataframe before adding the binary sex column.
dataX.head()
sex | height | weight | shoe | tv | soda | bmi | w_score | w_score2 | |
---|---|---|---|---|---|---|---|---|---|
0 | M | 71.0 | 162.0 | 10.0 | 35.0 | 7.0 | 22.591946 | -0.060349 | 0.003642 |
1 | M | 67.0 | 140.0 | 11.0 | 7.0 | 28.0 | 21.924705 | -0.551975 | 0.304676 |
2 | M | 75.0 | 177.0 | 12.0 | 3.0 | 3.0 | 22.121067 | 0.274850 | 0.075542 |
3 | M | 72.0 | 210.0 | 11.0 | 11.0 | 14.0 | 28.478009 | 1.012288 | 1.024726 |
4 | M | 72.0 | 180.0 | 11.0 | 20.0 | 15.0 | 24.409722 | 0.341889 | 0.116888 |
# Binary-encode sex for the logistic regression: 1 = male, 0 = female.
# Vectorized comparison replaces the original per-row lambda `apply` —
# identical values, one C-level pass.
dataX['Mnum'] = (dataX.sex == "M").astype(int)
dataX.head(7)
sex | height | weight | shoe | tv | soda | bmi | w_score | w_score2 | Mnum | |
---|---|---|---|---|---|---|---|---|---|---|
0 | M | 71.0 | 162.0 | 10.0 | 35.0 | 7.0 | 22.591946 | -0.060349 | 0.003642 | 1 |
1 | M | 67.0 | 140.0 | 11.0 | 7.0 | 28.0 | 21.924705 | -0.551975 | 0.304676 | 1 |
2 | M | 75.0 | 177.0 | 12.0 | 3.0 | 3.0 | 22.121067 | 0.274850 | 0.075542 | 1 |
3 | M | 72.0 | 210.0 | 11.0 | 11.0 | 14.0 | 28.478009 | 1.012288 | 1.024726 | 1 |
4 | M | 72.0 | 180.0 | 11.0 | 20.0 | 15.0 | 24.409722 | 0.341889 | 0.116888 | 1 |
5 | F | 69.0 | 136.0 | 9.0 | 63.0 | 21.0 | 20.081495 | -0.641361 | 0.411344 | 0 |
6 | F | 70.0 | 162.0 | 10.0 | 1.0 | 0.0 | 23.242041 | -0.060349 | 0.003642 | 0 |
# Bayesian logistic regression: P(male) as a function of height and
# standardized weight, built via PyMC3's GLM formula interface.
with pm.Model() as modelX:
    pm.glm.GLM.from_formula('Mnum ~ height + w_score', dataX, family=pm.glm.families.Binomial())
    traceX = pm.sample(2000, tune=1000, init='adapt_diag')
Auto-assigning NUTS sampler... Initializing NUTS using adapt_diag... Multiprocess sampling (4 chains in 4 jobs) NUTS: [w_score, height, Intercept] Sampling 4 chains, 0 divergences: 100%|██████████| 12000/12000 [03:17<00:00, 60.84draws/s] The number of effective samples is smaller than 25% for some parameters.
# Trace plots and summary table for the logistic-regression posterior.
az.plot_trace(traceX, figsize=(26,12))
print(az.summary(traceX))
/opt/anaconda3/envs/pymc3_3_6/lib/python3.6/site-packages/arviz/data/io_pymc3.py:91: FutureWarning: Using `from_pymc3` without the model will be deprecated in a future release. Not using the model will return less accurate and less useful results. Make sure you use the model argument or call from_pymc3 within a model context. FutureWarning, /opt/anaconda3/envs/pymc3_3_6/lib/python3.6/site-packages/arviz/data/io_pymc3.py:91: FutureWarning: Using `from_pymc3` without the model will be deprecated in a future release. Not using the model will return less accurate and less useful results. Make sure you use the model argument or call from_pymc3 within a model context. FutureWarning,
mean sd hdi_3% hdi_97% mcse_mean mcse_sd ess_mean \ Intercept -25.022 1.617 -28.124 -22.087 0.038 0.027 1855.0 height 0.371 0.024 0.326 0.415 0.001 0.000 1858.0 w_score 1.086 0.108 0.890 1.293 0.002 0.001 2894.0 ess_sd ess_bulk ess_tail r_hat Intercept 1855.0 1850.0 2299.0 1.0 height 1858.0 1853.0 2355.0 1.0 w_score 2870.0 2896.0 2985.0 1.0
# Joint posterior density of the height and w_score coefficients
# (visualizes their correlation).
az.plot_kde(traceX['height'],traceX['w_score'], figsize=(13,6))
<AxesSubplot:>
# Same joint posterior as a hex-bin plot with marginal histograms.
sns.jointplot(traceX['height'],traceX['w_score'], kind='hex')
/opt/anaconda3/envs/pymc3_3_6/lib/python3.6/site-packages/seaborn/_decorators.py:43: FutureWarning: Pass the following variables as keyword args: x, y. From version 0.12, the only valid positional argument will be `data`, and passing other arguments without an explicit keyword will result in an error or misinterpretation. FutureWarning
<seaborn.axisgrid.JointGrid at 0x7f9c9227e550>
# Posterior mean of the height coefficient (log-odds per unit of height).
traceX.height.mean()
0.37101288563621493
# Point-estimate log-odds of being male for height 70 and w_score 2,
# then the inverse-logit to convert log-odds into a probability.
bob = traceX.height.mean()*70+traceX.w_score.mean()*2+traceX.Intercept.mean()
np.exp(bob)/(1+np.exp(bob))
0.9577086144859366
def predict_sex(w, h):
    """Posterior-mean probability that a respondent is male.

    w: raw weight, in the same units as dataX.weight (standardized internally).
    h: height, in the same units as dataX.height.
    Returns the inverse-logit of the posterior-mean linear predictor from
    the logistic-regression trace `traceX`.
    """
    z_weight = (w - dataX.weight.mean()) / dataX.weight.std()
    log_odds = (traceX.height.mean() * h
                + traceX.w_score.mean() * z_weight
                + traceX.Intercept.mean())
    odds = np.exp(log_odds)
    return odds / (1 + odds)
# Attach the model's P(male) to every row, computed from raw weight and height.
dataX['prediction'] = dataX.apply(lambda x: predict_sex(x.weight,x.height), axis =1)
# Decision boundary: evaluate P(male) on a 200x200 grid over
# (w_score, height) and draw the 0.5 contour over the data.
xs = np.linspace(dataX.w_score.min(), dataX.w_score.max(), 200)
ys = np.linspace(dataX.height.min(), dataX.height.max(), 200)
X, Y = np.meshgrid(xs, ys)
# Posterior-mean log-odds on the grid, then inverse-logit. The original
# evaluated the full-grid exp() expression twice; computing it once halves
# the work and reads more clearly — same values.
logits = traceX.height.mean() * Y + traceX.w_score.mean() * X + traceX.Intercept.mean()
odds = np.exp(logits)
Z = odds / (1 + odds)

plt.figure(figsize=(13,6))
plt.contour(X, Y, Z, levels=[0.5])
plt.scatter(dataX[dataX.sex=='M'].w_score, dataX[dataX.sex=='M'].height, alpha=0.24, label='Males')
plt.scatter(dataX[dataX.sex=='F'].w_score, dataX[dataX.sex=='F'].height, alpha=0.24, label='Females')
plt.xlim(-1.5, 2)
plt.legend()
plt.show()
# Classify everyone with a 0.5 threshold and highlight misclassified points.
ps = predict_sex(dataX.weight, dataX.height)
# Explicit boolean comparisons so `errs` has bool dtype. The original
# mixed a bool series with the int 0/1 `Mnum` column via `&`/`|`, which
# yields an int series — indexing with it is not boolean masking, so the
# red markers could land on the wrong rows. The error count is unchanged.
errs = ((ps < 0.5) & (dataX.Mnum == 1)) | ((ps >= 0.5) & (dataX.Mnum == 0))

plt.figure(figsize=(13,6))
plt.contour(X, Y, Z, levels=[0.5])
# Misclassified respondents drawn as large translucent red markers.
plt.scatter(dataX.w_score[errs], dataX.height[errs], facecolors='red', s=120, alpha=0.4)
plt.scatter(dataX[dataX.sex=='M'].w_score, dataX[dataX.sex=='M'].height, alpha=0.24, label='Males')
plt.scatter(dataX[dataX.sex=='F'].w_score, dataX[dataX.sex=='F'].height, alpha=0.24, label='Females')
#plt.xlim(-1.5,2)
plt.legend()
plt.show()
# Misclassification rate: fraction of respondents on the wrong side of
# the 0.5 decision boundary.
errs.sum()/len(errs)
0.15151515151515152
# Example prediction: P(male) for a respondent weighing 200 and 72 tall.
predict_sex(200,72)
0.9273834767034302