make sure you are using the pymc3 envoirnment
import pymc3 as pm
import matplotlib.pyplot as plt
import numpy as np
import seaborn as sns
from scipy import stats
import pandas as pd
import arviz as az
pm.__version__
'3.8'
We can use the data that we have already looked at, the survey that I gave in the MAT110 Class
dataX = pd.read_csv('data/MAT110Survey/110Survey_fixed.csv', index_col=0)
dataX.head(10)
Sex | Height | Weight | Shoe | tv | Soda | BMI | |
---|---|---|---|---|---|---|---|
0 | M | 71.0 | 162.0 | 10.0 | 35.0 | 7.0 | 22.591946 |
1 | M | 67.0 | 140.0 | 11.0 | 7.0 | 28.0 | 21.924705 |
2 | M | 75.0 | 177.0 | 12.0 | 3.0 | 3.0 | 22.121067 |
3 | M | 72.0 | 210.0 | 11.0 | 11.0 | 14.0 | 28.478009 |
4 | M | 72.0 | 180.0 | 11.0 | 20.0 | 15.0 | 24.409722 |
5 | F | 69.0 | 136.0 | 9.0 | 63.0 | 21.0 | 20.081495 |
6 | F | 70.0 | 162.0 | 10.0 | 1.0 | 0.0 | 23.242041 |
7 | F | 67.0 | 152.0 | 8.0 | 5.0 | 3.0 | 23.803965 |
8 | F | 61.0 | 135.0 | 8.0 | 9.0 | 7.0 | 25.505241 |
9 | F | 67.0 | 150.0 | 7.0 | 8.0 | 3.0 | 23.490755 |
dataX=pd.read_csv('data/110Statsfixed.csv', index_col=0)
dataX.columns = ['height','weight','shoe','tv','soda','sex']
dataX.head()
height | weight | shoe | tv | soda | sex | |
---|---|---|---|---|---|---|
0 | 71.0 | 162.0 | 10.0 | 35.0 | 7.0 | M |
1 | 69.0 | 136.0 | 9.0 | 63.0 | 21.0 | W |
2 | 67.0 | 140.0 | 11.0 | 7.0 | 28.0 | M |
3 | 70.0 | 162.0 | 10.0 | 1.0 | 0.0 | W |
4 | 75.0 | 177.0 | 12.0 | 3.0 | 3.0 | M |
plt.figure(figsize=(13,6))
data = dataX[dataX.sex=="M"]
plt.plot(data.weight,data.height,'.',alpha=0.3)
plt.show()
dataX.describe()
height | weight | shoe | tv | soda | |
---|---|---|---|---|---|
count | 1650.000000 | 1650.000000 | 1650.000000 | 1650.000000 | 1650.000000 |
mean | 67.603891 | 164.700606 | 9.466303 | 11.653333 | 7.055455 |
std | 5.059790 | 44.749529 | 2.432196 | 9.695225 | 8.041798 |
min | 40.000000 | 41.000000 | 3.000000 | 0.000000 | 0.000000 |
25% | 64.000000 | 130.000000 | 8.000000 | 5.000000 | 1.000000 |
50% | 68.000000 | 156.000000 | 9.500000 | 10.000000 | 5.000000 |
75% | 71.000000 | 189.750000 | 11.000000 | 15.000000 | 10.000000 |
max | 96.000000 | 500.000000 | 17.000000 | 96.000000 | 60.000000 |
sns.displot(dataX[dataX.sex=="M"].height)
<seaborn.axisgrid.FacetGrid at 0x7fd539a342e8>
data = dataX[dataX.height.isin(range(60,80))]
sns.displot(data[data.sex=="M"].height)
sns.displot(data[data.sex=="M"].weight)
<seaborn.axisgrid.FacetGrid at 0x7fd539311358>
plt.figure(figsize=(13,6))
#data = dataX[dataX.sex=="M"]
plt.plot(data.weight,data.height,'.',alpha=0.5)
plt.show()
data = dataX[(dataX.height.isin(range(60,80)))&(dataX.weight<=325)&(dataX.sex=="M")]
plt.figure(figsize=(13,6))
#data = dataX[dataX.sex=="M"]
plt.plot(data.weight,data.height,'.',alpha=0.5)
plt.show()
import warnings
warnings.filterwarnings('ignore',category=FutureWarning)
with pm.Model() as males:
pm.glm.GLM.from_formula('weight ~ height',data = data)
trace = pm.sample(1000, tune = 500)
Auto-assigning NUTS sampler... Initializing NUTS using jitter+adapt_diag... Multiprocess sampling (4 chains in 4 jobs) NUTS: [sd, height, Intercept] Sampling 4 chains, 0 divergences: 100%|██████████| 6000/6000 [01:06<00:00, 89.75draws/s] The acceptance probability does not match the target. It is 0.8847865393494039, but should be close to 0.8. Try to increase the number of tuning steps. The acceptance probability does not match the target. It is 0.9460953834355507, but should be close to 0.8. Try to increase the number of tuning steps. The acceptance probability does not match the target. It is 0.9985603667566193, but should be close to 0.8. Try to increase the number of tuning steps. The chain reached the maximum tree depth. Increase max_treedepth, increase target_accept or reparameterize. The rhat statistic is larger than 1.4 for some parameters. The sampler did not converge. The estimated number of effective samples is smaller than 200 for some parameters.
az.plot_trace(trace, figsize=(26,18))
plt.show()
Sex = "M"
accepted_height_data = dataX[(dataX.height>=dataX.height.quantile(0.05))&(dataX.height<=dataX.height.quantile(0.95))].height.unique()
accepted_weight_data = dataX[(dataX.weight>=dataX.weight.quantile(0.05))&(dataX.weight<=dataX.weight.quantile(0.95))].weight.unique()
data = dataX[(dataX.height.isin(accepted_height_data))&(dataX.weight.isin(accepted_weight_data))&(dataX.sex=="M")]
plt.figure(figsize=(13,6))
plt.plot(data.weight,data.height,'.',alpha=0.5)
plt.show()
dataM = dict(x=data.weight,y=data.height)
with pm.Model() as males:
pm.glm.GLM.from_formula('y ~ x',dataM)
trace = pm.sample(1000,chains=3)
Auto-assigning NUTS sampler... Initializing NUTS using jitter+adapt_diag... Multiprocess sampling (3 chains in 4 jobs) NUTS: [sd, x, Intercept] Sampling 3 chains, 71 divergences: 100%|██████████| 4500/4500 [00:14<00:00, 304.16draws/s] The acceptance probability does not match the target. It is 0.9868823014107456, but should be close to 0.8. Try to increase the number of tuning steps. There were 71 divergences after tuning. Increase `target_accept` or reparameterize. The acceptance probability does not match the target. It is 0.25164202199799596, but should be close to 0.8. Try to increase the number of tuning steps. The acceptance probability does not match the target. It is 0.9941000327499986, but should be close to 0.8. Try to increase the number of tuning steps. The rhat statistic is larger than 1.05 for some parameters. This indicates slight problems during sampling. The estimated number of effective samples is smaller than 200 for some parameters.
az.plot_trace(trace,figsize=(26,18))
plt.show()
az.summary(trace)
mean | sd | hdi_3% | hdi_97% | mcse_mean | mcse_sd | ess_mean | ess_sd | ess_bulk | ess_tail | r_hat | |
---|---|---|---|---|---|---|---|---|---|---|---|
Intercept | 64.374 | 0.677 | 63.169 | 65.661 | 0.043 | 0.030 | 253.0 | 252.0 | 254.0 | 285.0 | 1.01 |
x | 0.032 | 0.004 | 0.025 | 0.039 | 0.000 | 0.000 | 251.0 | 251.0 | 260.0 | 247.0 | 1.01 |
sd | 2.838 | 0.076 | 2.700 | 2.971 | 0.010 | 0.007 | 58.0 | 58.0 | 59.0 | 134.0 | 1.05 |
plt.figure(figsize=(13,6))
plt.scatter(data.weight,data.height, alpha = 0.4, label = 'points')
plt.legend()
plt.show()
df_trace = pm.trace_to_dataframe(trace)
df_trace.head()
Intercept | x | sd | |
---|---|---|---|
0 | 63.708910 | 0.035199 | 2.649113 |
1 | 63.787917 | 0.035953 | 2.938869 |
2 | 64.559063 | 0.031254 | 2.728511 |
3 | 64.821427 | 0.029917 | 2.875835 |
4 | 65.881462 | 0.024160 | 2.781681 |
sns.jointplot(x = "x", y = "Intercept",data=df_trace)
<seaborn.axisgrid.JointGrid at 0x7fd54384b518>
samples = np.random.choice(len(df_trace),50,replace=False)
plt.figure(figsize=(13,6))
plt.scatter(data.weight,data.height, alpha = 0.4, label = 'points')
for c in samples:
plt.plot(data.weight,df_trace.iloc[c].Intercept+df_trace.iloc[c].x*data.weight,alpha =0.13,c = 'black')
plt.plot(data.weight,df_trace.Intercept.mean()+df_trace.x.mean()*data.weight,c = 'r',label='prediction')
plt.legend()
plt.show()
Sex = "M"
accepted_height_data = dataX[(dataX.height>=dataX.height.quantile(0.05))&(dataX.height<=dataX.height.quantile(0.95))].height.unique()
accepted_weight_data = dataX[(dataX.weight>=dataX.weight.quantile(0.05))&(dataX.weight<=dataX.weight.quantile(0.95))].weight.unique()
data = dataX[(dataX.height.isin(accepted_height_data))&(dataX.weight.isin(accepted_weight_data))&(dataX.sex=="M")]
with pm.Model() as Males:
slope = pm.Normal('slope', mu = 0, sd =4 )
intercept = pm.Normal('intercept', mu = 60, sd = 10)
heights_est = slope*data.weight + intercept
heights_obs = pm.Normal('height', mu = heights_est, sd = data.height.std(), observed = data.height)
trace = pm.sample(1000)
Auto-assigning NUTS sampler... Initializing NUTS using jitter+adapt_diag... Multiprocess sampling (4 chains in 4 jobs) NUTS: [intercept, slope] Sampling 4 chains, 0 divergences: 100%|██████████| 6000/6000 [00:03<00:00, 1894.13draws/s] The acceptance probability does not match the target. It is 0.8944038831705001, but should be close to 0.8. Try to increase the number of tuning steps. The number of effective samples is smaller than 25% for some parameters.
az.plot_trace(trace,figsize=(26,18))
plt.show()
az.summary(trace)
mean | sd | hdi_3% | hdi_97% | mcse_mean | mcse_sd | ess_mean | ess_sd | ess_bulk | ess_tail | r_hat | |
---|---|---|---|---|---|---|---|---|---|---|---|
slope | 0.033 | 0.004 | 0.025 | 0.040 | 0.000 | 0.000 | 3605.0 | 3605.0 | 3602.0 | 4433.0 | 1.0 |
intercept | 64.308 | 0.704 | 63.034 | 65.645 | 0.012 | 0.008 | 3617.0 | 3615.0 | 3613.0 | 4466.0 | 1.0 |
df_trace = pm.trace_to_dataframe(trace)
samples = np.random.choice(len(df_trace),50,replace=False)
plt.figure(figsize=(13,6))
plt.scatter(data.weight,data.height, alpha = 0.4, label = 'points')
for c in samples:
plt.plot(data.weight,df_trace.iloc[c].intercept+df_trace.iloc[c].slope*data.weight,alpha =0.13,c = 'black')
plt.plot(data.weight,df_trace.intercept.mean()+df_trace.slope.mean()*data.weight,c = 'r',label='prediction')
plt.legend()
plt.show()
Sex = "M"
accepted_height_data = dataX[(dataX.height>=dataX.height.quantile(0.05))&(dataX.height<=dataX.height.quantile(0.95))].height.unique()
accepted_weight_data = dataX[(dataX.weight>=dataX.weight.quantile(0.05))&(dataX.weight<=dataX.weight.quantile(0.95))].weight.unique()
data = pd.DataFrame(dataX[(dataX.height.isin(accepted_height_data))&(dataX.weight.isin(accepted_weight_data))&(dataX.sex=="M")])
with pm.Model() as Males:
slope = pm.Normal('slope', mu = 0, sd =4 )
intercept = pm.Normal('intercept', mu = 60, sd = 10)
heights_est = slope*data.weight + intercept
heights_obs = pm.Normal('height', mu = heights_est, sd = data.height.std(), observed = data.height)
trace = pm.sample(1000, tune=750)
az.plot_trace(trace,figsize=(26,18))
plt.show()
Auto-assigning NUTS sampler... Initializing NUTS using jitter+adapt_diag... Multiprocess sampling (4 chains in 4 jobs) NUTS: [intercept, slope] Sampling 4 chains, 0 divergences: 100%|██████████| 7000/7000 [00:03<00:00, 1880.07draws/s] The acceptance probability does not match the target. It is 0.9085373343888846, but should be close to 0.8. Try to increase the number of tuning steps. The number of effective samples is smaller than 25% for some parameters.
with pm.Model() as Males:
a = pm.Normal('a', mu = 0, sd =4 )
b = pm.Normal('b', mu = 60, sd = 10)
a2 = pm.Normal('a2', mu = 0, sd =4 )
heights_est = a*data.weight + b + a2*(data.weight-data.weight.mean())
heights_obs = pm.Normal('height', mu = heights_est, sd = data.height.std(), observed = data.height)
trace = pm.sample(1000)
az.plot_trace(trace,figsize=(26,18))
plt.show()
Auto-assigning NUTS sampler... Initializing NUTS using jitter+adapt_diag... Multiprocess sampling (4 chains in 4 jobs) NUTS: [a2, b, a] Sampling 4 chains, 0 divergences: 100%|██████████| 6000/6000 [00:15<00:00, 376.72draws/s] The acceptance probability does not match the target. It is 0.8939386710178403, but should be close to 0.8. Try to increase the number of tuning steps. The number of effective samples is smaller than 10% for some parameters.