import pandas as pd # Manipulating dataframes, boolean logic
import numpy as np # numerical play stuff
import matplotlib.pyplot as plt # plotting functions
import seaborn as sns # prettier plotting
#code to make stuff appear
%matplotlib inline
%config InlineBackend.figure_format='retina' #Sharp graphs, higher resolution
dataX = pd.read_csv('data/MAT110Survey/110Statdata.csv')
dataX.head(10)
Q | M | M .1 | M .2 | M .3 | M .4 | F | F.1 | F.2 | F.3 | F.4 | |
---|---|---|---|---|---|---|---|---|---|---|---|
0 | 1 | 71.0 | 67.0 | 75 | 72.0 | 72.0 | 69.0 | 70.0 | 67 | 61 | 67 |
1 | 2 | 162.0 | 140.0 | 177 | 210.0 | 180.0 | 136.0 | 162.0 | 152 | 135 | 150 |
2 | 3 | 10.0 | 11.0 | 12 | 11.0 | 11.0 | 9.0 | 10.0 | 8 | 8 | 7 |
3 | 4 | 35.0 | 7.0 | 3 | 11.0 | 20.0 | 63.0 | 1.0 | 5 | 9 | 8 |
4 | 5 | 7.0 | 28.0 | 3 | 14.0 | 15.0 | 21.0 | 0.0 | 3 | 7 | 3 |
5 | 1 | 70.0 | 70.0 | 72 | 71.0 | 71.0 | 68.0 | 65.0 | 62 | 64 | 64 |
6 | 2 | 140.0 | 175.0 | 190 | 240.0 | 280.0 | 124.0 | 140.0 | 110 | 147 | 125 |
7 | 3 | 10.0 | 8.0 | 11 | 11.0 | 11.0 | 7.0 | 9.0 | 7.5 | 8 | 6.5 |
8 | 4 | 14.0 | 2.0 | 2 | 6.0 | 10.0 | 4.0 | 5.0 | 1 | 10 | 20 |
9 | 5 | 5.0 | 0.0 | 20 | 12.0 | 10.0 | 10.0 | 2.0 | 1 | 28 | 3 |
dataX.info()
<class 'pandas.core.frame.DataFrame'> RangeIndex: 825 entries, 0 to 824 Data columns (total 11 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 Q 825 non-null int64 1 M 825 non-null float64 2 M .1 825 non-null float64 3 M .2 825 non-null object 4 M .3 825 non-null float64 5 M .4 825 non-null float64 6 F 825 non-null float64 7 F.1 825 non-null float64 8 F.2 825 non-null object 9 F.3 825 non-null object 10 F.4 824 non-null object dtypes: float64(6), int64(1), object(4) memory usage: 71.0+ KB
Horrid, unusable format, so the first job is to reshape it.
The columns should be the questions plus the respondent's sex,
and each row should be one individual.
len(dataX)
825
dataX.transpose()
0 | 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | ... | 815 | 816 | 817 | 818 | 819 | 820 | 821 | 822 | 823 | 824 | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
Q | 1 | 2 | 3 | 4 | 5 | 1 | 2 | 3 | 4 | 5 | ... | 1 | 2 | 3 | 4 | 5 | 1 | 2 | 3 | 4 | 5 |
M | 71 | 162 | 10 | 35 | 7 | 70 | 140 | 10 | 14 | 5 | ... | 72 | 172 | 12 | 5 | 0 | 71 | 149 | 12 | 15 | 5 |
M .1 | 67 | 140 | 11 | 7 | 28 | 70 | 175 | 8 | 2 | 0 | ... | 69 | 180 | 12.5 | 3 | 3 | 70 | 164 | 11.5 | 30 | 11 |
M .2 | 75 | 177 | 12 | 3 | 3 | 72 | 190 | 11 | 2 | 20 | ... | 74 | 176 | 10 | 7 | 5 | 71 | 127 | 10 | 25 | 0 |
M .3 | 72 | 210 | 11 | 11 | 14 | 71 | 240 | 11 | 6 | 12 | ... | 67 | 192 | 11 | 8 | 4 | 72 | 213 | 12 | 0 | 0 |
M .4 | 72 | 180 | 11 | 20 | 15 | 71 | 280 | 11 | 10 | 10 | ... | 68 | 201 | 13 | 6 | 6 | 68 | 180 | 11 | 30 | 20 |
F | 69 | 136 | 9 | 63 | 21 | 68 | 124 | 7 | 4 | 10 | ... | 62 | 154 | 8.5 | 6 | 0 | 71 | 148 | 6 | 15 | 1 |
F.1 | 70 | 162 | 10 | 1 | 0 | 65 | 140 | 9 | 5 | 2 | ... | 64 | 132 | 9 | 8 | 1 | 65 | 110 | 4 | 5 | 0 |
F.2 | 67 | 152 | 8 | 5 | 3 | 62 | 110 | 7.5 | 1 | 1 | ... | 70 | 128 | 9 | 3 | 2 | 64 | 116 | 5 | 5 | 0 |
F.3 | 61 | 135 | 8 | 9 | 7 | 64 | 147 | 8 | 10 | 28 | ... | 63 | 167 | 7.5 | 2 | 5 | 70 | 134 | 8 | 15 | 0 |
F.4 | 67 | 150 | 7 | 8 | 3 | 64 | 125 | 6.5 | 20 | 3 | ... | 62 | 133 | 7 | 5 | 0 | 73 | 185 | 7 | 30 | 5 |
11 rows × 825 columns
dataX[0:5].transpose().reset_index()
index | 0 | 1 | 2 | 3 | 4 | |
---|---|---|---|---|---|---|
0 | Q | 1 | 2 | 3 | 4 | 5 |
1 | M | 71 | 162 | 10 | 35 | 7 |
2 | M .1 | 67 | 140 | 11 | 7 | 28 |
3 | M .2 | 75 | 177 | 12 | 3 | 3 |
4 | M .3 | 72 | 210 | 11 | 11 | 14 |
5 | M .4 | 72 | 180 | 11 | 20 | 15 |
6 | F | 69 | 136 | 9 | 63 | 21 |
7 | F.1 | 70 | 162 | 10 | 1 | 0 |
8 | F.2 | 67 | 152 | 8 | 5 | 3 |
9 | F.3 | 61 | 135 | 8 | 9 | 7 |
10 | F.4 | 67 | 150 | 7 | 8 | 3 |
dataX[5:10].transpose().reset_index()
index | 5 | 6 | 7 | 8 | 9 | |
---|---|---|---|---|---|---|
0 | Q | 1 | 2 | 3 | 4 | 5 |
1 | M | 70 | 140 | 10 | 14 | 5 |
2 | M .1 | 70 | 175 | 8 | 2 | 0 |
3 | M .2 | 72 | 190 | 11 | 2 | 20 |
4 | M .3 | 71 | 240 | 11 | 6 | 12 |
5 | M .4 | 71 | 280 | 11 | 10 | 10 |
6 | F | 68 | 124 | 7 | 4 | 10 |
7 | F.1 | 65 | 140 | 9 | 5 | 2 |
8 | F.2 | 62 | 110 | 7.5 | 1 | 1 |
9 | F.3 | 64 | 147 | 8 | 10 | 28 |
10 | F.4 | 64 | 125 | 6.5 | 20 | 3 |
# Prototype of the reshape on the first two 5-row slices: transpose each slice,
# use [1:11] to drop the leading 'Q' row, and stack the remaining rows.
np.concatenate((dataX[0:5].transpose().reset_index()[1:11].values,dataX[5:10].transpose().reset_index()[1:11].values))
array([['M ', 71.0, 162.0, 10.0, 35.0, 7.0], ['M .1', 67.0, 140.0, 11.0, 7.0, 28.0], ['M .2', '75', '177', '12', '3', '3'], ['M .3', 72.0, 210.0, 11.0, 11.0, 14.0], ['M .4', 72.0, 180.0, 11.0, 20.0, 15.0], ['F', 69.0, 136.0, 9.0, 63.0, 21.0], ['F.1', 70.0, 162.0, 10.0, 1.0, 0.0], ['F.2', '67', '152', '8', '5', '3'], ['F.3', '61', '135', '8', '9', '7'], ['F.4', '67', '150', '7', '8', '3'], ['M ', 70.0, 140.0, 10.0, 14.0, 5.0], ['M .1', 70.0, 175.0, 8.0, 2.0, 0.0], ['M .2', '72', '190', '11', '2', '20'], ['M .3', 71.0, 240.0, 11.0, 6.0, 12.0], ['M .4', 71.0, 280.0, 11.0, 10.0, 10.0], ['F', 68.0, 124.0, 7.0, 4.0, 10.0], ['F.1', 65.0, 140.0, 9.0, 5.0, 2.0], ['F.2', '62', '110', '7.5', '1', '1'], ['F.3', '64', '147', '8', '10', '28'], ['F.4', '64', '125', '6.5', '20', '3']], dtype=object)
len(dataX)//5
165
# Reshape the wide survey table into one row per respondent.
# Each run of 5 consecutive rows in dataX holds questions 1-5; transposing a
# 5-row slice turns every M/F column into a row, and [1:11] drops the leading
# 'Q' row so only the ten respondent rows remain.
header = [['Sex','Height','Weight','Shoe','tv','Soda']]
blocks = [
    dataX[5*c:5*c + 5].transpose().reset_index()[1:11].values
    for c in range(len(dataX)//5)
]
# Concatenate once instead of re-copying the growing array on every iteration.
New_set = np.concatenate([header] + blocks)
dataY=pd.DataFrame(New_set)
# Row 0 is the header row; reuse it as the column labels.  The row itself is
# still present as data and has to be dropped afterwards.
dataY.columns=dataY.iloc[0]
dataY.head()
Sex | Height | Weight | Shoe | tv | Soda | |
---|---|---|---|---|---|---|
0 | Sex | Height | Weight | Shoe | tv | Soda |
1 | M | 71 | 162 | 10 | 35 | 7 |
2 | M .1 | 67 | 140 | 11 | 7 | 28 |
3 | M .2 | 75 | 177 | 12 | 3 | 3 |
4 | M .3 | 72 | 210 | 11 | 11 | 14 |
dataY.iloc[3]["tv"]
'3'
dataY.tv[11]
14.0
# Drop the header row that was carried along as data row 0.  Take an explicit
# copy so later assignments modify dataY itself rather than a slice of the old
# frame (which is what triggers pandas' SettingWithCopyWarning).
dataY = dataY.iloc[1:].copy()
# Labels such as 'M .1' / 'F.3' differ only in their suffix; keep the first letter.
dataY['Sex'] = dataY['Sex'].str[0]
dataY.tail()
Sex | Height | Weight | Shoe | tv | Soda | |
---|---|---|---|---|---|---|
1646 | F | 71 | 148 | 6 | 15 | 1 |
1647 | F | 65 | 110 | 4 | 5 | 0 |
1648 | F | 64 | 116 | 5 | 5 | 0 |
1649 | F | 70 | 134 | 8 | 15 | 0 |
1650 | F | 73 | 185 | 7 | 30 | 5 |
dataY=dataY.reset_index(drop=True)
dataY.head()
Sex | Height | Weight | Shoe | tv | Soda | |
---|---|---|---|---|---|---|
0 | M | 71 | 162 | 10 | 35 | 7 |
1 | M | 67 | 140 | 11 | 7 | 28 |
2 | M | 75 | 177 | 12 | 3 | 3 |
3 | M | 72 | 210 | 11 | 11 | 14 |
4 | M | 72 | 180 | 11 | 20 | 15 |
dataY.tail()
Sex | Height | Weight | Shoe | tv | Soda | |
---|---|---|---|---|---|---|
1645 | F | 71 | 148 | 6 | 15 | 1 |
1646 | F | 65 | 110 | 4 | 5 | 0 |
1647 | F | 64 | 116 | 5 | 5 | 0 |
1648 | F | 70 | 134 | 8 | 15 | 0 |
1649 | F | 73 | 185 | 7 | 30 | 5 |
dataY.info()
<class 'pandas.core.frame.DataFrame'> RangeIndex: 1650 entries, 0 to 1649 Data columns (total 6 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 Sex 1650 non-null object 1 Height 1650 non-null object 2 Weight 1650 non-null object 3 Shoe 1650 non-null object 4 tv 1649 non-null object 5 Soda 1650 non-null object dtypes: object(6) memory usage: 77.5+ KB
dataY.Height.str.isnumeric()
0 NaN 1 NaN 2 True 3 NaN 4 NaN ... 1645 NaN 1646 NaN 1647 True 1648 True 1649 True Name: Height, Length: 1650, dtype: object
dataY.Height = dataY.Height.astype(float)
--------------------------------------------------------------------------- ValueError Traceback (most recent call last) <ipython-input-44-1fe343f08823> in <module> ----> 1 dataY.Height = dataY.Height.astype(float) /opt/anaconda3/lib/python3.8/site-packages/pandas/core/generic.py in astype(self, dtype, copy, errors) 5544 else: 5545 # else, only a single dtype is given -> 5546 new_data = self._mgr.astype(dtype=dtype, copy=copy, errors=errors,) 5547 return self._constructor(new_data).__finalize__(self, method="astype") 5548 /opt/anaconda3/lib/python3.8/site-packages/pandas/core/internals/managers.py in astype(self, dtype, copy, errors) 593 self, dtype, copy: bool = False, errors: str = "raise" 594 ) -> "BlockManager": --> 595 return self.apply("astype", dtype=dtype, copy=copy, errors=errors) 596 597 def convert( /opt/anaconda3/lib/python3.8/site-packages/pandas/core/internals/managers.py in apply(self, f, align_keys, **kwargs) 404 applied = b.apply(f, **kwargs) 405 else: --> 406 applied = getattr(b, f)(**kwargs) 407 result_blocks = _extend_blocks(applied, result_blocks) 408 /opt/anaconda3/lib/python3.8/site-packages/pandas/core/internals/blocks.py in astype(self, dtype, copy, errors) 593 vals1d = values.ravel() 594 try: --> 595 values = astype_nansafe(vals1d, dtype, copy=True) 596 except (ValueError, TypeError): 597 # e.g. astype_nansafe can fail on object-dtype of strings /opt/anaconda3/lib/python3.8/site-packages/pandas/core/dtypes/cast.py in astype_nansafe(arr, dtype, copy, skipna) 993 if copy or is_object_dtype(arr) or is_object_dtype(dtype): 994 # Explicit copy, or required since NumPy can't view from / to object. --> 995 return arr.astype(dtype, copy=True) 996 997 return arr.view(dtype) ValueError: could not convert string to float: '70 3/4'
dataY[dataY.Height=="70 3/4"]
Sex | Height | Weight | Shoe | tv | Soda | |
---|---|---|---|---|---|---|
1437 | F | 70 3/4 | 170 | 10 1/2 | 8 | 5 |
dataY[1427:1447]
Sex | Height | Weight | Shoe | tv | Soda | |
---|---|---|---|---|---|---|
1427 | F | 74 | 180 | 11 | 6 | 2 |
1428 | F | 68 | 140 | 9 | 7 | 1 |
1429 | F | 65 | 203 | 7.5 | 14 | 4 |
1430 | M | 68 | 139 | 11 | 0 | 0 |
1431 | M | 72 | 155 | 12 | 1 | 1 |
1432 | M | 70 | 130 | 8 1/2 | 10 | 5 |
1433 | M | 72 | 220 | 15 | 96 | 0 |
1434 | M | 73 | 220 | 13 | 4 | 10 |
1435 | F | 61 | 125 | 8 | 10 | 0 |
1436 | F | 60 | 120 | 6 | 3 | 3 |
1437 | F | 70 3/4 | 170 | 10 1/2 | 8 | 5 |
1438 | F | 62 | 120 | 5 1/2 | 5 | 0 |
1439 | F | 63 | 132 | 7 | 5 | 6 |
1440 | M | 72 | 250 | 11 | 35 | 2 |
1441 | M | 76 | 300 | 12 | 50 | 14 |
1442 | M | 73 | 200 | 11 | 0 | 0 |
1443 | M | 78 | 196 | 10 | 5 | 1 |
1444 | M | 72 | 208 | 13 | 20 | 6 |
1445 | F | 65 | 200 | 9 | 35 | 25 |
1446 | F | 62 | 165 | 10 | 2 | 4 |
from fractions import Fraction


def _to_number(value):
    """Convert one raw survey answer to a float.

    Strings are split on whitespace and every piece is parsed as a Fraction,
    so plain numbers ('70', '7.5') and mixed fractions ('70 3/4' -> 70.75) are
    both accepted.  Blank strings become NaN.  Non-string values (already
    numeric, or missing) are returned unchanged.  Any other text raises
    ValueError, just as astype(float) would.
    """
    if not isinstance(value, str):
        return value
    pieces = value.split()
    if not pieces:
        return np.nan
    return float(sum(Fraction(piece) for piece in pieces))


# Hand-patching single cells (Height[1437], Shoe[1438]) misses the other
# answers written as mixed fractions ('8 1/2', '10 1/2'), and chained indexing
# may write to a temporary copy.  Parse every value in every numeric column.
for column in ['Height', 'Weight', 'Shoe', 'tv', 'Soda']:
    dataY[column] = dataY[column].map(_to_number).astype(float)
--------------------------------------------------------------------------- ValueError Traceback (most recent call last) <ipython-input-51-9da1037f56ae> in <module> 3 dataY.Shoe = dataY.Shoe.astype(float) 4 dataY.tv = dataY.tv.astype(float) ----> 5 dataY.Soda = dataY.Soda.astype(float) /opt/anaconda3/lib/python3.8/site-packages/pandas/core/generic.py in astype(self, dtype, copy, errors) 5544 else: 5545 # else, only a single dtype is given -> 5546 new_data = self._mgr.astype(dtype=dtype, copy=copy, errors=errors,) 5547 return self._constructor(new_data).__finalize__(self, method="astype") 5548 /opt/anaconda3/lib/python3.8/site-packages/pandas/core/internals/managers.py in astype(self, dtype, copy, errors) 593 self, dtype, copy: bool = False, errors: str = "raise" 594 ) -> "BlockManager": --> 595 return self.apply("astype", dtype=dtype, copy=copy, errors=errors) 596 597 def convert( /opt/anaconda3/lib/python3.8/site-packages/pandas/core/internals/managers.py in apply(self, f, align_keys, **kwargs) 404 applied = b.apply(f, **kwargs) 405 else: --> 406 applied = getattr(b, f)(**kwargs) 407 result_blocks = _extend_blocks(applied, result_blocks) 408 /opt/anaconda3/lib/python3.8/site-packages/pandas/core/internals/blocks.py in astype(self, dtype, copy, errors) 593 vals1d = values.ravel() 594 try: --> 595 values = astype_nansafe(vals1d, dtype, copy=True) 596 except (ValueError, TypeError): 597 # e.g. astype_nansafe can fail on object-dtype of strings /opt/anaconda3/lib/python3.8/site-packages/pandas/core/dtypes/cast.py in astype_nansafe(arr, dtype, copy, skipna) 993 if copy or is_object_dtype(arr) or is_object_dtype(dtype): 994 # Explicit copy, or required since NumPy can't view from / to object. --> 995 return arr.astype(dtype, copy=True) 996 997 return arr.view(dtype) ValueError: could not convert string to float: ''
dataY[dataY.Soda==" "]
Sex | Height | Weight | Shoe | tv | Soda | |
---|---|---|---|---|---|---|
1279 | F | 65.0 | 178.0 | 8.0 | 12.0 |
# Treat the blank Soda answer in row 1279 as 0.  A single .loc call writes into
# dataY itself; chained indexing (dataY.Soda[1279] = ...) may write to a
# temporary copy and raises SettingWithCopyWarning.
dataY.loc[1279, 'Soda'] = 0
<ipython-input-53-406c15af3e6b>:1: SettingWithCopyWarning: A value is trying to be set on a copy of a slice from a DataFrame See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy dataY.Soda[1279]=0
# Rebuild the frame and cast all five measurement columns to float in one call.
dataY = pd.DataFrame(dataY).astype(
    dict.fromkeys(['Height', 'Weight', 'Shoe', 'tv', 'Soda'], float)
)
dataY.tail()
Sex | Height | Weight | Shoe | tv | Soda | |
---|---|---|---|---|---|---|
1645 | F | 71.0 | 148.0 | 6.0 | 15.0 | 1.0 |
1646 | F | 65.0 | 110.0 | 4.0 | 5.0 | 0.0 |
1647 | F | 64.0 | 116.0 | 5.0 | 5.0 | 0.0 |
1648 | F | 70.0 | 134.0 | 8.0 | 15.0 | 0.0 |
1649 | F | 73.0 | 185.0 | 7.0 | 30.0 | 5.0 |
len(dataY)*5
8250
plt.hist(dataY.Soda)
(array([927., 386., 167., 92., 40., 21., 6., 1., 7., 3.]), array([ 0., 6., 12., 18., 24., 30., 36., 42., 48., 54., 60.]), <BarContainer object of 10 artists>)
# Body-mass index from Weight and Height.  The 703 factor is presumably the
# usual pounds/inches conversion — TODO confirm the survey's units.
dataY['BMI'] = dataY.Weight/(dataY.Height)**2*703
plt.hist(dataY.BMI)
(array([124., 966., 431., 103., 21., 4., 0., 0., 0., 1.]), array([11.9005848 , 19.03005294, 26.15952109, 33.28898924, 40.41845739, 47.54792553, 54.67739368, 61.80686183, 68.93632998, 76.06579812, 83.19526627]), <BarContainer object of 10 artists>)
dataY.to_csv('data/MAT110Survey/110Survey_fixed.csv')
dataY.to_json('data/MAT110Survey/110Survey_fixed.json')
import pandas as pd # Manipulating dataframes, boolean logic
import numpy as np # numerical play stuff
import matplotlib.pyplot as plt # plotting functions
import seaborn as sns # prettier plotting
import statsmodels.api as sm
#code to make stuff appear
%matplotlib inline
%config InlineBackend.figure_format='retina' #Sharp graphs, higher resolution
dataX = pd.read_csv('data/MAT110Survey/110Survey_fixed.csv', index_col=0)
dataX.head()
Sex | Height | Weight | Shoe | tv | Soda | BMI | |
---|---|---|---|---|---|---|---|
0 | M | 71.0 | 162.0 | 10.0 | 35.0 | 7.0 | 22.591946 |
1 | M | 67.0 | 140.0 | 11.0 | 7.0 | 28.0 | 21.924705 |
2 | M | 75.0 | 177.0 | 12.0 | 3.0 | 3.0 | 22.121067 |
3 | M | 72.0 | 210.0 | 11.0 | 11.0 | 14.0 | 28.478009 |
4 | M | 72.0 | 180.0 | 11.0 | 20.0 | 15.0 | 24.409722 |
# Only the 'tv' label changes.  Rename it by name instead of overwriting all
# seven labels positionally, which would silently mislabel the data if the
# CSV's column order ever changed.
dataX = dataX.rename(columns={'tv': 'TV'})
dataX.info()
<class 'pandas.core.frame.DataFrame'> Int64Index: 1650 entries, 0 to 1649 Data columns (total 7 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 Sex 1650 non-null object 1 Height 1650 non-null float64 2 Weight 1650 non-null float64 3 Shoe 1650 non-null float64 4 TV 1649 non-null float64 5 Soda 1650 non-null float64 6 BMI 1650 non-null float64 dtypes: float64(6), object(1) memory usage: 103.1+ KB
dataX.describe()
Height | Weight | Shoe | TV | Soda | BMI | |
---|---|---|---|---|---|---|
count | 1650.000000 | 1650.000000 | 1650.000000 | 1649.000000 | 1650.000000 | 1650.000000 |
mean | 67.603891 | 164.700606 | 9.466303 | 11.660400 | 7.055455 | 25.144393 |
std | 5.059790 | 44.749529 | 2.432196 | 9.693915 | 8.041798 | 5.480159 |
min | 40.000000 | 41.000000 | 3.000000 | 0.000000 | 0.000000 | 11.900585 |
25% | 64.000000 | 130.000000 | 8.000000 | 5.000000 | 1.000000 | 21.410418 |
50% | 68.000000 | 156.000000 | 9.500000 | 10.000000 | 5.000000 | 24.126627 |
75% | 71.000000 | 189.750000 | 11.000000 | 15.000000 | 10.000000 | 27.703134 |
max | 96.000000 | 500.000000 | 17.000000 | 96.000000 | 60.000000 | 83.195266 |
bob = plt.hist(dataX.Height,bins=10)
sns.boxplot(x='Height', data = dataX)
plt.show()
sns.boxplot(x='Height', data = dataX[(dataX.BMI>=25)&~(dataX.Sex=="F")])
<AxesSubplot:xlabel='Height'>
fred = sns.histplot(dataX.Height)
dataX[(dataX.BMI>=25)&~(dataX.Sex=="F")].head(20)
Sex | Height | Weight | Shoe | TV | Soda | BMI | |
---|---|---|---|---|---|---|---|
3 | M | 72.0 | 210.0 | 11.0 | 11.0 | 14.0 | 28.478009 |
11 | M | 70.0 | 175.0 | 8.0 | 2.0 | 0.0 | 25.107143 |
12 | M | 72.0 | 190.0 | 11.0 | 2.0 | 20.0 | 25.765818 |
13 | M | 71.0 | 240.0 | 11.0 | 6.0 | 12.0 | 33.469550 |
14 | M | 71.0 | 280.0 | 11.0 | 10.0 | 10.0 | 39.047808 |
20 | M | 69.0 | 190.0 | 11.0 | 6.0 | 0.0 | 28.055030 |
21 | M | 70.0 | 175.0 | 12.0 | 20.0 | 48.0 | 25.107143 |
22 | M | 76.0 | 335.0 | 13.0 | 7.0 | 20.0 | 40.773026 |
23 | M | 69.0 | 225.0 | 11.0 | 6.0 | 10.0 | 33.223062 |
30 | M | 72.0 | 310.0 | 13.0 | 26.0 | 20.0 | 42.038966 |
40 | M | 72.0 | 275.0 | 12.0 | 6.0 | 2.0 | 37.292631 |
41 | M | 71.0 | 200.0 | 13.0 | 8.0 | 10.0 | 27.891291 |
43 | M | 71.0 | 235.0 | 12.0 | 50.0 | 10.0 | 32.772267 |
44 | M | 69.0 | 179.0 | 11.0 | 37.0 | 22.0 | 26.430792 |
53 | M | 69.0 | 175.0 | 11.0 | 9.0 | 3.0 | 25.840160 |
62 | M | 66.0 | 170.0 | 11.5 | 7.0 | 20.0 | 27.435721 |
64 | M | 68.0 | 180.0 | 12.5 | 15.0 | 5.0 | 27.365917 |
70 | M | 74.0 | 195.0 | 10.0 | 5.0 | 0.0 | 25.033784 |
72 | M | 74.0 | 252.0 | 12.0 | 4.0 | 15.0 | 32.351351 |
74 | M | 75.0 | 216.0 | 15.0 | 7.0 | 5.0 | 26.995200 |
sns.displot(data=dataX.Height, kind='kde')
<seaborn.axisgrid.FacetGrid at 0x7ff258ad5df0>
sns.displot(data=dataX[dataX.Sex=='M'].Height, kind='kde')
plt.show()
sns.displot(data=dataX[dataX.Sex=='F'].Height, kind='kde')
<seaborn.axisgrid.FacetGrid at 0x7ff271b9c2e0>
plt.figure(figsize=(13,6))
# sns.distplot is deprecated; histplot with a density-scaled histogram plus a
# KDE curve is its axes-level replacement and draws the same kind of picture.
sns.histplot(dataX[dataX.Sex=="M"].Height, kde=True, stat="density", kde_kws=dict(cut=3))
sns.histplot(dataX[dataX.Sex=="F"].Height, kde=True, stat="density", kde_kws=dict(cut=3))
plt.show()
import warnings
warnings.filterwarnings('ignore')
plt.figure(figsize=(13,6))
plt.scatter(dataX.Height,dataX.Weight)
plt.xlabel('Height')
plt.ylabel('Weight')
plt.show()
sns.displot(dataX.BMI)
<seaborn.axisgrid.FacetGrid at 0x7ff278d33670>
# sns.distplot is deprecated; histplot with a density-scaled histogram plus a
# KDE curve is its axes-level replacement and draws the same kind of picture.
sns.histplot(dataX[dataX.Sex=='M'].BMI, kde=True, stat="density", kde_kws=dict(cut=3))
sns.histplot(dataX[dataX.Sex=='F'].BMI, kde=True, stat="density", kde_kws=dict(cut=3))
<AxesSubplot:xlabel='BMI', ylabel='Density'>
plt.figure(figsize=(13,6))
sns.scatterplot(data=dataX, x='Height', y = 'Weight', hue='Sex', alpha =1, x_jitter=5)
<AxesSubplot:xlabel='Height', ylabel='Weight'>
# jointplot is a figure-level function: it always creates its own figure, so a
# preceding plt.figure() only leaves behind an empty figure.  Size the plot
# through jointplot's own `height` argument instead (the grid is square).
sns.jointplot(data=dataX, x='Height', y = 'Weight', hue='Sex', height=10)
<seaborn.axisgrid.JointGrid at 0x7ff2798ecc10>
<Figure size 936x720 with 0 Axes>
sns.pairplot(dataX[dataX.Sex=="M"][['Height','Weight','Shoe','BMI']])
sns.pairplot(dataX[dataX.Sex=="F"][['Height','Weight','Shoe','BMI']])
<seaborn.axisgrid.PairGrid at 0x7ff2696e3cd0>
plt.figure(figsize=(13,8))
plt.scatter(dataX[dataX.Sex=="M"].Height,dataX[dataX.Sex=="M"].Weight, c=dataX[dataX.Sex=='M'].BMI, cmap='plasma', alpha =0.5)
cbar = plt.colorbar()
# BMI category boundaries.  Bins are closed on the left ([18.5, 25) is
# "Normal"), so a BMI of exactly 25 counts as Overweight and exactly 30 as
# Obese — consistent with the `BMI>=25` filters used elsewhere in this notebook.
BMI_bins = [0,18.5,25,30,np.inf]
BMI_names = ['Underweight',"Normal",'Overweight','Obese']
# Categorise the frame loaded in this notebook (dataX); dataY is only built in
# the earlier cleaning section, not after the CSV reload.
dataX['BMI_levels']= pd.cut(dataX.BMI,BMI_bins,labels=BMI_names,right=False)
dataX.head(20)
Sex | Height | Weight | Shoe | TV | Soda | BMI | BMI_levls | BMI_levels | |
---|---|---|---|---|---|---|---|---|---|
0 | M | 71.0 | 162.0 | 10.0 | 35.0 | 7.0 | 22.591946 | Normal | Normal |
1 | M | 67.0 | 140.0 | 11.0 | 7.0 | 28.0 | 21.924705 | Normal | Normal |
2 | M | 75.0 | 177.0 | 12.0 | 3.0 | 3.0 | 22.121067 | Normal | Normal |
3 | M | 72.0 | 210.0 | 11.0 | 11.0 | 14.0 | 28.478009 | Overweight | Overweight |
4 | M | 72.0 | 180.0 | 11.0 | 20.0 | 15.0 | 24.409722 | Normal | Normal |
5 | F | 69.0 | 136.0 | 9.0 | 63.0 | 21.0 | 20.081495 | Normal | Normal |
6 | F | 70.0 | 162.0 | 10.0 | 1.0 | 0.0 | 23.242041 | Normal | Normal |
7 | F | 67.0 | 152.0 | 8.0 | 5.0 | 3.0 | 23.803965 | Normal | Normal |
8 | F | 61.0 | 135.0 | 8.0 | 9.0 | 7.0 | 25.505241 | Overweight | Overweight |
9 | F | 67.0 | 150.0 | 7.0 | 8.0 | 3.0 | 23.490755 | Normal | Normal |
10 | M | 70.0 | 140.0 | 10.0 | 14.0 | 5.0 | 20.085714 | Normal | Normal |
11 | M | 70.0 | 175.0 | 8.0 | 2.0 | 0.0 | 25.107143 | Overweight | Overweight |
12 | M | 72.0 | 190.0 | 11.0 | 2.0 | 20.0 | 25.765818 | Overweight | Overweight |
13 | M | 71.0 | 240.0 | 11.0 | 6.0 | 12.0 | 33.469550 | Obese | Obese |
14 | M | 71.0 | 280.0 | 11.0 | 10.0 | 10.0 | 39.047808 | Obese | Obese |
15 | F | 68.0 | 124.0 | 7.0 | 4.0 | 10.0 | 18.852076 | Normal | Normal |
16 | F | 65.0 | 140.0 | 9.0 | 5.0 | 2.0 | 23.294675 | Normal | Normal |
17 | F | 62.0 | 110.0 | 7.5 | 1.0 | 1.0 | 20.117066 | Normal | Normal |
18 | F | 64.0 | 147.0 | 8.0 | 10.0 | 28.0 | 25.229736 | Overweight | Overweight |
19 | F | 64.0 | 125.0 | 6.5 | 20.0 | 3.0 | 21.453857 | Normal | Normal |
dataX.BMI.max()
83.19526627218934
plt.figure(figsize=(13,6))
sns.scatterplot(x='Height', y='Weight', data=dataX, hue = "BMI_levels")
<AxesSubplot:xlabel='Height', ylabel='Weight'>
levels = BMI_names
import matplotlib.colors as mcolors
import matplotlib.colors as mcolors
def plot_colortable(colors, title, sort_colors=True, emptycols=0):
    """Draw a table of named colour swatches and return the figure.

    colors      -- mapping of colour name -> colour spec understood by
                   matplotlib.colors.to_rgb
    title       -- text placed above the table, left-aligned
    sort_colors -- when exactly True, order the names by
                   (hue, saturation, value, name); otherwise keep the
                   mapping's own iteration order
    emptycols   -- how many of the four layout columns to leave unused
    """
    # Layout constants, in pixels at the figure's 72 dpi.
    CELL_W, CELL_H = 212, 22
    SWATCH_W = 48
    PAD = 12
    TOP_PAD = 40
    DPI = 72

    if sort_colors is True:
        def hsv_then_name(label):
            # Sort key: HSV triple first, the name itself as tie-breaker.
            return tuple(mcolors.rgb_to_hsv(mcolors.to_rgb(colors[label]))), label

        names = sorted(colors, key=hsv_then_name)
    else:
        names = list(colors)

    # Names fill the table column by column; work out how many rows that takes.
    used_cols = 4 - emptycols
    full_rows, leftover = divmod(len(names), used_cols)
    nrows = full_rows + (1 if leftover > 0 else 0)

    # The canvas is always four cells wide, whatever `emptycols` is.
    fig_w = CELL_W * 4 + 2 * PAD
    fig_h = CELL_H * nrows + PAD + TOP_PAD

    fig, ax = plt.subplots(figsize=(fig_w / DPI, fig_h / DPI), dpi=DPI)
    fig.subplots_adjust(
        left=PAD / fig_w,
        bottom=PAD / fig_h,
        right=(fig_w - PAD) / fig_w,
        top=(fig_h - TOP_PAD) / fig_h,
    )

    # Data coordinates are pixels; the y axis is flipped so row 0 is on top.
    ax.set_xlim(0, CELL_W * 4)
    ax.set_ylim(CELL_H * (nrows - 0.5), -CELL_H / 2.)
    ax.yaxis.set_visible(False)
    ax.xaxis.set_visible(False)
    ax.set_axis_off()
    ax.set_title(title, fontsize=24, loc="left", pad=10)

    for position, label in enumerate(names):
        column, row = divmod(position, nrows)
        baseline = row * CELL_H
        left_edge = CELL_W * column
        right_edge = left_edge + SWATCH_W

        # Name to the right of the swatch, then the swatch as a thick line.
        ax.text(right_edge + 7, baseline, label, fontsize=14,
                horizontalalignment='left',
                verticalalignment='center')
        ax.hlines(baseline, left_edge, right_edge,
                  color=colors[label], linewidth=18)

    return fig
plot_colortable(mcolors.BASE_COLORS, "Base Colors",
sort_colors=False, emptycols=1)
plot_colortable(mcolors.TABLEAU_COLORS, "Tableau Palette",
sort_colors=False, emptycols=2)
#sphinx_gallery_thumbnail_number = 3
plot_colortable(mcolors.CSS4_COLORS, "CSS Colors")
# Optionally plot the XKCD colors (Caution: will produce large figure)
#xkcd_fig = plot_colortable(mcolors.XKCD_COLORS, "XKCD Colors")
#xkcd_fig.savefig("XKCD_Colors.png")
plt.show()
by_hsv = sorted((tuple(mcolors.rgb_to_hsv(mcolors.to_rgb(color))),name) for name, color in mcolors.CSS4_COLORS.items())
names = [name for hsv, name in by_hsv]
names
['black', 'dimgray', 'dimgrey', 'gray', 'grey', 'darkgray', 'darkgrey', 'silver', 'lightgray', 'lightgrey', 'gainsboro', 'whitesmoke', 'white', 'snow', 'rosybrown', 'lightcoral', 'indianred', 'brown', 'firebrick', 'maroon', 'darkred', 'red', 'mistyrose', 'salmon', 'tomato', 'darksalmon', 'coral', 'orangered', 'lightsalmon', 'sienna', 'seashell', 'chocolate', 'saddlebrown', 'sandybrown', 'peachpuff', 'peru', 'linen', 'bisque', 'darkorange', 'burlywood', 'antiquewhite', 'tan', 'navajowhite', 'blanchedalmond', 'papayawhip', 'moccasin', 'orange', 'wheat', 'oldlace', 'floralwhite', 'darkgoldenrod', 'goldenrod', 'cornsilk', 'gold', 'lemonchiffon', 'khaki', 'palegoldenrod', 'darkkhaki', 'ivory', 'beige', 'lightyellow', 'lightgoldenrodyellow', 'olive', 'yellow', 'olivedrab', 'yellowgreen', 'darkolivegreen', 'greenyellow', 'chartreuse', 'lawngreen', 'honeydew', 'darkseagreen', 'palegreen', 'lightgreen', 'forestgreen', 'limegreen', 'darkgreen', 'green', 'lime', 'seagreen', 'mediumseagreen', 'springgreen', 'mintcream', 'mediumspringgreen', 'mediumaquamarine', 'aquamarine', 'turquoise', 'lightseagreen', 'mediumturquoise', 'azure', 'lightcyan', 'paleturquoise', 'darkslategray', 'darkslategrey', 'teal', 'darkcyan', 'aqua', 'cyan', 'darkturquoise', 'cadetblue', 'powderblue', 'lightblue', 'deepskyblue', 'skyblue', 'lightskyblue', 'steelblue', 'aliceblue', 'dodgerblue', 'lightslategray', 'lightslategrey', 'slategray', 'slategrey', 'lightsteelblue', 'cornflowerblue', 'royalblue', 'ghostwhite', 'lavender', 'midnightblue', 'navy', 'darkblue', 'mediumblue', 'blue', 'slateblue', 'darkslateblue', 'mediumslateblue', 'mediumpurple', 'rebeccapurple', 'blueviolet', 'indigo', 'darkorchid', 'darkviolet', 'mediumorchid', 'thistle', 'plum', 'violet', 'purple', 'darkmagenta', 'fuchsia', 'magenta', 'orchid', 'mediumvioletred', 'deeppink', 'hotpink', 'lavenderblush', 'palevioletred', 'crimson', 'pink', 'lightpink']
names[::6]
['black', 'darkgrey', 'white', 'firebrick', 'tomato', 'seashell', 'linen', 'navajowhite', 'oldlace', 'lemonchiffon', 'lightyellow', 'darkolivegreen', 'palegreen', 'lime', 'mediumaquamarine', 'lightcyan', 'aqua', 'deepskyblue', 'lightslategray', 'royalblue', 'mediumblue', 'rebeccapurple', 'thistle', 'magenta', 'palevioletred']
# Pair each BMI level with every 8th sorted CSS colour name, starting at
# index 10.  zip() walks both sequences in step, so no levels.index() search
# is needed on each iteration.
new_names = names[10::8]
Color_map = {}
for level, colour in zip(levels, new_names):
    print(level, colour)
    Color_map[level] = colour
Underweight gainsboro Normal firebrick Overweight coral Obese peachpuff
Color_map["Underweight"]
'gainsboro'
# One Height-vs-Weight scatter per sex (M first, then F), with each point
# coloured by the Color_map entry for its BMI level.
for sex in ("M", "F"):
    subset = dataX[dataX.Sex == sex]
    point_colours = subset.BMI_levels.apply(lambda level: Color_map[level])
    plt.figure(figsize=(13, 5))
    plt.scatter(subset.Height, subset.Weight, c=point_colours, alpha=0.5)
    plt.show()
plt.figure(figsize=(13,5))
sns.scatterplot(y='Shoe',x='BMI',data=dataX[dataX.BMI<=50], hue = 'Sex')
<AxesSubplot:xlabel='BMI', ylabel='Shoe'>
# statsmodels' OLS does not add an intercept on its own.  Without one the fit
# is forced through the origin and the summary reports the uncentered
# R-squared; add_constant supplies the intercept column.
model = sm.OLS(dataX.BMI, sm.add_constant(dataX.Shoe)).fit()
model.summary()
Dep. Variable: | BMI | R-squared (uncentered): | 0.929 |
---|---|---|---|
Model: | OLS | Adj. R-squared (uncentered): | 0.929 |
Method: | Least Squares | F-statistic: | 2.158e+04 |
Date: | Wed, 27 Jan 2021 | Prob (F-statistic): | 0.00 |
Time: | 13:47:46 | Log-Likelihood: | -5517.9 |
No. Observations: | 1650 | AIC: | 1.104e+04 |
Df Residuals: | 1649 | BIC: | 1.104e+04 |
Df Model: | 1 | ||
Covariance Type: | nonrobust |
coef | std err | t | P>|t| | [0.025 | 0.975] | |
---|---|---|---|---|---|---|
Shoe | 2.5379 | 0.017 | 146.893 | 0.000 | 2.504 | 2.572 |
Omnibus: | 347.780 | Durbin-Watson: | 1.399 |
---|---|---|---|
Prob(Omnibus): | 0.000 | Jarque-Bera (JB): | 2262.980 |
Skew: | 0.820 | Prob(JB): | 0.00 |
Kurtosis: | 8.498 | Cond. No. | 1.00 |
import pandas as pd # Manipulating dataframes, boolean logic
import numpy as np # numerical play stuff
import matplotlib.pyplot as plt # plotting functions
import seaborn as sns # prettier plotting
import statsmodels.api as sm
#code to make stuff appear
%matplotlib inline
%config InlineBackend.figure_format='retina' #Sharp graphs, higher resolution
dataX = pd.read_csv('data/MAT110Survey/110Survey_fixed.csv', index_col=0)
--------------------------------------------------------------------------- ValueError Traceback (most recent call last) <ipython-input-6-24efddeb8487> in <module> ----> 1 sns.heatmap(data=dataX[dataX.Sex=="M"]) /opt/anaconda3/lib/python3.8/site-packages/seaborn/_decorators.py in inner_f(*args, **kwargs) 44 ) 45 kwargs.update({k: arg for k, arg in zip(sig.parameters, args)}) ---> 46 return f(**kwargs) 47 return inner_f 48 /opt/anaconda3/lib/python3.8/site-packages/seaborn/matrix.py in heatmap(data, vmin, vmax, cmap, center, robust, annot, fmt, annot_kws, linewidths, linecolor, cbar, cbar_kws, cbar_ax, square, xticklabels, yticklabels, mask, ax, **kwargs) 543 """ 544 # Initialize the plotter object --> 545 plotter = _HeatMapper(data, vmin, vmax, cmap, center, robust, annot, fmt, 546 annot_kws, cbar, cbar_kws, xticklabels, 547 yticklabels, mask) /opt/anaconda3/lib/python3.8/site-packages/seaborn/matrix.py in __init__(self, data, vmin, vmax, cmap, center, robust, annot, fmt, annot_kws, cbar, cbar_kws, xticklabels, yticklabels, mask) 163 164 # Determine good default values for the colormapping --> 165 self._determine_cmap_params(plot_data, vmin, vmax, 166 cmap, center, robust) 167 /opt/anaconda3/lib/python3.8/site-packages/seaborn/matrix.py in _determine_cmap_params(self, plot_data, vmin, vmax, cmap, center, robust) 197 198 # plot_data is a np.ma.array instance --> 199 calc_data = plot_data.astype(float).filled(np.nan) 200 if vmin is None: 201 if robust: ValueError: could not convert string to float: 'M'
np.random.random()
0.47147260589615314
np.random.random(3000000)
array([0.63108745, 0.03054646, 0.94150723, ..., 0.42674499, 0.74178852, 0.61746057])