import pandas as pd # Manipulating dataframes, boolean logic
import numpy as np # numerical play stuff
import matplotlib.pyplot as plt # plotting functions
import seaborn as sns # prettier plotting
#code to make stuff appear
%matplotlib inline
%config InlineBackend.figure_format='retina' #Sharp graphs, higher resolution
dataX = pd.read_csv('data/MAT110Survey/110Statdata.csv')
dataX.head(10)
Q | M | M .1 | M .2 | M .3 | M .4 | F | F.1 | F.2 | F.3 | F.4 | |
---|---|---|---|---|---|---|---|---|---|---|---|
0 | 1 | 71.0 | 67.0 | 75 | 72.0 | 72.0 | 69.0 | 70.0 | 67 | 61 | 67 |
1 | 2 | 162.0 | 140.0 | 177 | 210.0 | 180.0 | 136.0 | 162.0 | 152 | 135 | 150 |
2 | 3 | 10.0 | 11.0 | 12 | 11.0 | 11.0 | 9.0 | 10.0 | 8 | 8 | 7 |
3 | 4 | 35.0 | 7.0 | 3 | 11.0 | 20.0 | 63.0 | 1.0 | 5 | 9 | 8 |
4 | 5 | 7.0 | 28.0 | 3 | 14.0 | 15.0 | 21.0 | 0.0 | 3 | 7 | 3 |
5 | 1 | 70.0 | 70.0 | 72 | 71.0 | 71.0 | 68.0 | 65.0 | 62 | 64 | 64 |
6 | 2 | 140.0 | 175.0 | 190 | 240.0 | 280.0 | 124.0 | 140.0 | 110 | 147 | 125 |
7 | 3 | 10.0 | 8.0 | 11 | 11.0 | 11.0 | 7.0 | 9.0 | 7.5 | 8 | 6.5 |
8 | 4 | 14.0 | 2.0 | 2 | 6.0 | 10.0 | 4.0 | 5.0 | 1 | 10 | 20 |
9 | 5 | 5.0 | 0.0 | 20 | 12.0 | 10.0 | 10.0 | 2.0 | 1 | 28 | 3 |
dataX.info()
<class 'pandas.core.frame.DataFrame'> RangeIndex: 825 entries, 0 to 824 Data columns (total 11 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 Q 825 non-null int64 1 M 825 non-null float64 2 M .1 825 non-null float64 3 M .2 825 non-null object 4 M .3 825 non-null float64 5 M .4 825 non-null float64 6 F 825 non-null float64 7 F.1 825 non-null float64 8 F.2 825 non-null object 9 F.3 825 non-null object 10 F.4 824 non-null object dtypes: float64(6), int64(1), object(4) memory usage: 71.0+ KB
Horrid
non-useable format, so first job is to recreate this.
columns be the questions and also the sex, the rows could be individuals
len(dataX)
825
dataX.transpose()
0 | 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | ... | 815 | 816 | 817 | 818 | 819 | 820 | 821 | 822 | 823 | 824 | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
Q | 1 | 2 | 3 | 4 | 5 | 1 | 2 | 3 | 4 | 5 | ... | 1 | 2 | 3 | 4 | 5 | 1 | 2 | 3 | 4 | 5 |
M | 71 | 162 | 10 | 35 | 7 | 70 | 140 | 10 | 14 | 5 | ... | 72 | 172 | 12 | 5 | 0 | 71 | 149 | 12 | 15 | 5 |
M .1 | 67 | 140 | 11 | 7 | 28 | 70 | 175 | 8 | 2 | 0 | ... | 69 | 180 | 12.5 | 3 | 3 | 70 | 164 | 11.5 | 30 | 11 |
M .2 | 75 | 177 | 12 | 3 | 3 | 72 | 190 | 11 | 2 | 20 | ... | 74 | 176 | 10 | 7 | 5 | 71 | 127 | 10 | 25 | 0 |
M .3 | 72 | 210 | 11 | 11 | 14 | 71 | 240 | 11 | 6 | 12 | ... | 67 | 192 | 11 | 8 | 4 | 72 | 213 | 12 | 0 | 0 |
M .4 | 72 | 180 | 11 | 20 | 15 | 71 | 280 | 11 | 10 | 10 | ... | 68 | 201 | 13 | 6 | 6 | 68 | 180 | 11 | 30 | 20 |
F | 69 | 136 | 9 | 63 | 21 | 68 | 124 | 7 | 4 | 10 | ... | 62 | 154 | 8.5 | 6 | 0 | 71 | 148 | 6 | 15 | 1 |
F.1 | 70 | 162 | 10 | 1 | 0 | 65 | 140 | 9 | 5 | 2 | ... | 64 | 132 | 9 | 8 | 1 | 65 | 110 | 4 | 5 | 0 |
F.2 | 67 | 152 | 8 | 5 | 3 | 62 | 110 | 7.5 | 1 | 1 | ... | 70 | 128 | 9 | 3 | 2 | 64 | 116 | 5 | 5 | 0 |
F.3 | 61 | 135 | 8 | 9 | 7 | 64 | 147 | 8 | 10 | 28 | ... | 63 | 167 | 7.5 | 2 | 5 | 70 | 134 | 8 | 15 | 0 |
F.4 | 67 | 150 | 7 | 8 | 3 | 64 | 125 | 6.5 | 20 | 3 | ... | 62 | 133 | 7 | 5 | 0 | 73 | 185 | 7 | 30 | 5 |
11 rows × 825 columns
dataX[0:5].transpose().reset_index()
index | 0 | 1 | 2 | 3 | 4 | |
---|---|---|---|---|---|---|
0 | Q | 1 | 2 | 3 | 4 | 5 |
1 | M | 71 | 162 | 10 | 35 | 7 |
2 | M .1 | 67 | 140 | 11 | 7 | 28 |
3 | M .2 | 75 | 177 | 12 | 3 | 3 |
4 | M .3 | 72 | 210 | 11 | 11 | 14 |
5 | M .4 | 72 | 180 | 11 | 20 | 15 |
6 | F | 69 | 136 | 9 | 63 | 21 |
7 | F.1 | 70 | 162 | 10 | 1 | 0 |
8 | F.2 | 67 | 152 | 8 | 5 | 3 |
9 | F.3 | 61 | 135 | 8 | 9 | 7 |
10 | F.4 | 67 | 150 | 7 | 8 | 3 |
dataX[5:10].transpose().reset_index()
index | 5 | 6 | 7 | 8 | 9 | |
---|---|---|---|---|---|---|
0 | Q | 1 | 2 | 3 | 4 | 5 |
1 | M | 70 | 140 | 10 | 14 | 5 |
2 | M .1 | 70 | 175 | 8 | 2 | 0 |
3 | M .2 | 72 | 190 | 11 | 2 | 20 |
4 | M .3 | 71 | 240 | 11 | 6 | 12 |
5 | M .4 | 71 | 280 | 11 | 10 | 10 |
6 | F | 68 | 124 | 7 | 4 | 10 |
7 | F.1 | 65 | 140 | 9 | 5 | 2 |
8 | F.2 | 62 | 110 | 7.5 | 1 | 1 |
9 | F.3 | 64 | 147 | 8 | 10 | 28 |
10 | F.4 | 64 | 125 | 6.5 | 20 | 3 |
np.concatenate((dataX[0:5].transpose().reset_index()[1:11].values,dataX[5:10].transpose().reset_index()[1:11].values))
array([['M ', 71.0, 162.0, 10.0, 35.0, 7.0], ['M .1', 67.0, 140.0, 11.0, 7.0, 28.0], ['M .2', '75', '177', '12', '3', '3'], ['M .3', 72.0, 210.0, 11.0, 11.0, 14.0], ['M .4', 72.0, 180.0, 11.0, 20.0, 15.0], ['F', 69.0, 136.0, 9.0, 63.0, 21.0], ['F.1', 70.0, 162.0, 10.0, 1.0, 0.0], ['F.2', '67', '152', '8', '5', '3'], ['F.3', '61', '135', '8', '9', '7'], ['F.4', '67', '150', '7', '8', '3'], ['M ', 70.0, 140.0, 10.0, 14.0, 5.0], ['M .1', 70.0, 175.0, 8.0, 2.0, 0.0], ['M .2', '72', '190', '11', '2', '20'], ['M .3', 71.0, 240.0, 11.0, 6.0, 12.0], ['M .4', 71.0, 280.0, 11.0, 10.0, 10.0], ['F', 68.0, 124.0, 7.0, 4.0, 10.0], ['F.1', 65.0, 140.0, 9.0, 5.0, 2.0], ['F.2', '62', '110', '7.5', '1', '1'], ['F.3', '64', '147', '8', '10', '28'], ['F.4', '64', '125', '6.5', '20', '3']], dtype=object)
len(dataX)//5
165
New_set=[['Sex','Height','Weight','Shoe','tv','Soda']]
for c in range(len(dataX)//5):
New_set=np.concatenate((New_set,(dataX[0+5*c:5+5*c].transpose().reset_index()[1:11].values)))
dataY=pd.DataFrame(New_set)
dataY.columns=dataY.iloc[0]
dataY.head()
Sex | Height | Weight | Shoe | tv | Soda | |
---|---|---|---|---|---|---|
0 | Sex | Height | Weight | Shoe | tv | Soda |
1 | M | 71 | 162 | 10 | 35 | 7 |
2 | M .1 | 67 | 140 | 11 | 7 | 28 |
3 | M .2 | 75 | 177 | 12 | 3 | 3 |
4 | M .3 | 72 | 210 | 11 | 11 | 14 |
dataY.iloc[3]["tv"]
'3'
dataY.tv[11]
14.0
dataY=dataY[1:]
dataY.Sex=dataY.Sex.str[0]
dataY.tail()
Sex | Height | Weight | Shoe | tv | Soda | |
---|---|---|---|---|---|---|
1646 | F | 71 | 148 | 6 | 15 | 1 |
1647 | F | 65 | 110 | 4 | 5 | 0 |
1648 | F | 64 | 116 | 5 | 5 | 0 |
1649 | F | 70 | 134 | 8 | 15 | 0 |
1650 | F | 73 | 185 | 7 | 30 | 5 |
dataY=dataY.reset_index(drop=True)
dataY.head()
Sex | Height | Weight | Shoe | tv | Soda | |
---|---|---|---|---|---|---|
0 | M | 71 | 162 | 10 | 35 | 7 |
1 | M | 67 | 140 | 11 | 7 | 28 |
2 | M | 75 | 177 | 12 | 3 | 3 |
3 | M | 72 | 210 | 11 | 11 | 14 |
4 | M | 72 | 180 | 11 | 20 | 15 |
dataY.tail()
Sex | Height | Weight | Shoe | tv | Soda | |
---|---|---|---|---|---|---|
1645 | F | 71 | 148 | 6 | 15 | 1 |
1646 | F | 65 | 110 | 4 | 5 | 0 |
1647 | F | 64 | 116 | 5 | 5 | 0 |
1648 | F | 70 | 134 | 8 | 15 | 0 |
1649 | F | 73 | 185 | 7 | 30 | 5 |
dataY.info()
<class 'pandas.core.frame.DataFrame'> RangeIndex: 1650 entries, 0 to 1649 Data columns (total 6 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 Sex 1650 non-null object 1 Height 1650 non-null object 2 Weight 1650 non-null object 3 Shoe 1650 non-null object 4 tv 1649 non-null object 5 Soda 1650 non-null object dtypes: object(6) memory usage: 77.5+ KB
dataY.Height.str.isnumeric()
0 NaN 1 NaN 2 True 3 NaN 4 NaN ... 1645 NaN 1646 NaN 1647 True 1648 True 1649 True Name: Height, Length: 1650, dtype: object
dataY.Height = dataY.Height.astype(float)
--------------------------------------------------------------------------- ValueError Traceback (most recent call last) <ipython-input-25-1fe343f08823> in <module> ----> 1 dataY.Height = dataY.Height.astype(float) /opt/anaconda3/lib/python3.8/site-packages/pandas/core/generic.py in astype(self, dtype, copy, errors) 5544 else: 5545 # else, only a single dtype is given -> 5546 new_data = self._mgr.astype(dtype=dtype, copy=copy, errors=errors,) 5547 return self._constructor(new_data).__finalize__(self, method="astype") 5548 /opt/anaconda3/lib/python3.8/site-packages/pandas/core/internals/managers.py in astype(self, dtype, copy, errors) 593 self, dtype, copy: bool = False, errors: str = "raise" 594 ) -> "BlockManager": --> 595 return self.apply("astype", dtype=dtype, copy=copy, errors=errors) 596 597 def convert( /opt/anaconda3/lib/python3.8/site-packages/pandas/core/internals/managers.py in apply(self, f, align_keys, **kwargs) 404 applied = b.apply(f, **kwargs) 405 else: --> 406 applied = getattr(b, f)(**kwargs) 407 result_blocks = _extend_blocks(applied, result_blocks) 408 /opt/anaconda3/lib/python3.8/site-packages/pandas/core/internals/blocks.py in astype(self, dtype, copy, errors) 593 vals1d = values.ravel() 594 try: --> 595 values = astype_nansafe(vals1d, dtype, copy=True) 596 except (ValueError, TypeError): 597 # e.g. astype_nansafe can fail on object-dtype of strings /opt/anaconda3/lib/python3.8/site-packages/pandas/core/dtypes/cast.py in astype_nansafe(arr, dtype, copy, skipna) 993 if copy or is_object_dtype(arr) or is_object_dtype(dtype): 994 # Explicit copy, or required since NumPy can't view from / to object. --> 995 return arr.astype(dtype, copy=True) 996 997 return arr.view(dtype) ValueError: could not convert string to float: '70 3/4'
dataY[dataY.Height=="70 3/4"]
Sex | Height | Weight | Shoe | tv | Soda | |
---|---|---|---|---|---|---|
1437 | F | 70 3/4 | 170 | 10 1/2 | 8 | 5 |
dataY[1427:1447]
Sex | Height | Weight | Shoe | tv | Soda | BMI | |
---|---|---|---|---|---|---|---|
1427 | F | 74.00 | 180.0 | 11 | 6 | 2 | 23.108108 |
1428 | F | 68.00 | 140.0 | 9 | 7 | 1 | 21.284602 |
1429 | F | 65.00 | 203.0 | 7.5 | 14 | 4 | 33.777278 |
1430 | M | 68.00 | 139.0 | 11 | 0 | 0 | 21.132569 |
1431 | M | 72.00 | 155.0 | 12 | 1 | 1 | 21.019483 |
1432 | M | 70.00 | 130.0 | 8 1/2 | 10 | 5 | 18.651020 |
1433 | M | 72.00 | 220.0 | 15 | 96 | 0 | 29.834105 |
1434 | M | 73.00 | 220.0 | 13 | 4 | 10 | 29.022331 |
1435 | F | 61.00 | 125.0 | 8 | 10 | 0 | 23.615963 |
1436 | F | 60.00 | 120.0 | 6 | 3 | 3 | 23.433333 |
1437 | F | 70.75 | 170.0 | 10 1/2 | 8 | 5 | 23.875439 |
1438 | F | 62.00 | 120.0 | 5.5 | 5 | 0 | 21.945890 |
1439 | F | 63.00 | 132.0 | 7 | 5 | 6 | 23.380197 |
1440 | M | 72.00 | 250.0 | 11 | 35 | 2 | 33.902392 |
1441 | M | 76.00 | 300.0 | 12 | 50 | 14 | 36.513158 |
1442 | M | 73.00 | 200.0 | 11 | 0 | 0 | 26.383937 |
1443 | M | 78.00 | 196.0 | 10 | 5 | 1 | 22.647600 |
1444 | M | 72.00 | 208.0 | 13 | 20 | 6 | 28.206790 |
1445 | F | 65.00 | 200.0 | 9 | 35 | 25 | 33.278107 |
1446 | F | 62.00 | 165.0 | 10 | 2 | 4 | 30.175598 |
dataY.Height[1437]=70.75
dataY.Shoe[1437]=10.5
<ipython-input-45-26a6fd170f63>:1: SettingWithCopyWarning: A value is trying to be set on a copy of a slice from a DataFrame See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy dataY.Shoe[1437]=10.5
dataY.Height = dataY.Height.astype(float)
dataY.Weight = dataY.Weight.astype(float)
dataY.Shoe = dataY.Shoe.astype(float)
dataY.tv = dataY.tv.astype(float)
dataY.Soda = dataY.Soda.astype(float)
dataY[dataY.Soda==" "]
Sex | Height | Weight | Shoe | tv | Soda | BMI |
---|
dataY.Soda[1279]=0
<ipython-input-48-406c15af3e6b>:1: SettingWithCopyWarning: A value is trying to be set on a copy of a slice from a DataFrame See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy dataY.Soda[1279]=0
dataY =pd.DataFrame(dataY)
dataY.Height = dataY.Height.astype(float)
dataY.Weight = dataY.Weight.astype(float)
dataY.Shoe = dataY.Shoe.astype(float)
dataY.tv = dataY.tv.astype(float)
dataY.Soda = dataY.Soda.astype(float)
dataY.tail()
Sex | Height | Weight | Shoe | tv | Soda | BMI | |
---|---|---|---|---|---|---|---|
1645 | F | 71.0 | 148.0 | 6.0 | 15.0 | 1.0 | 20.639556 |
1646 | F | 65.0 | 110.0 | 4.0 | 5.0 | 0.0 | 18.302959 |
1647 | F | 64.0 | 116.0 | 5.0 | 5.0 | 0.0 | 19.909180 |
1648 | F | 70.0 | 134.0 | 8.0 | 15.0 | 0.0 | 19.224898 |
1649 | F | 73.0 | 185.0 | 7.0 | 30.0 | 5.0 | 24.405142 |
len(dataY)*5
8250
plt.hist(dataY.Soda)
(array([927., 386., 167., 92., 40., 21., 6., 1., 7., 3.]), array([ 0., 6., 12., 18., 24., 30., 36., 42., 48., 54., 60.]), <BarContainer object of 10 artists>)
dataY['BMI'] = dataY.Weight/(dataY.Height)**2*703
plt.hist(dataY.BMI)
(array([124., 966., 431., 103., 21., 4., 0., 0., 0., 1.]), array([11.9005848 , 19.03005294, 26.15952109, 33.28898924, 40.41845739, 47.54792553, 54.67739368, 61.80686183, 68.93632998, 76.06579812, 83.19526627]), <BarContainer object of 10 artists>)
dataY.to_csv('data/MAT110Survey/110Survey_fixed.csv')
dataY.to_json('data/MAT110Survey/110Survey_fixed.json')