import pandas as pd  # Manipulating dataframes, boolean logic
import numpy as np  # numerical play stuff
import matplotlib.pyplot as plt  # plotting functions
import seaborn as sns  # prettier plotting

#code to make stuff appear
%matplotlib inline 
%config InlineBackend.figure_format='retina' #Sharp graphs, higher resolution


dataX = pd.read_csv('data/MAT110Survey/110Statdata.csv')


dataX.head(10)


dataX.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 825 entries, 0 to 824
Data columns (total 11 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   Q       825 non-null    int64  
 1   M       825 non-null    float64
 2   M .1    825 non-null    float64
 3   M .2    825 non-null    object 
 4   M .3    825 non-null    float64
 5   M .4    825 non-null    float64
 6   F       825 non-null    float64
 7   F.1     825 non-null    float64
 8   F.2     825 non-null    object 
 9   F.3     825 non-null    object 
 10  F.4     824 non-null    object 
dtypes: float64(6), int64(1), object(4)
memory usage: 71.0+ KB


len(dataX)

825


dataX.transpose()


dataX[0:5].transpose().reset_index()


dataX[5:10].transpose().reset_index()


np.concatenate((dataX[0:5].transpose().reset_index()[1:11].values,dataX[5:10].transpose().reset_index()[1:11].values))

array([['M ', 71.0, 162.0, 10.0, 35.0, 7.0],
       ['M .1', 67.0, 140.0, 11.0, 7.0, 28.0],
       ['M .2', '75', '177', '12', '3', '3'],
       ['M .3', 72.0, 210.0, 11.0, 11.0, 14.0],
       ['M .4', 72.0, 180.0, 11.0, 20.0, 15.0],
       ['F', 69.0, 136.0, 9.0, 63.0, 21.0],
       ['F.1', 70.0, 162.0, 10.0, 1.0, 0.0],
       ['F.2', '67', '152', '8', '5', '3'],
       ['F.3', '61', '135', '8', '9', '7'],
       ['F.4', '67', '150', '7', '8', '3'],
       ['M ', 70.0, 140.0, 10.0, 14.0, 5.0],
       ['M .1', 70.0, 175.0, 8.0, 2.0, 0.0],
       ['M .2', '72', '190', '11', '2', '20'],
       ['M .3', 71.0, 240.0, 11.0, 6.0, 12.0],
       ['M .4', 71.0, 280.0, 11.0, 10.0, 10.0],
       ['F', 68.0, 124.0, 7.0, 4.0, 10.0],
       ['F.1', 65.0, 140.0, 9.0, 5.0, 2.0],
       ['F.2', '62', '110', '7.5', '1', '1'],
       ['F.3', '64', '147', '8', '10', '28'],
       ['F.4', '64', '125', '6.5', '20', '3']], dtype=object)


len(dataX)//5

165


New_set=[['Sex','Height','Weight','Shoe','tv','Soda']]
for c in range(len(dataX)//5):
    New_set=np.concatenate((New_set,(dataX[0+5*c:5+5*c].transpose().reset_index()[1:11].values)))


dataY=pd.DataFrame(New_set)


dataY.columns=dataY.iloc[0]


dataY.head()


dataY.iloc[3]["tv"]

'3'


dataY.tv[11]

14.0


dataY=dataY[1:]


dataY.Sex=dataY.Sex.str[0]


dataY.tail()


dataY=dataY.reset_index(drop=True)


dataY.head()


dataY.tail()


dataY.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1650 entries, 0 to 1649
Data columns (total 6 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   Sex     1650 non-null   object
 1   Height  1650 non-null   object
 2   Weight  1650 non-null   object
 3   Shoe    1650 non-null   object
 4   tv      1649 non-null   object
 5   Soda    1650 non-null   object
dtypes: object(6)
memory usage: 77.5+ KB


dataY.Height.str.isnumeric()

0        NaN
1        NaN
2       True
3        NaN
4        NaN
        ... 
1645     NaN
1646     NaN
1647    True
1648    True
1649    True
Name: Height, Length: 1650, dtype: object


dataY.Height = dataY.Height.astype(float)

---------------------------------------------------------------------------
ValueError                                Traceback (most recent call last)
<ipython-input-25-1fe343f08823> in <module>
----> 1 dataY.Height = dataY.Height.astype(float)

/opt/anaconda3/lib/python3.8/site-packages/pandas/core/generic.py in astype(self, dtype, copy, errors)
   5544         else:
   5545             # else, only a single dtype is given
-> 5546             new_data = self._mgr.astype(dtype=dtype, copy=copy, errors=errors,)
   5547             return self._constructor(new_data).__finalize__(self, method="astype")
   5548 

/opt/anaconda3/lib/python3.8/site-packages/pandas/core/internals/managers.py in astype(self, dtype, copy, errors)
    593         self, dtype, copy: bool = False, errors: str = "raise"
    594     ) -> "BlockManager":
--> 595         return self.apply("astype", dtype=dtype, copy=copy, errors=errors)
    596 
    597     def convert(

/opt/anaconda3/lib/python3.8/site-packages/pandas/core/internals/managers.py in apply(self, f, align_keys, **kwargs)
    404                 applied = b.apply(f, **kwargs)
    405             else:
--> 406                 applied = getattr(b, f)(**kwargs)
    407             result_blocks = _extend_blocks(applied, result_blocks)
    408 

/opt/anaconda3/lib/python3.8/site-packages/pandas/core/internals/blocks.py in astype(self, dtype, copy, errors)
    593             vals1d = values.ravel()
    594             try:
--> 595                 values = astype_nansafe(vals1d, dtype, copy=True)
    596             except (ValueError, TypeError):
    597                 # e.g. astype_nansafe can fail on object-dtype of strings

/opt/anaconda3/lib/python3.8/site-packages/pandas/core/dtypes/cast.py in astype_nansafe(arr, dtype, copy, skipna)
    993     if copy or is_object_dtype(arr) or is_object_dtype(dtype):
    994         # Explicit copy, or required since NumPy can't view from / to object.
--> 995         return arr.astype(dtype, copy=True)
    996 
    997     return arr.view(dtype)

ValueError: could not convert string to float: '70 3/4'


dataY[dataY.Height=="70 3/4"]


dataY[1427:1447]


dataY.Height[1437]=70.75


dataY.Shoe[1437]=10.5

<ipython-input-45-26a6fd170f63>:1: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  dataY.Shoe[1437]=10.5


dataY.Height = dataY.Height.astype(float)
dataY.Weight = dataY.Weight.astype(float)
dataY.Shoe = dataY.Shoe.astype(float)
dataY.tv = dataY.tv.astype(float)
dataY.Soda = dataY.Soda.astype(float)


dataY[dataY.Soda==" "]


dataY.Soda[1279]=0

<ipython-input-48-406c15af3e6b>:1: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  dataY.Soda[1279]=0


dataY =pd.DataFrame(dataY)


dataY.Height = dataY.Height.astype(float)
dataY.Weight = dataY.Weight.astype(float)
dataY.Shoe = dataY.Shoe.astype(float)
dataY.tv = dataY.tv.astype(float)
dataY.Soda = dataY.Soda.astype(float)


dataY.tail()


len(dataY)*5

8250


plt.hist(dataY.Soda)

(array([927., 386., 167.,  92.,  40.,  21.,   6.,   1.,   7.,   3.]),
 array([ 0.,  6., 12., 18., 24., 30., 36., 42., 48., 54., 60.]),
 <BarContainer object of 10 artists>)


dataY['BMI'] = dataY.Weight/(dataY.Height)**2*703


plt.hist(dataY.BMI)

(array([124., 966., 431., 103.,  21.,   4.,   0.,   0.,   0.,   1.]),
 array([11.9005848 , 19.03005294, 26.15952109, 33.28898924, 40.41845739,
        47.54792553, 54.67739368, 61.80686183, 68.93632998, 76.06579812,
        83.19526627]),
 <BarContainer object of 10 artists>)


dataY.to_csv('data/MAT110Survey/110Survey_fixed.csv')


dataY.to_json('data/MAT110Survey/110Survey_fixed.json')

	Sex	Height	Weight	Shoe	tv	Soda	BMI
1427	F	74.00	180.0	11	6	2	23.108108
1428	F	68.00	140.0	9	7	1	21.284602
1429	F	65.00	203.0	7.5	14	4	33.777278
1430	M	68.00	139.0	11	0	0	21.132569
1431	M	72.00	155.0	12	1	1	21.019483
1432	M	70.00	130.0	8 1/2	10	5	18.651020
1433	M	72.00	220.0	15	96	0	29.834105
1434	M	73.00	220.0	13	4	10	29.022331
1435	F	61.00	125.0	8	10	0	23.615963
1436	F	60.00	120.0	6	3	3	23.433333
1437	F	70.75	170.0	10 1/2	8	5	23.875439
1438	F	62.00	120.0	5.5	5	0	21.945890
1439	F	63.00	132.0	7	5	6	23.380197
1440	M	72.00	250.0	11	35	2	33.902392
1441	M	76.00	300.0	12	50	14	36.513158
1442	M	73.00	200.0	11	0	0	26.383937
1443	M	78.00	196.0	10	5	1	22.647600
1444	M	72.00	208.0	13	20	6	28.206790
1445	F	65.00	200.0	9	35	25	33.278107
1446	F	62.00	165.0	10	2	4	30.175598

MAT411 Week 2¶

Lets look at real data¶

	Q	M	M .1	M .2	M .3	M .4	F	F.1	F.2	F.3	F.4
0	1	71.0	67.0	75	72.0	72.0	69.0	70.0	67	61	67
1	2	162.0	140.0	177	210.0	180.0	136.0	162.0	152	135	150
2	3	10.0	11.0	12	11.0	11.0	9.0	10.0	8	8	7
3	4	35.0	7.0	3	11.0	20.0	63.0	1.0	5	9	8
4	5	7.0	28.0	3	14.0	15.0	21.0	0.0	3	7	3
5	1	70.0	70.0	72	71.0	71.0	68.0	65.0	62	64	64
6	2	140.0	175.0	190	240.0	280.0	124.0	140.0	110	147	125
7	3	10.0	8.0	11	11.0	11.0	7.0	9.0	7.5	8	6.5
8	4	14.0	2.0	2	6.0	10.0	4.0	5.0	1	10	20
9	5	5.0	0.0	20	12.0	10.0	10.0	2.0	1	28	3

	0	1	2	3	4	5	6	7	8	9	...	815	816	817	818	819	820	821	822	823	824
Q	1	2	3	4	5	1	2	3	4	5	...	1	2	3	4	5	1	2	3	4	5
M	71	162	10	35	7	70	140	10	14	5	...	72	172	12	5	0	71	149	12	15	5
M .1	67	140	11	7	28	70	175	8	2	0	...	69	180	12.5	3	3	70	164	11.5	30	11
M .2	75	177	12	3	3	72	190	11	2	20	...	74	176	10	7	5	71	127	10	25	0
M .3	72	210	11	11	14	71	240	11	6	12	...	67	192	11	8	4	72	213	12	0	0
M .4	72	180	11	20	15	71	280	11	10	10	...	68	201	13	6	6	68	180	11	30	20
F	69	136	9	63	21	68	124	7	4	10	...	62	154	8.5	6	0	71	148	6	15	1
F.1	70	162	10	1	0	65	140	9	5	2	...	64	132	9	8	1	65	110	4	5	0
F.2	67	152	8	5	3	62	110	7.5	1	1	...	70	128	9	3	2	64	116	5	5	0
F.3	61	135	8	9	7	64	147	8	10	28	...	63	167	7.5	2	5	70	134	8	15	0
F.4	67	150	7	8	3	64	125	6.5	20	3	...	62	133	7	5	0	73	185	7	30	5

	Sex	Height	Weight	Shoe	tv	Soda
1646	F	71	148	6	15	1
1647	F	65	110	4	5	0
1648	F	64	116	5	5	0
1649	F	70	134	8	15	0
1650	F	73	185	7	30	5

	Sex	Height	Weight	Shoe	tv	Soda
0	M	71	162	10	35	7
1	M	67	140	11	7	28
2	M	75	177	12	3	3
3	M	72	210	11	11	14
4	M	72	180	11	20	15

	Sex	Height	Weight	Shoe	tv	Soda
1645	F	71	148	6	15	1
1646	F	65	110	4	5	0
1647	F	64	116	5	5	0
1648	F	70	134	8	15	0
1649	F	73	185	7	30	5

	Sex	Height	Weight	Shoe	tv	Soda	BMI
1645	F	71.0	148.0	6.0	15.0	1.0	20.639556
1646	F	65.0	110.0	4.0	5.0	0.0	18.302959
1647	F	64.0	116.0	5.0	5.0	0.0	19.909180
1648	F	70.0	134.0	8.0	15.0	0.0	19.224898
1649	F	73.0	185.0	7.0	30.0	5.0	24.405142

	Sex	Height	Weight	Shoe	tv	Soda
0	M	71	162	10	35	7
1	M	67	140	11	7	28
2	M	75	177	12	3	3
3	M	72	210	11	11	14
4	M	72	180	11	20	15

	Sex	Height	Weight	Shoe	tv	Soda
0	M	71	162	10	35	7
1	M	67	140	11	7	28
2	M	75	177	12	3	3
3	M	72	210	11	11	14
4	M	72	180	11	20	15