import numpy as np
from scipy import stats
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd
%matplotlib inline


# Synthetic sample: 20 evenly spaced x values on [0, 10].
X = np.linspace(0, 10, num=20)
# Uniform noise in [0, 1); kept under its own name because a later cell plots it.
YY = np.random.random(20)
# Response is x plus the noise scaled by 2.
Y = X + 2 * YY

# Scatter plot of the synthetic sample.
plt.scatter(x=X, y=Y)

<matplotlib.collections.PathCollection at 0x11746d908>


# Least-squares line through (X, Y). The result is a named tuple with the
# fields slope, intercept, rvalue, pvalue and stderr.
LinReg = stats.linregress(x=X, y=Y)
print(LinReg)
# Show the fitted slope (same value as LinReg[0]).
LinReg.slope

LinregressResult(slope=0.97413179952641371, intercept=1.1956568957765246, rvalue=0.98487751106379229, pvalue=3.7190843392567215e-15, stderr=0.040390361541795157)

0.97413179952641371


# Raw points with the fitted regression line drawn over them.
plt.scatter(x=X, y=Y)
fitted = LinReg.slope * X + LinReg.intercept
plt.plot(X, fitted)

[<matplotlib.lines.Line2D at 0x1150706a0>]


# Scatter plus fitted regression line, with the confidence band disabled.
# x and y are passed by keyword: seaborn 0.12+ treats them as keyword-only,
# and the keyword form is also accepted by older releases.
sns.regplot(x=X, y=Y, ci=None)

<matplotlib.axes._subplots.AxesSubplot at 0x117aa7710>


# Same regression plot, now with a 78% confidence band around the line.
# x and y are passed by keyword: seaborn 0.12+ treats them as keyword-only,
# and the keyword form is also accepted by older releases.
sns.regplot(x=X, y=Y, ci=78)

<matplotlib.axes._subplots.AxesSubplot at 0x117cca358>


# Residuals of the linear fit of Y on X.
# x and y are passed by keyword: seaborn 0.12+ treats them as keyword-only,
# and the keyword form is also accepted by older releases.
sns.residplot(x=X, y=Y)

<matplotlib.axes._subplots.AxesSubplot at 0x117cfb7f0>


# The raw noise term on its own, plotted against X.
plt.scatter(x=X, y=YY)

<matplotlib.collections.PathCollection at 0x118146be0>


# Joint KDE of (X, Y) with the marginal distributions on the axes.
# x and y are passed by keyword: seaborn 0.12+ treats them as keyword-only,
# and the keyword form is also accepted by older releases.
sns.jointplot(x=X, y=Y, kind="kde")

<seaborn.axisgrid.JointGrid at 0x1181510f0>


# Bivariate kernel density estimate of (X, Y), drawn shaded.
# NOTE(review): seaborn 0.11+ expects x=/y= keywords here and renames
# `shade` to `fill`; older releases only accept this positional form.
# Confirm the target seaborn version before changing the call.
sns.kdeplot(X,Y,shade=True)

<matplotlib.axes._subplots.AxesSubplot at 0x118411f98>


# NOTE(review): pandas.stats.api.ols was deprecated in pandas 0.18 and removed
# in 0.20, so this import only works on older pandas. statsmodels provides the
# replacement OLS, but it is not currently a dependency of this notebook.
from pandas.stats.api import ols


# Load the DEMOGRAPHICS sheet of the CHSI workbook into a DataFrame.
# `sheetname` is the spelling accepted by the older pandas this notebook runs
# on; newer pandas calls the parameter `sheet_name`.
dataY = pd.read_excel(
    'chsi_dataset/CHSI DataSet.xls',
    sheetname="DEMOGRAPHICS",
)


# Print the column names, non-null counts and dtypes of the loaded sheet.
dataY.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3141 entries, 0 to 3140
Data columns (total 44 columns):
State_FIPS_Code               3141 non-null int64
County_FIPS_Code              3141 non-null int64
CHSI_County_Name              3141 non-null object
CHSI_State_Name               3141 non-null object
CHSI_State_Abbr               3141 non-null object
Strata_ID_Number              3141 non-null int64
Strata_Determining_Factors    3141 non-null object
Number_Counties               3141 non-null int64
Population_Size               3141 non-null int64
Min_Population_Size           3141 non-null int64
Max_Population_Size           3141 non-null int64
Population_Density            3141 non-null int64
Min_Population_Density        3141 non-null int64
Max_Population_Density        3141 non-null int64
Poverty                       3141 non-null float64
Min_Poverty                   3141 non-null float64
Max_Poverty                   3141 non-null float64
Age_19_Under                  3141 non-null float64
Min_Age_19_Under              3141 non-null float64
Max_Age_19_Under              3141 non-null float64
Age_19_64                     3141 non-null float64
Min_Age_19_64                 3141 non-null float64
Max_Age_19_65                 3141 non-null float64
Age_65_84                     3141 non-null float64
Min_Age_65_84                 3141 non-null float64
Max_Age_65_85                 3141 non-null float64
Age_85_and_Over               3141 non-null float64
Min_Age_85_and_Over           3141 non-null float64
Max_Age_85_and_Over           3141 non-null float64
White                         3141 non-null float64
Min_White                     3141 non-null float64
Max_White                     3141 non-null float64
Black                         3141 non-null float64
Min_Black                     3141 non-null float64
Max_Black                     3141 non-null float64
Native_American               3141 non-null float64
Min_Native_American           3141 non-null float64
Max_Native_American           3141 non-null float64
Asian                         3141 non-null float64
Min_Asian                     3141 non-null float64
Max_Asian                     3141 non-null float64
Hispanic                      3141 non-null float64
Min_Hispanic                  3141 non-null float64
Max_Hispanic                  3141 non-null float64
dtypes: float64(30), int64(10), object(4)
memory usage: 1.1+ MB


# Keep only the county identifiers and the variables used in the analysis below.
demographic_cols = [
    'Poverty',
    'State_FIPS_Code',
    'County_FIPS_Code',
    'Population_Size',
    'Age_65_84',
]
dataY = dataY[demographic_cols]


# Preview the first five rows of the reduced table.
dataY.head(n=5)


# Poverty against Age_65_84 before any filtering.
plt.scatter(x=dataY['Poverty'], y=dataY['Age_65_84'])

<matplotlib.collections.PathCollection at 0x11d99c3c8>


# Keep only rows whose Poverty value is non-negative.
poverty_ok = dataY['Poverty'] >= 0
dataY = dataY[poverty_ok]


# Same scatter after the negative Poverty rows have been removed.
plt.scatter(x=dataY['Poverty'], y=dataY['Age_65_84'])

<matplotlib.collections.PathCollection at 0x11dd4d048>


# Shaded bivariate density of Poverty vs. Age_65_84, with both axes cropped
# to fixed limits.
g = sns.kdeplot(dataY['Poverty'], dataY['Age_65_84'], shade=True)
g.set_xlim(0, 35)
g.set_ylim(0, 30)

(0, 30)


# Regression plot with Poverty on the x axis and Age_65_84 on the y axis.
# x and y are passed by keyword: seaborn 0.12+ treats them as keyword-only,
# and the keyword form is also accepted by older releases.
sns.regplot(x=dataY.Poverty, y=dataY.Age_65_84)

<matplotlib.axes._subplots.AxesSubplot at 0x11debd048>


# OLS with Poverty as the response (y) and Age_65_84 as the regressor (x).
# NOTE(review): this is the reverse orientation of the regplot above and of
# the stats.linregress call that follows (both use Poverty as x), which is
# why the slope reported here differs from the linregress slope. Swap x and y
# if the two fits are meant to be directly comparable.
ols(y=dataY.Poverty,x=dataY.Age_65_84)

-------------------------Summary of Regression Analysis-------------------------

Formula: Y ~ <x> + <intercept>

Number of Observations:         3140
Number of Degrees of Freedom:   2

R-squared:         0.0000
Adj R-squared:    -0.0003

Rmse:              4.8840

F-stat (1, 3138):     0.0974, p-value:     0.7550

Degrees of Freedom: model 1, resid 3138

-----------------------Summary of Estimated Coefficients------------------------
      Variable       Coef    Std Err     t-stat    p-value    CI 2.5%   CI 97.5%
--------------------------------------------------------------------------------
             x    -0.0082     0.0261      -0.31     0.7550    -0.0594     0.0431
     intercept    13.4545     0.3455      38.94     0.0000    12.7772    14.1317
---------------------------------End of Summary---------------------------------


# Least-squares fit of Age_65_84 (y) on Poverty (x).
stats.linregress(x=dataY['Poverty'], y=dataY['Age_65_84'])

LinregressResult(slope=-0.0038033633506301401, intercept=12.840552576425711, rvalue=-0.0055699336836162331, pvalue=0.75504561779548907, stderr=0.012189463361527582)


# Load the RISKFACTORSANDACCESSTOCARE sheet of the same CHSI workbook.
# `sheetname` is the spelling accepted by the older pandas this notebook runs
# on; newer pandas calls the parameter `sheet_name`.
dataX = pd.read_excel(
    'chsi_dataset/CHSI DataSet.xls',
    sheetname="RISKFACTORSANDACCESSTOCARE",
)


# Keep the county identifiers plus the two risk-factor columns.
risk_cols = ['State_FIPS_Code', 'County_FIPS_Code', 'Obesity', 'Smoker']
dataX = dataX[risk_cols]
# As with Poverty earlier, drop the "blips": keep only rows where both
# Obesity and Smoker are non-negative.
risk_ok = (dataX['Obesity'] >= 0) & (dataX['Smoker'] >= 0)
dataX = dataX[risk_ok]


# Obesity against Smoker for the filtered rows.
plt.scatter(x=dataX['Obesity'], y=dataX['Smoker'])

<matplotlib.collections.PathCollection at 0x11d93d978>


# Shaded bivariate density of Obesity vs. Smoker, with both axes cropped
# to fixed limits.
g = sns.kdeplot(dataX['Obesity'], dataX['Smoker'], shade=True)
g.set_xlim(2, 45)
g.set_ylim(0, 50)

(0, 50)


# Regression plot with Obesity on the x axis and Smoker on the y axis.
# x and y are passed by keyword: seaborn 0.12+ treats them as keyword-only,
# and the keyword form is also accepted by older releases.
sns.regplot(x=dataX.Obesity, y=dataX.Smoker)

<matplotlib.axes._subplots.AxesSubplot at 0x11d23e898>


# OLS with Smoker as the response (y) and Obesity as the regressor (x).
ols(y=dataX['Smoker'], x=dataX['Obesity'])

-------------------------Summary of Regression Analysis-------------------------

Formula: Y ~ <x> + <intercept>

Number of Observations:         2108
Number of Degrees of Freedom:   2

R-squared:         0.1712
Adj R-squared:     0.1708

Rmse:              5.0683

F-stat (1, 2106):   434.8750, p-value:     0.0000

Degrees of Freedom: model 1, resid 2106

-----------------------Summary of Estimated Coefficients------------------------
      Variable       Coef    Std Err     t-stat    p-value    CI 2.5%   CI 97.5%
--------------------------------------------------------------------------------
             x     0.4841     0.0232      20.85     0.0000     0.4386     0.5296
     intercept    11.7003     0.5754      20.34     0.0000    10.5726    12.8280
---------------------------------End of Summary---------------------------------


# Least-squares fit of Smoker (y) on Obesity (x).
stats.linregress(x=dataX['Obesity'], y=dataX['Smoker'])

LinregressResult(slope=0.48407652227823772, intercept=11.700283669750053, rvalue=0.41370481483170812, pvalue=5.9802577013953361e-88, stderr=0.023213027902751975)

	Poverty	State_FIPS_Code	County_FIPS_Code	Population_Size	Age_65_84
0	10.4	1	1	48612	9.8
1	10.2	1	3	162586	14.5
2	22.1	1	5	28414	11.6
3	16.8	1	7	21516	10.9
4	11.9	1	9	55725	12.1

Linear regression and pretty plots

A larger example using Pandas

Example one

Example two