Week 4: Linear Regression, Curve Fitting, Normal Distributions, and Histograms

In [38]:
import matplotlib.pyplot as plt
import numpy as np

%config InlineBackend.figure_format='retina'

Let's create random points that follow a linear relationship, with some added noise.

$y = mx + b$, which I am going to write as $y = \beta x + \alpha$.

In [39]:
b = np.random.normal(10,3)   # slope (beta)
a = np.random.normal(0,3)    # intercept (alpha)
In [40]:
def func(x):
    noise = np.random.normal(0,1)   # fresh noise sample for each point
    return b*x+a+noise
In [46]:
X = list(np.linspace(0,1,50))
Y =[]
for x in X:
    Y.append(func(x))
In [47]:
plt.plot(X,Y,'x')
Out[47]:
[<matplotlib.lines.Line2D at 0x7fa3107632d0>]

$$ y_i = \alpha + \beta x_i + \epsilon_i $$

The least-squares estimate of the slope is

$$ \beta = \frac{\sum (y_i - \bar{y})(x_i - \bar{x})}{ \sum (x_i - \bar{x})^2 } $$

and of the intercept is

$$ \alpha = \bar{y} - \beta \bar{x} $$

In [48]:
Xmean=np.mean(X)
Ymean=np.mean(Y)
Ymean
Out[48]:
9.26418443747414
In [49]:
Top = 0      # numerator:   sum of (y_i - ybar)*(x_i - xbar)
Bottom = 0   # denominator: sum of (x_i - xbar)**2
for c in range(len(X)):
    Top += (Y[c]-Ymean)*(X[c]-Xmean)
    Bottom += (X[c]-Xmean)**2

B = Top/Bottom          # estimated slope
A = Ymean - B*Xmean     # estimated intercept
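
The same slope and intercept can also be computed without the explicit loop by using NumPy array operations. Below is an unexecuted sketch; it assumes the X and Y lists from above, and B_vec, A_vec are just illustrative names:

In [ ]:
Xa = np.asarray(X)
Ya = np.asarray(Y)
B_vec = np.sum((Ya - Ya.mean())*(Xa - Xa.mean())) / np.sum((Xa - Xa.mean())**2)
A_vec = Ya.mean() - B_vec*Xa.mean()
print(B_vec, A_vec)   # should match B and A above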
In [50]:
L=[]
for x in X:
    L.append(x*B+A)
    
plt.plot(X,Y,'x')
plt.plot(X,L,c='r')
Out[50]:
[<matplotlib.lines.Line2D at 0x7fa318cb8850>]
In [57]:
XX = np.linspace(0,1,50)

def func_line(x):
    return x*B+A
plt.plot(X,Y,'x')
plt.plot(XX,func_line(XX),c='r')
plt.title('Linear Regression line')
plt.show()
In [59]:
print('we originally picked the value a =',a,': But our linear regression gave us A=',A)
print('we originally picked the value b =',b,': But our linear regression gave us B=',B)
we originally picked the value a = 6.320391729352523 : But our linear regression gave us A= 6.2097816830734525
we originally picked the value b = 5.75029991005351 : But our linear regression gave us B= 6.108805508801376
In [65]:
b = np.random.normal(10,3)
a = np.random.normal(0,3)
n=500
noise_level = 0.5

def func(x):
    noise = np.random.normal(0,noise_level)
    return b*x+a+noise

X = list(np.linspace(0,1,n))
Y =[]
for x in X:
    Y.append(func(x))
    
Xmean=np.mean(X)
Ymean=np.mean(Y)

Top=0
Bottom=0
for c in range(0,len(X)):
    Top +=(Y[c]-Ymean)*(X[c]-Xmean)
    Bottom += (X[c]-Xmean)**2

B = Top/Bottom
A = Ymean - B*Xmean

L=[]
for x in X:
    L.append(x*B+A)
    
plt.plot(X,Y,'x',alpha = 0.2)
plt.plot(X,L,c='r')
plt.title('Linear Regression line')
plt.show()


print('we originally picked the value a =',a,': But our linear regression gave us A=',A)
print('we originally picked the value b =',b,': But our linear regression gave us B=',B)
we originally picked the value a = 0.4235257856813822 : But our linear regression gave us A= 0.33171839557927196
we originally picked the value b = 7.02354220808323 : But our linear regression gave us B= 7.161291261449563
In [69]:
Fitting = np.polyfit(X,Y,1)
Fitting
Out[69]:
array([7.16129126, 0.3317184 ])
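
np.polyfit returns the coefficients from the highest degree down, so for a degree-1 fit that is [slope, intercept], which is why the array above matches B and A. A quick check (an unexecuted sketch):

In [ ]:
slope, intercept = np.polyfit(X, Y, 1)   # highest-degree coefficient comes first
print(np.isclose(slope, B), np.isclose(intercept, A))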

Let's look at fitting higher-degree polynomials

In [81]:
a = np.random.normal(10,3)   # quadratic coefficient
b = np.random.normal(0,3)    # linear coefficient
c = np.random.normal(5,1)    # constant term

n = 20
noise_level = 0.5

Let's create a parabola with noise

In [82]:
def func(x):
    noise = np.random.normal(0,noise_level)
    return a*x**2+b*x+c+noise
In [83]:
X = list(np.linspace(-1,1,n))
Y =[]
for x in X:
    Y.append(func(x))
In [84]:
plt.plot(X,Y,'x')
Out[84]:
[<matplotlib.lines.Line2D at 0x7fa318f46990>]
In [86]:
my_fit= np.polyfit(X,Y,2)
my_fit
Out[86]:
array([13.48470613,  1.4062918 ,  5.71469352])
In [88]:
my_func_fit = np.poly1d(my_fit)
In [89]:
my_func_fit(-0.25)
Out[89]:
6.205914706453705
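
np.poly1d wraps a coefficient array (highest degree first) into a callable polynomial, so the value above is just the fitted quadratic evaluated at -0.25. Here is a sketch of the same evaluation done by hand, using the my_fit coefficients from above (aa, bb, cc are illustrative names):

In [ ]:
aa, bb, cc = my_fit                       # fitted quadratic, linear, constant coefficients
print(aa*(-0.25)**2 + bb*(-0.25) + cc)    # should match my_func_fit(-0.25)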
In [93]:
plt.plot(X,Y,'x')
plt.plot(X,my_func_fit(X),color = 'r')
plt.title('curve fitting, parabola')
plt.show()
In [97]:
a = np.random.normal(10,3)
b = np.random.normal(0,3)
c = np.random.normal(5,1)

n = 2000
noise_level = 2

# func() from above looks up a, b, c and noise_level as globals when it is
# called, so it automatically uses these new values.
X = list(np.linspace(-1,1,n))
Y = []
for x in X:
    Y.append(func(x))
    
my_fit= np.polyfit(X,Y,2)
my_func_fit = np.poly1d(my_fit)

plt.plot(X,Y,'x',alpha = 0.2)
plt.plot(X,my_func_fit(X),color = 'r')
plt.title('curve fitting, parabola')
plt.show()
In [100]:
print(my_func_fit.coef)
print(a,b,c)
[10.51114104  1.14742487  4.86526649]
10.592239211394265 1.1586657036901373 4.7619789167750035

Let's look at uniform and normal distributions

In [101]:
dataX = np.random.random(10)
In [102]:
dataX
Out[102]:
array([0.87980287, 0.39819694, 0.57287667, 0.6088297 , 0.14483958,
       0.53828841, 0.82697597, 0.57622236, 0.6278    , 0.74009547])
In [103]:
np.histogram(dataX)
Out[103]:
(array([1, 0, 0, 1, 0, 3, 2, 0, 1, 2]),
 array([0.14483958, 0.21833591, 0.29183224, 0.36532857, 0.4388249 ,
        0.51232122, 0.58581755, 0.65931388, 0.73281021, 0.80630654,
        0.87980287]))
In [108]:
my_histo = np.histogram(dataX)
my_histo[1]
Out[108]:
array([0.14483958, 0.21833591, 0.29183224, 0.36532857, 0.4388249 ,
       0.51232122, 0.58581755, 0.65931388, 0.73281021, 0.80630654,
       0.87980287])
In [110]:
plt.plot(my_histo[1][:-1],my_histo[0],'.')
Out[110]:
[<matplotlib.lines.Line2D at 0x7fa330703d10>]
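
np.histogram returns the counts and the bin edges, and the edges array has one more entry than the counts; above we plotted the counts against the left edges. Plotting against the bin centers instead is a small sketch (it assumes my_histo from above; counts, edges, and centers are just illustrative names):

In [ ]:
counts, edges = my_histo
centers = (edges[:-1] + edges[1:]) / 2   # midpoint of each bin
plt.plot(centers, counts, '.')
plt.show()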
In [138]:
dataX = np.random.random(100000)
my_histo = np.histogram(dataX,10)
plt.plot(my_histo[1][:-1],my_histo[0],'.-')
plt.ylim(0,(max(my_histo[0])*1.1))
plt.show()
In [150]:
dataX = np.random.normal(0,1,10000000)
my_histo = np.histogram(dataX,100)
plt.plot(my_histo[1][:-1],my_histo[0],'.-')
plt.ylim(0,(max(my_histo[0])*1.1))
plt.show()
In [156]:
plt.hist(dataX,100)
plt.show()
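
plt.hist can also rescale the counts into a density with density=True, which makes it easy to compare the histogram against the theoretical standard normal pdf $\frac{1}{\sqrt{2\pi}} e^{-x^2/2}$. A sketch, assuming dataX drawn from N(0,1) as above:

In [ ]:
plt.hist(dataX, 100, density=True, alpha=0.5)
xs = np.linspace(-4, 4, 200)
pdf = np.exp(-xs**2 / 2) / np.sqrt(2*np.pi)   # standard normal pdf (mean 0, std 1)
plt.plot(xs, pdf, c='r')
plt.show()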
In [159]:
dataX = np.random.normal(0,1,1000000)
dataY = np.random.normal(1,2,1000000)

plt.hist(dataX,100,alpha=0.2)
plt.hist(dataY,100,alpha=0.2, color='r')
plt.show()

Let's look at a random-walk experiment that produces an approximately normal distribution

In [179]:
n = 10000      # number of balls
breaks = 100   # number of left/right bounces per ball

stacks = np.zeros(2*breaks+1)

for ball in range(n):
    position = breaks              # every ball starts in the middle
    for bounce in range(breaks):
        if np.random.random() >= 0.5:
            position += 1
        else:
            position += -1

    stacks[position] += 1

# Each bounce changes the position by exactly 1, so after `breaks` bounces the
# final position always has the same parity as the start; that is why every
# other entry of stacks is zero in the output below.
X = list(range(2*breaks+1))
plt.plot(X, stacks, '.')
plt.show()
plt.show()
In [180]:
stacks
Out[180]:
array([  0.,   0.,   0.,   0.,   0.,   0.,   0.,   0.,   0.,   0.,   0.,
         0.,   0.,   0.,   0.,   0.,   0.,   0.,   0.,   0.,   0.,   0.,
         0.,   0.,   0.,   0.,   0.,   0.,   0.,   0.,   0.,   0.,   0.,
         0.,   0.,   0.,   0.,   0.,   0.,   0.,   0.,   0.,   0.,   0.,
         0.,   0.,   0.,   0.,   0.,   0.,   0.,   0.,   0.,   0.,   0.,
         0.,   0.,   0.,   0.,   0.,   0.,   0.,   0.,   0.,   1.,   0.,
         1.,   0.,  12.,   0.,   8.,   0.,  21.,   0.,  26.,   0.,  43.,
         0.,  69.,   0.,  89.,   0., 142.,   0., 240.,   0., 299.,   0.,
       400.,   0., 476.,   0., 589.,   0., 651.,   0., 746.,   0., 751.,
         0., 768.,   0., 780.,   0., 736.,   0., 697.,   0., 556.,   0.,
       488.,   0., 406.,   0., 285.,   0., 237.,   0., 156.,   0., 118.,
         0.,  87.,   0.,  41.,   0.,  35.,   0.,  20.,   0.,  12.,   0.,
         9.,   0.,   1.,   0.,   2.,   0.,   1.,   0.,   1.,   0.,   0.,
         0.,   0.,   0.,   0.,   0.,   0.,   0.,   0.,   0.,   0.,   0.,
         0.,   0.,   0.,   0.,   0.,   0.,   0.,   0.,   0.,   0.,   0.,
         0.,   0.,   0.,   0.,   0.,   0.,   0.,   0.,   0.,   0.,   0.,
         0.,   0.,   0.,   0.,   0.,   0.,   0.,   0.,   0.,   0.,   0.,
         0.,   0.,   0.,   0.,   0.,   0.,   0.,   0.,   0.,   0.,   0.,
         0.,   0.,   0.])
In [187]:
%%time
n = 100000
breaks = 10

stacks = np.zeros(2*breaks+1)

for ball in range(n):
    position = breaks
    for bounce in range(breaks):
        # each bounce now moves the ball by -1, 0, or +1
        position += np.random.choice([-1, 0, 1])
    stacks[position] += 1

X = list(range(2*breaks+1))
plt.plot(X, stacks, '.-')
plt.show()
CPU times: user 14.2 s, sys: 814 ms, total: 15.1 s
Wall time: 16.4 s
In [199]:
%%time
n = 1000000
breaks = 10

stacks = np.zeros(2*breaks+1)

# track all n ball positions at once as a NumPy array
position = np.ones(n)*breaks

for bounce in range(breaks):
    position += np.random.choice([-1, 0, 1], n)   # one draw per ball per bounce

# tally the final positions
for x in position:
    stacks[int(x)] += 1

X = list(range(2*breaks+1))
plt.plot(X, stacks, '.-')
plt.show()
CPU times: user 1.52 s, sys: 145 ms, total: 1.66 s
Wall time: 1.84 s
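
The leftover Python loop that tallies the final positions can itself be replaced with np.bincount, which counts how many times each non-negative integer appears. An unexecuted sketch, assuming position and breaks from the cell above:

In [ ]:
stacks = np.bincount(position.astype(int), minlength=2*breaks+1)
plt.plot(range(2*breaks+1), stacks, '.-')
plt.show()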
In [193]:
stacks
Out[193]:
array([0., 1., 2., 4., 2., 1., 0.])
In [191]:
np.random.choice([-1,0,1],n)
Out[191]:
array([-1,  1,  0, -1,  0,  0,  1, -1,  0, -1])