import pandas as pd
import os, fnmatch
import matplotlib.pyplot as plt
%matplotlib inline


# Load the 1880 file. header=None tells pandas the file has no header row,
# so the columns start out with the default integer labels.
dataX = pd.read_csv('names/yob1880.txt', header=None)


dataX.head()


# Give the three columns descriptive labels.
dataX.columns = ['Name', 'Sex', 'number']
dataX.head()


# Tag every row with the year this file belongs to.
dataX['Year'] = 1880
dataX.head()


# Same preparation for the 1881 file: read, label the columns, tag the year.
dataY = pd.read_csv('names/yob1881.txt', header=None)
dataY.columns = ['Name', 'Sex', 'number']
dataY['Year'] = 1881
dataY.head()


# Stack the 1880 and 1881 frames on top of each other. No index options are
# passed, so each frame keeps its own row labels.
dataXY = pd.concat([dataX, dataY])


# Preview the first and last three rows of the stacked frame.
print('The top of my new data frame', dataXY.head(3), sep='\n')
print('and the bottom of my new data frame', dataXY.tail(3), sep='\n')

The top of my new data frame
   Name Sex  number  Year
0  Mary   F    7065  1880
1  Anna   F    2604  1880
2  Emma   F    2003  1880
and the bottom of my new data frame
        Name Sex  number  Year
1932    Wing   M       5  1881
1933    Wood   M       5  1881
1934  Wright   M       5  1881


# Number of rows in the stacked frame.
dataXY.shape[0]

3935


# Discard the per-file row labels and renumber the rows consecutively from 0.
dataXY = dataXY.reset_index(drop=True)
dataXY.tail(n=5)


# Build one frame holding every yearly file found in the "names" directory.
# Each file is read exactly once and all frames are concatenated in a single
# call, so no year is loaded twice and the accumulated data is not re-copied
# on every iteration.
files = os.listdir(os.path.abspath("names"))
yearly_frames = []

# Only yob<YEAR>.txt files carry the year at characters 3-6 of their name.
# Sorting makes the row order independent of the order in which the OS happens
# to list the directory.
for filename in sorted(fnmatch.filter(files, "yob*.txt")):
    dataY = pd.read_csv(os.path.join('names', filename), header=None)
    dataY.columns = ('Name', 'Sex', 'number')
    dataY['Year'] = int(filename[3:7])
    yearly_frames.append(dataY)

# Row labels restart at 0 for every file here; no index options are passed.
dataX = pd.concat(yearly_frames)


# Row count of the combined frame, followed by a look at its last rows.
print(dataX.shape[0])
dataX.tail(n=5)

1860689


# Remove rows that are exact repeats of an earlier row, then renumber the
# remaining rows consecutively from 0. The result is re-wrapped in a
# DataFrame exactly as before.
dataX = pd.DataFrame(dataX.drop_duplicates().reset_index(drop=True))
print(dataX.shape[0])
print(dataX.tail(n=5))

1858689
           Name Sex  number  Year
1858684  Zykell   M       5  2015
1858685  Zyking   M       5  2015
1858686   Zykir   M       5  2015
1858687   Zyrus   M       5  2015
1858688    Zyus   M       5  2015


# Sum of the whole "number" column, and the largest single value in it.
print('The total number of names registered for a SNN since 1880 is :', dataX['number'].sum(), sep='')
print('The highest name registered in one particular year was :', dataX['number'].max(), sep='')

The total number of names registered for a SNN since 1880 is :340851912
The highest name registered in one particular year was :99680


# Row(s) whose count equals 99680.
dataX.loc[dataX['number'] == 99680].head()


# Split the records by the value of the Sex column.
dataXF = dataX.loc[dataX['Sex'] == "F"]
dataXM = dataX.loc[dataX['Sex'] == "M"]


# For each sex, show the row(s) holding that subset's largest count.
print(dataXM.loc[dataXM['number'] == dataXM['number'].max()])
print(dataXF.loc[dataXF['number'] == dataXF['number'].max()])

         Name Sex  number  Year
437156  James   M   94763  1947
         Name Sex  number  Year
431053  Linda   F   99680  1947


# Count the distinct values of the Name column within each subset.
print('Number of different Male names :', dataXM['Name'].nunique(), sep='')
print('Number of different Female names :', dataXF['Name'].nunique(), sep='')

Number of different Male names :39728
Number of different Female names :65658


# Sum of the "number" column within each subset.
print('Number of Males :', dataXM['number'].sum(), sep='')
print('Number of Females:', dataXF['number'].sum(), sep='')

Number of Males :171990331
Number of Females:168861581


# Pick the rows for one name from each subset, then draw count against year
# (the "Linda" line first, the "James" line second).
dataNF = dataXF.loc[dataXF['Name'] == "Linda"]
dataNM = dataXM.loc[dataXM['Name'] == "James"]
plt.plot(dataNF['Year'], dataNF['number'])
plt.plot(dataNM['Year'], dataNM['number'])

[<matplotlib.lines.Line2D at 0x11a2e6e10>]


# Total count per year for each subset: a Series indexed by Year.
dataXFT = dataXF.groupby('Year')['number'].sum()
dataXMT = dataXM.groupby('Year')['number'].sum()

# Plot both yearly series on the same axes (female subset first).
plt.plot(dataXFT)
plt.plot(dataXMT)

[<matplotlib.lines.Line2D at 0x11a4e7080>]


# Report the year with the largest yearly total for each subset.
# idxmax() returns the index label (the year) of the maximum. argmax() returns
# the integer position on current pandas, which is not a year.
print('The maximum female registrations happened in : '+str(dataXFT.idxmax()) + ', with a total number of females being registered: '+ str(dataXFT.max()))
print('The maximum male registrations happened in : '+str(dataXMT.idxmax()) + ', with a total number of males being registered: '+ str(dataXMT.max()))

The maximum female registrations happened in : 1957, with a total number of females being registered: 2044225
The maximum male registrations happened in : 1957, with a total number of males being registered: 2155921


# Read the population file, keep only the Date and Pop columns, and use Date
# as the row index.
dataP = pd.read_csv('names/USpop.csv')[['Date', 'Pop']].set_index('Date')
dataP.head()
plt.plot(dataP)

[<matplotlib.lines.Line2D at 0x11af56198>]


# Lookup table: each index label of dataP (the Date) -> its Pop value.
TotalPop = dataP['Pop'].to_dict()


# Keep only the rows from 1900 onwards. The explicit copy makes the result an
# independent frame, so the column assignments below do not operate on a slice
# of the original (which triggers pandas' SettingWithCopyWarning).
dataX = dataX[dataX.Year >= 1900].copy()

# Look up the population for each row's year; years missing from TotalPop
# become NaN.
dataX['TotalPop'] = dataX.Year.map(TotalPop)

# Count divided by that year's population (a fraction, not scaled by 100).
dataX['PopPercent'] = dataX.number / dataX.TotalPop
dataX.head()


# Rebuild the per-sex subsets so they include the new population columns.
dataXF = dataX.loc[dataX['Sex'] == "F"]
dataXM = dataX.loc[dataX['Sex'] == "M"]


# For each sex, show the row(s) holding that subset's largest PopPercent.
print(dataXM.loc[dataXM['PopPercent'] == dataXM['PopPercent'].max()])
print(dataXF.loc[dataXF['PopPercent'] == dataXF['PopPercent'].max()])

         Name Sex  number  Year   TotalPop  PopPercent
437156  James   M   94763  1947  144130000    0.000657
         Name Sex  number  Year   TotalPop  PopPercent
431053  Linda   F   99680  1947  144130000    0.000692


# Pick the rows for one name from each subset, then draw PopPercent against
# year (the "Linda" line first, the "James" line second).
dataNF = dataXF.loc[dataXF['Name'] == "Linda"]
dataNM = dataXM.loc[dataXM['Name'] == "James"]
plt.plot(dataNF['Year'], dataNF['PopPercent'])
plt.plot(dataNM['Year'], dataNM['PopPercent'])

[<matplotlib.lines.Line2D at 0x11a2e6780>]


# Sum of PopPercent per year for each subset: a Series indexed by Year.
dataXFT = dataXF.groupby('Year')['PopPercent'].sum()
dataXMT = dataXM.groupby('Year')['PopPercent'].sum()

# Plot both yearly series on the same axes (female subset first).
plt.plot(dataXFT)
plt.plot(dataXMT)

[<matplotlib.lines.Line2D at 0x1392677b8>]


# Report the year with the largest yearly PopPercent sum for each subset,
# together with that sum doubled.
# idxmax() returns the index label (the year) of the maximum. argmax() returns
# the integer position on current pandas, which is not a year.
print('The maximum percentage growth of female registrations happened in : '+str(dataXFT.idxmax()) + ', with a total growth rate of females being registered: '+ str(dataXFT.max()*2))
print('The maximum percentage growth of male registrations happened in : '+str(dataXMT.idxmax()) + ', with a total growth rate of males being registered: '+ str(dataXMT.max()*2))

The maximum percentage growth of female registrations happened in : 1947, with a total growth rate of females being registered: 0.0245979324221
The maximum percentage growth of male registrations happened in : 1947, with a total growth rate of males being registered: 0.0253839311732


# Largest yearly female PopPercent sum, and the 2015 value, each scaled by
# 100 and then doubled before printing.
print("The growth rate of females in 1947 was: ", dataXFT.max()*100*2, '%', sep='')
print("But in 2015, the rate was : ", dataXFT.loc[2015]*100*2, '%', sep='')

The growth rate of females in 1947 was: 2.45979324221%
But in 2015, the rate was : 1.10506839048%

	Name	Sex	number	Year	TotalPop	PopPercent
52265	Mary	F	16707	1900	76090000	0.000220
52266	Helen	F	6343	1900	76090000	0.000083
52267	Anna	F	6114	1900	76090000	0.000080
52268	Margaret	F	5304	1900	76090000	0.000070
52269	Ruth	F	4765	1900	76090000	0.000063

Pandas - Basic

Next step: building onto our DataFrame

	Name	Sex	number	Year
0	Mary	F	7065	1880
1	Anna	F	2604	1880
2	Emma	F	2003	1880
3	Elizabeth	F	1939	1880
4	Minnie	F	1746	1880

	Name	Sex	number	Year
0	Mary	F	6919	1881
1	Anna	F	2698	1881
2	Emma	F	2034	1881
3	Elizabeth	F	1852	1881
4	Margaret	F	1658	1881

	Name	Sex	number	Year
3930	Wiliam	M	5	1881
3931	Wilton	M	5	1881
3932	Wing	M	5	1881
3933	Wood	M	5	1881
3934	Wright	M	5	1881

	Name	Sex	number	Year
32947	Zykell	M	5	2015
32948	Zyking	M	5	2015
32949	Zykir	M	5	2015
32950	Zyrus	M	5	2015
32951	Zyus	M	5	2015