import pandas as pd
import os, fnmatch
import numpy as np
import matplotlib.pyplot as plt
# Preview the national names file `yob1995.txt`. The file has no header
# row, so the three column labels are attached right after parsing.
dataX = pd.read_csv('data/names/yob1995.txt', header=None).set_axis(
    ['Name', 'Sex', 'Count'], axis=1
)
dataX.head()
Name | Sex | Count | |
---|---|---|---|
0 | Jessica | F | 27938 |
1 | Ashley | F | 26602 |
2 | Emily | F | 24380 |
3 | Samantha | F | 21643 |
4 | Sarah | F | 21388 |
# Preview one per-state file (MO.TXT). It is also header-less, with five
# fields per row that are labelled here.
dataX = pd.read_csv('data/namesbystate/MO.TXT', header=None).set_axis(
    ['State', 'Sex', 'Year', 'Name', 'Count'], axis=1
)
dataX.head()
State | Sex | Year | Name | Count | |
---|---|---|---|---|---|
0 | MO | F | 1910 | Mary | 611 |
1 | MO | F | 1910 | Helen | 313 |
2 | MO | F | 1910 | Dorothy | 270 |
3 | MO | F | 1910 | Mildred | 267 |
4 | MO | F | 1910 | Ruth | 237 |
# Build `dataY`: every per-state *.TXT file stacked into one long table.
#
# The frames are collected in a list and concatenated ONCE. Concatenating
# onto the growing result inside the loop re-copies all previously loaded
# rows on every iteration, which is quadratic in the total row count.
Files = os.listdir(os.path.abspath('data/namesbystate/'))
state_columns = ['State', 'Sex', 'Year', 'Name', 'Count']
state_frames = []
for file_name in fnmatch.filter(Files, '*.TXT'):
    dataX = pd.read_csv('data/namesbystate/' + file_name, header=None)
    dataX.columns = state_columns
    state_frames.append(dataX)

if state_frames:
    dataY = pd.concat(state_frames, axis=0)
else:
    # pd.concat raises on an empty list; fall back to an empty table that
    # still carries the expected column names.
    dataY = pd.DataFrame(columns=state_columns)

# Drop exact duplicate rows and renumber the rows 0..n-1 so the index is
# unique across states (each file starts its own index at 0).
dataY = dataY.drop_duplicates().reset_index(drop=True)
dataY.tail()
State | Sex | Year | Name | Count | |
---|---|---|---|---|---|
6122885 | DE | M | 2019 | River | 5 |
6122886 | DE | M | 2019 | Rocco | 5 |
6122887 | DE | M | 2019 | Shane | 5 |
6122888 | DE | M | 2019 | Syncere | 5 |
6122889 | DE | M | 2019 | Yasir | 5 |
# Rows recorded as male with the name "Sue" (first 50 at most).
is_male_sue = (dataY.Name == "Sue") & (dataY.Sex == 'M')
dataY[is_male_sue].head(50)
State | Sex | Year | Name | Count | |
---|---|---|---|---|---|
1637340 | WI | M | 1989 | Sue | 5 |
3651429 | MN | M | 1988 | Sue | 5 |
4771445 | CA | M | 1983 | Sue | 5 |
4774849 | CA | M | 1985 | Sue | 7 |
4779197 | CA | M | 1987 | Sue | 6 |
4781400 | CA | M | 1988 | Sue | 6 |
4786621 | CA | M | 1990 | Sue | 5 |
4788574 | CA | M | 1991 | Sue | 7 |
4793665 | CA | M | 1993 | Sue | 7 |
4801811 | CA | M | 1996 | Sue | 5 |
# Total count per (Sex, State, Year, Name) combination, as a Series with a
# four-level index.
pops_keys = ['Sex', 'State', 'Year', 'Name']
M_F_pops = dataY.groupby(pops_keys)['Count'].sum()
M_F_pops.head()
Sex State Year Name F AK 1910 Anna 10 Annie 12 Dorothy 5 Elsie 6 Helen 7 Name: Count, dtype: int64
# Drill down to Sex 'M', State 'MO', Year 1989, then show the name stored
# at position 427 of the remaining Name index.
mo_male_1989 = M_F_pops['M']['MO'][1989]
mo_male_1989.index[427]
'Michael'
# Load the life table. The file is header-less and writes large integers
# with ',' as the thousands separator, so that is passed to the parser.
dataL = pd.read_csv(
    'data/Life_expectancy.csv', header=None, thousands=','
).set_axis(
    [
        'Age',
        'M_death_prob', 'M_amount', 'M_LifeExpect',
        'F_death_prob', 'F_amount', 'F_LifeExpect',
    ],
    axis=1,
)
dataL.tail(30)
Age | M_death_prob | M_amount | M_LifeExpect | F_death_prob | F_amount | F_LifeExpect | |
---|---|---|---|---|---|---|---|
90 | 90 | 0.163689 | 18303 | 4.08 | 0.129706 | 29685 | 4.85 |
91 | 91 | 0.181104 | 15307 | 3.78 | 0.144636 | 25835 | 4.50 |
92 | 92 | 0.199810 | 12535 | 3.50 | 0.160741 | 22098 | 4.18 |
93 | 93 | 0.219765 | 10030 | 3.25 | 0.177971 | 18546 | 3.88 |
94 | 94 | 0.240913 | 7826 | 3.03 | 0.196270 | 15245 | 3.61 |
95 | 95 | 0.261868 | 5941 | 2.83 | 0.214769 | 12253 | 3.37 |
96 | 96 | 0.282225 | 4385 | 2.66 | 0.233174 | 9622 | 3.16 |
97 | 97 | 0.301555 | 3147 | 2.51 | 0.251158 | 7378 | 2.96 |
98 | 98 | 0.319421 | 2198 | 2.37 | 0.268378 | 5525 | 2.79 |
99 | 99 | 0.335392 | 1496 | 2.25 | 0.284481 | 4042 | 2.63 |
100 | 100 | 0.352162 | 994 | 2.13 | 0.301550 | 2892 | 2.48 |
101 | 101 | 0.369770 | 644 | 2.02 | 0.319643 | 2020 | 2.33 |
102 | 102 | 0.388259 | 406 | 1.91 | 0.338821 | 1374 | 2.19 |
103 | 103 | 0.407672 | 248 | 1.81 | 0.359151 | 909 | 2.06 |
104 | 104 | 0.428055 | 147 | 1.71 | 0.380700 | 582 | 1.93 |
105 | 105 | 0.449458 | 84 | 1.61 | 0.403542 | 361 | 1.81 |
106 | 106 | 0.471931 | 46 | 1.52 | 0.427754 | 215 | 1.69 |
107 | 107 | 0.495527 | 24 | 1.43 | 0.453420 | 123 | 1.58 |
108 | 108 | 0.520304 | 12 | 1.35 | 0.480625 | 67 | 1.47 |
109 | 109 | 0.546319 | 6 | 1.27 | 0.509462 | 35 | 1.37 |
110 | 110 | 0.573635 | 3 | 1.19 | 0.540030 | 17 | 1.27 |
111 | 111 | 0.602317 | 1 | 1.11 | 0.572432 | 8 | 1.18 |
112 | 112 | 0.632432 | 0 | 1.04 | 0.606778 | 3 | 1.09 |
113 | 113 | 0.664054 | 0 | 0.97 | 0.643184 | 1 | 1.01 |
114 | 114 | 0.697257 | 0 | 0.91 | 0.681775 | 0 | 0.93 |
115 | 115 | 0.732119 | 0 | 0.84 | 0.722682 | 0 | 0.86 |
116 | 116 | 0.768725 | 0 | 0.78 | 0.766043 | 0 | 0.79 |
117 | 117 | 0.807162 | 0 | 0.73 | 0.807162 | 0 | 0.73 |
118 | 118 | 0.847520 | 0 | 0.67 | 0.847520 | 0 | 0.67 |
119 | 119 | 0.889896 | 0 | 0.62 | 0.889896 | 0 | 0.62 |
# Survival curve per sex: the `*_amount` column divided by 100000
# (presumably the life table's starting cohort size -- confirm against the
# data source) is plotted against age.
plt.figure(figsize=(13, 6))
for amount_column, sex_label in (('M_amount', 'Male'), ('F_amount', 'Female')):
    plt.plot(dataL.Age, dataL[amount_column] / 100000, label=sex_label)
plt.title('probability of being alive at a certain age')
plt.xlabel('Age')
plt.legend()
plt.grid()
plt.show()
# Distinct birth years present in the combined table, in order of first
# appearance.
Years = dataY['Year'].unique()
Years
array([1910, 1911, 1912, 1913, 1914, 1915, 1916, 1917, 1918, 1919, 1920, 1921, 1922, 1923, 1924, 1925, 1926, 1927, 1928, 1929, 1930, 1931, 1932, 1933, 1934, 1935, 1936, 1937, 1938, 1939, 1940, 1941, 1942, 1943, 1944, 1945, 1946, 1947, 1948, 1949, 1950, 1951, 1952, 1953, 1954, 1955, 1956, 1957, 1958, 1959, 1960, 1961, 1962, 1963, 1964, 1965, 1966, 1967, 1968, 1969, 1970, 1971, 1972, 1973, 1974, 1975, 1976, 1977, 1978, 1979, 1980, 1981, 1982, 1983, 1984, 1985, 1986, 1987, 1988, 1989, 1990, 1991, 1992, 1993, 1994, 1995, 1996, 1997, 1998, 1999, 2000, 2001, 2002, 2003, 2004, 2005, 2006, 2007, 2008, 2009, 2010, 2011, 2012, 2013, 2014, 2015, 2016, 2017, 2018, 2019])
# Survival fraction for each birth year, taking 2021 as the reference year.
# The life table is addressed by row POSITION (2021 - birth year), which
# relies on row i holding age i. Both results are kept as plain lists so
# later arithmetic with a Series stays positional rather than index-aligned.
ages_in_2021 = [2021 - birth_year for birth_year in Years]
M_prob_alive = [dataL.M_amount.iloc[age] / 100000 for age in ages_in_2021]
F_prob_alive = [dataL.F_amount.iloc[age] / 100000 for age in ages_in_2021]

plt.figure(figsize=(13, 6))
for alive_fraction, sex_label in ((M_prob_alive, 'Male'), (F_prob_alive, 'Female')):
    plt.plot(Years, alive_fraction, label=sex_label)
plt.title('probability of being alive with a certain birth year, assuming uniform birth rate')
plt.xlabel('Birth Year')
plt.legend()
plt.grid()
plt.show()
# Total count per (Sex, Year), summed over all states and names.
Birth_year = dataY.groupby(['Sex', 'Year'])['Count'].sum()
Birth_year
Sex Year F 1910 352089 1911 372376 1912 504298 1913 566973 1914 696907 ... M 2015 1676595 2016 1653511 2017 1604609 2018 1568678 2019 1538056 Name: Count, Length: 220, dtype: int64
# Yearly totals per sex.
plt.figure(figsize=(13, 6))
for sex_code, sex_label in (('M', 'Male'), ('F', 'Female')):
    plt.plot(Birth_year[sex_code], label=sex_label)
plt.title('SSN applications')
plt.xlabel('Year')
plt.legend()
plt.grid()
plt.show()
# Each sex's share of the yearly total (male + female).
both_sexes_total = Birth_year['M'] + Birth_year['F']
plt.figure(figsize=(13, 6))
for sex_code, sex_label in (('M', 'Male'), ('F', 'Female')):
    plt.plot(Birth_year[sex_code] / both_sexes_total, label=sex_label)
plt.title('SSN applications')
plt.xlabel('Year')
plt.legend()
plt.grid()
plt.show()
# Sanity check: number of yearly entries in the female series.
len(Birth_year["F"])
110
# Weight each birth year's total by the survival fraction for that year,
# then normalise so the weights of each sex sum to 1. The survival lists
# are multiplied positionally against the year-indexed Series.
F_weighted = Birth_year["F"] * F_prob_alive
M_weighted = Birth_year["M"] * M_prob_alive
F_data = F_weighted / sum(F_weighted)
M_data = M_weighted / sum(M_weighted)

plt.figure(figsize=(13, 6))
for year_weights, sex_label in ((M_data, 'Male'), (F_data, 'Female')):
    plt.plot(year_weights, label=sex_label)
plt.title('probability of being alive with a certain birth year')
plt.xlabel('Birth Year')
plt.legend()
plt.grid()
plt.show()
def Generate_year(Name, Sex):
    """Return normalised birth-year weights for a given name and sex.

    The per-year values from ``get_Name_data(Name, Sex)`` are multiplied
    element-wise by the module-level per-year weights (``F_data`` when
    ``Sex == 'F'``, otherwise ``M_data``) and rescaled to sum to 1.

    Parameters:
        Name: first name to look up.
        Sex: ``'F'`` selects ``F_data``; any other value selects ``M_data``.

    Returns:
        A Series indexed like the chosen weight Series.
    """
    year_weights = F_data if Sex == 'F' else M_data
    weighted = get_Name_data(Name, Sex) * year_weights
    return weighted / sum(weighted)
def get_Name_data(Name, Sex):
    """Return the normalised per-year share of births carrying ``Name``.

    For every year in ``M_data.index`` the share is the count of ``Name``
    (for the given ``Sex``) divided by the total count of that sex in that
    year; years in which the name does not occur contribute 0. The shares
    are then rescaled so they sum to 1.

    Parameters:
        Name: first name to look up in ``dataY.Name``.
        Sex: value to match against ``dataY.Sex``.

    Returns:
        A float ``numpy.ndarray`` with one entry per year of
        ``M_data.index``, in that order.

    Raises:
        KeyError: if ``dataY`` holds no rows for this name/sex combination
            within the years of ``M_data.index`` (normalising would
            otherwise divide by zero).
    """
    years = M_data.index

    # Filter by sex once; both the totals and the name counts derive from it.
    of_sex = dataY[dataY.Sex == Sex]
    totals = of_sex.groupby('Year')['Count'].sum()
    name_counts = of_sex[of_sex.Name == Name].groupby('Year')['Count'].sum()

    # Align both series on the reference years. A year without the name
    # counts as 0; a year with no births at all yields 0/NaN, mapped to 0.
    share = (
        name_counts.reindex(years, fill_value=0)
        .div(totals.reindex(years))
        .fillna(0)
    )

    total_share = share.sum()
    if not total_share > 0:
        raise KeyError(
            'no records for name {!r} with sex {!r}'.format(Name, Sex)
        )
    return (share / total_share).to_numpy()
# Example: normalised birth-year weights for the name 'Gavin' with sex 'M'.
Generate_year('Gavin','M')
Year 1910 0.000000 1911 0.000000 1912 0.000000 1913 0.000000 1914 0.000000 ... 2015 0.033014 2016 0.028173 2017 0.023703 2018 0.019722 2019 0.017107 Name: Count, Length: 110, dtype: float64
# Overlay the birth-year weight curves of several (name, sex) pairs; the two
# tuples are matched up position by position.
Names = ('Helen', 'Richard', 'Mary')
Sexs = ('F', "M", "F")
plt.figure(figsize=(13, 6))
for person_name, person_sex in zip(Names, Sexs):
    plt.plot(Generate_year(person_name, person_sex), label=person_name)
plt.title('probability of birth year given name and sex')
plt.xlabel('Birth Year')
plt.legend()
plt.grid()
plt.show()