import pandas as pd
import os, fnmatch
import numpy as np
import matplotlib.pyplot as plt
# Read data/names/yob1995.txt. The file is read without a header row, so the
# three column labels are supplied explicitly.
dataX = pd.read_csv(
    'data/names/yob1995.txt',
    header=None,
    names=['Name', 'Sex', 'Count'],
)
dataX.head()
| | Name | Sex | Count |
|---|---|---|---|
| 0 | Jessica | F | 27938 |
| 1 | Ashley | F | 26602 |
| 2 | Emily | F | 24380 |
| 3 | Samantha | F | 21643 |
| 4 | Sarah | F | 21388 |
# Read the single state file data/namesbystate/MO.TXT. It is read without a
# header row, so the five column labels are supplied explicitly.
dataX = pd.read_csv(
    'data/namesbystate/MO.TXT',
    header=None,
    names=['State', 'Sex', 'Year', 'Name', 'Count'],
)
dataX.head()
| | State | Sex | Year | Name | Count |
|---|---|---|---|---|---|
| 0 | MO | F | 1910 | Mary | 611 |
| 1 | MO | F | 1910 | Helen | 313 |
| 2 | MO | F | 1910 | Dorothy | 270 |
| 3 | MO | F | 1910 | Mildred | 267 |
| 4 | MO | F | 1910 | Ruth | 237 |
# Load every *.TXT file under data/namesbystate/ into one DataFrame (dataY).
STATE_DIR = 'data/namesbystate/'
STATE_COLUMNS = ['State', 'Sex', 'Year', 'Name', 'Count']
Files = os.listdir(os.path.abspath(STATE_DIR))

# Collect the per-file frames and concatenate once at the end. Calling
# pd.concat inside the loop copies all previously loaded rows again on every
# iteration, and also concatenates onto an empty, column-less DataFrame.
state_frames = []
for files in fnmatch.filter(Files, '*.TXT'):
    dataX = pd.read_csv(os.path.join(STATE_DIR, files), header=None)
    dataX.columns = STATE_COLUMNS
    state_frames.append(dataX)

if state_frames:
    dataY = pd.concat(state_frames, axis=0)
else:
    # pd.concat raises on an empty list; fall back to an empty frame that
    # still carries the expected column labels.
    dataY = pd.DataFrame(columns=STATE_COLUMNS)

# Remove exact duplicate rows and renumber the rows 0..n-1.
dataY = dataY.drop_duplicates().reset_index(drop=True)
dataY.tail()
| | State | Sex | Year | Name | Count |
|---|---|---|---|---|---|
| 6122885 | DE | M | 2019 | River | 5 |
| 6122886 | DE | M | 2019 | Rocco | 5 |
| 6122887 | DE | M | 2019 | Shane | 5 |
| 6122888 | DE | M | 2019 | Syncere | 5 |
| 6122889 | DE | M | 2019 | Yasir | 5 |
# Show up to 50 rows where the name "Sue" is recorded with sex 'M'.
is_male_sue = (dataY.Sex == 'M') & (dataY.Name == "Sue")
dataY.loc[is_male_sue].head(50)
| | State | Sex | Year | Name | Count |
|---|---|---|---|---|---|
| 1637340 | WI | M | 1989 | Sue | 5 |
| 3651429 | MN | M | 1988 | Sue | 5 |
| 4771445 | CA | M | 1983 | Sue | 5 |
| 4774849 | CA | M | 1985 | Sue | 7 |
| 4779197 | CA | M | 1987 | Sue | 6 |
| 4781400 | CA | M | 1988 | Sue | 6 |
| 4786621 | CA | M | 1990 | Sue | 5 |
| 4788574 | CA | M | 1991 | Sue | 7 |
| 4793665 | CA | M | 1993 | Sue | 7 |
| 4801811 | CA | M | 1996 | Sue | 5 |
# Sum of Count for every (Sex, State, Year, Name) combination in dataY.
pop_keys = ['Sex', 'State', 'Year', 'Name']
M_F_pops = dataY.groupby(pop_keys)['Count'].sum()
M_F_pops.head()
Sex State Year Name
F AK 1910 Anna 10
Annie 12
Dorothy 5
Elsie 6
Helen 7
Name: Count, dtype: int64
# Name found at position 427 of the index for sex 'M', state 'MO', year 1989.
M_F_pops.loc[('M', 'MO', 1989)].index[427]
'Michael'
# Read data/Life_expectancy.csv. It is read without a header row, and
# thousands=',' makes pandas treat commas inside numbers as digit separators.
LIFE_TABLE_COLUMNS = [
    'Age',
    'M_death_prob',
    'M_amount',
    'M_LifeExpect',
    'F_death_prob',
    'F_amount',
    'F_LifeExpect',
]
dataL = pd.read_csv(
    'data/Life_expectancy.csv',
    header=None,
    thousands=',',
    names=LIFE_TABLE_COLUMNS,
)
dataL.tail(30)
| | Age | M_death_prob | M_amount | M_LifeExpect | F_death_prob | F_amount | F_LifeExpect |
|---|---|---|---|---|---|---|---|
| 90 | 90 | 0.163689 | 18303 | 4.08 | 0.129706 | 29685 | 4.85 |
| 91 | 91 | 0.181104 | 15307 | 3.78 | 0.144636 | 25835 | 4.50 |
| 92 | 92 | 0.199810 | 12535 | 3.50 | 0.160741 | 22098 | 4.18 |
| 93 | 93 | 0.219765 | 10030 | 3.25 | 0.177971 | 18546 | 3.88 |
| 94 | 94 | 0.240913 | 7826 | 3.03 | 0.196270 | 15245 | 3.61 |
| 95 | 95 | 0.261868 | 5941 | 2.83 | 0.214769 | 12253 | 3.37 |
| 96 | 96 | 0.282225 | 4385 | 2.66 | 0.233174 | 9622 | 3.16 |
| 97 | 97 | 0.301555 | 3147 | 2.51 | 0.251158 | 7378 | 2.96 |
| 98 | 98 | 0.319421 | 2198 | 2.37 | 0.268378 | 5525 | 2.79 |
| 99 | 99 | 0.335392 | 1496 | 2.25 | 0.284481 | 4042 | 2.63 |
| 100 | 100 | 0.352162 | 994 | 2.13 | 0.301550 | 2892 | 2.48 |
| 101 | 101 | 0.369770 | 644 | 2.02 | 0.319643 | 2020 | 2.33 |
| 102 | 102 | 0.388259 | 406 | 1.91 | 0.338821 | 1374 | 2.19 |
| 103 | 103 | 0.407672 | 248 | 1.81 | 0.359151 | 909 | 2.06 |
| 104 | 104 | 0.428055 | 147 | 1.71 | 0.380700 | 582 | 1.93 |
| 105 | 105 | 0.449458 | 84 | 1.61 | 0.403542 | 361 | 1.81 |
| 106 | 106 | 0.471931 | 46 | 1.52 | 0.427754 | 215 | 1.69 |
| 107 | 107 | 0.495527 | 24 | 1.43 | 0.453420 | 123 | 1.58 |
| 108 | 108 | 0.520304 | 12 | 1.35 | 0.480625 | 67 | 1.47 |
| 109 | 109 | 0.546319 | 6 | 1.27 | 0.509462 | 35 | 1.37 |
| 110 | 110 | 0.573635 | 3 | 1.19 | 0.540030 | 17 | 1.27 |
| 111 | 111 | 0.602317 | 1 | 1.11 | 0.572432 | 8 | 1.18 |
| 112 | 112 | 0.632432 | 0 | 1.04 | 0.606778 | 3 | 1.09 |
| 113 | 113 | 0.664054 | 0 | 0.97 | 0.643184 | 1 | 1.01 |
| 114 | 114 | 0.697257 | 0 | 0.91 | 0.681775 | 0 | 0.93 |
| 115 | 115 | 0.732119 | 0 | 0.84 | 0.722682 | 0 | 0.86 |
| 116 | 116 | 0.768725 | 0 | 0.78 | 0.766043 | 0 | 0.79 |
| 117 | 117 | 0.807162 | 0 | 0.73 | 0.807162 | 0 | 0.73 |
| 118 | 118 | 0.847520 | 0 | 0.67 | 0.847520 | 0 | 0.67 |
| 119 | 119 | 0.889896 | 0 | 0.62 | 0.889896 | 0 | 0.62 |
# Plot M_amount and F_amount, each divided by 100000, against Age.
# NOTE(review): presumably the *_amount columns count survivors out of a
# starting cohort of 100,000, which is why the ratio is titled a probability.
plt.figure(figsize=(13, 6))
for column, label in (('M_amount', 'Male'), ('F_amount', 'Female')):
    plt.plot(dataL.Age, dataL[column] / 100000, label=label)
plt.title('probability of being alive at a certain age')
plt.xlabel('Age')
plt.legend()
plt.grid()
plt.show()
# Distinct values found in dataY's Year column.
Years = dataY['Year'].unique()
Years
array([1910, 1911, 1912, 1913, 1914, 1915, 1916, 1917, 1918, 1919, 1920,
1921, 1922, 1923, 1924, 1925, 1926, 1927, 1928, 1929, 1930, 1931,
1932, 1933, 1934, 1935, 1936, 1937, 1938, 1939, 1940, 1941, 1942,
1943, 1944, 1945, 1946, 1947, 1948, 1949, 1950, 1951, 1952, 1953,
1954, 1955, 1956, 1957, 1958, 1959, 1960, 1961, 1962, 1963, 1964,
1965, 1966, 1967, 1968, 1969, 1970, 1971, 1972, 1973, 1974, 1975,
1976, 1977, 1978, 1979, 1980, 1981, 1982, 1983, 1984, 1985, 1986,
1987, 1988, 1989, 1990, 1991, 1992, 1993, 1994, 1995, 1996, 1997,
1998, 1999, 2000, 2001, 2002, 2003, 2004, 2005, 2006, 2007, 2008,
2009, 2010, 2011, 2012, 2013, 2014, 2015, 2016, 2017, 2018, 2019])
# For each year in Years, look up the dataL row at position (2021 - year) and
# divide its *_amount value by 100000.
# NOTE(review): iloc is positional, so this assumes row i of dataL holds
# age i and that 2021 is the intended reference year; confirm both.
REFERENCE_YEAR = 2021
row_positions = [REFERENCE_YEAR - x for x in Years]
M_prob_alive = [dataL.iloc[pos].M_amount / 100000 for pos in row_positions]
F_prob_alive = [dataL.iloc[pos].F_amount / 100000 for pos in row_positions]

plt.figure(figsize=(13, 6))
for values, label in ((M_prob_alive, 'Male'), (F_prob_alive, 'Female')):
    plt.plot(Years, values, label=label)
plt.title('probability of being alive with a certain birth year, assuming uniform birth rate')
plt.xlabel('Birth Year')
plt.legend()
plt.grid()
plt.show()
# Sum of Count for every (Sex, Year) pair in dataY.
Birth_year = dataY.groupby(['Sex', 'Year'])['Count'].sum()
Birth_year
Sex Year
F 1910 352089
1911 372376
1912 504298
1913 566973
1914 696907
...
M 2015 1676595
2016 1653511
2017 1604609
2018 1568678
2019 1538056
Name: Count, Length: 220, dtype: int64
# Plot the yearly Count totals in Birth_year, one line per sex.
plt.figure(figsize=(13, 6))
for sex_code, label in (('M', 'Male'), ('F', 'Female')):
    plt.plot(Birth_year[sex_code], label=label)
plt.title('SSN applications')
plt.xlabel('Year')
plt.legend()
plt.grid()
plt.show()
# Plot, for each year, the fraction of the combined (M + F) Count total that
# belongs to each sex. The denominator is computed once and shared by both
# lines.
total_by_year = Birth_year['M'] + Birth_year['F']
plt.figure(figsize=(13, 6))
plt.plot(Birth_year['M'] / total_by_year, label='Male')
plt.plot(Birth_year['F'] / total_by_year, label='Female')
# The previous title ('SSN applications') was identical to the raw-count
# figure; this figure shows shares, not counts.
plt.title('Share of SSN applications by sex')
plt.xlabel('Year')
plt.legend()
plt.grid()
plt.show()
# Number of entries in Birth_year for sex "F".
Birth_year["F"].shape[0]
110
# Weight each year's Count total by the matching *_prob_alive entry, then
# rescale so that each sex's weights sum to 1.
F_weighted = Birth_year["F"] * F_prob_alive
M_weighted = Birth_year["M"] * M_prob_alive
F_data = F_weighted / sum(F_weighted)
M_data = M_weighted / sum(M_weighted)

plt.figure(figsize=(13, 6))
for series, label in ((M_data, 'Male'), (F_data, 'Female')):
    plt.plot(series, label=label)
plt.title('probability of being alive with a certain birth year')
plt.xlabel('Birth Year')
plt.legend()
plt.grid()
plt.show()
def Generate_year(Name, Sex):
    """Return per-year weights for ``Name``/``Sex`` rescaled to sum to 1.

    The values from ``get_Name_data(Name, Sex)`` are multiplied element-wise
    by ``F_data`` when ``Sex == 'F'`` and by ``M_data`` for any other value of
    ``Sex``; the product is then divided by its own sum.
    """
    sex_weights = F_data if Sex == 'F' else M_data
    weighted = get_Name_data(Name, Sex) * sex_weights
    return weighted / sum(weighted)
def get_Name_data(Name, Sex, data=None):
    """Return the normalized per-year share of ``Name`` among records of ``Sex``.

    For every year in which ``data`` has at least one record for ``Sex``, the
    share is (Count summed over rows matching ``Name``) divided by (Count
    summed over all rows of that sex). Years in which the name does not occur
    get a share of 0. The shares are finally divided by their sum so that the
    result adds up to 1.

    Args:
        Name: Value matched against the ``Name`` column.
        Sex: Value matched against the ``Sex`` column.
        data: DataFrame with ``Sex``, ``Year``, ``Name`` and ``Count``
            columns. Defaults to the module-level ``dataY``.

    Returns:
        A one-dimensional numpy array with one entry per year recorded for
        ``Sex``, in ascending year order (groupby sorts its keys by default).

    Raises:
        KeyError: If no row matches both ``Name`` and ``Sex``.
    """
    if data is None:
        data = dataY

    # Restrict to the requested sex once; both the totals and the name counts
    # come from this subset, so the year axis always belongs to that sex.
    of_sex = data[data['Sex'] == Sex]
    total_by_year = of_sex.groupby('Year')['Count'].sum()
    name_by_year = of_sex[of_sex['Name'] == Name].groupby('Year')['Count'].sum()

    if name_by_year.empty:
        raise KeyError(f"no records for name {Name!r} with sex {Sex!r}")

    # reindex fills the years without this name with 0 before dividing.
    shares = name_by_year.reindex(total_by_year.index, fill_value=0) / total_by_year
    return (shares / shares.sum()).to_numpy()
# Show the normalized per-year weights for the name 'Gavin' with sex 'M'.
Generate_year(Name='Gavin', Sex='M')
Year
1910 0.000000
1911 0.000000
1912 0.000000
1913 0.000000
1914 0.000000
...
2015 0.033014
2016 0.028173
2017 0.023703
2018 0.019722
2019 0.017107
Name: Count, Length: 110, dtype: float64
# Draw one Generate_year curve per (name, sex) pair on a shared figure.
Names = ('Helen', 'Richard', 'Mary')
Sexs = ('F', "M", "F")

plt.figure(figsize=(13, 6))
for name, sex in zip(Names, Sexs):
    plt.plot(Generate_year(name, sex), label=name)
plt.title('probability of birth year given name and sex')
plt.xlabel('Birth Year')
plt.legend()
plt.grid()
plt.show()