import pandas as pd
import os, fnmatch
import numpy as np
import matplotlib.pyplot as plt


# Preview of the national names file 'yob1995.txt'.
# header=None: the file carries no header row, so labels are set afterwards.
national_columns = ['Name', 'Sex', 'Count']
dataX = pd.read_csv('data/names/yob1995.txt', header=None)
dataX.columns = national_columns
dataX.head()


# Preview of a single per-state file (MO.TXT); dataX is rebound on purpose.
state_columns = ['State', 'Sex', 'Year', 'Name', 'Count']
dataX = pd.read_csv('data/namesbystate/MO.TXT', header=None)
dataX.columns = state_columns
dataX.head()


# Load every per-state '*.TXT' file and stack them into one table (dataY).
#
# The frames are collected in a list and concatenated once: calling
# pd.concat inside the loop copies all previously loaded rows again on
# every iteration, which is quadratic in the total number of rows.
state_dir = 'data/namesbystate/'
Files = os.listdir(os.path.abspath(state_dir))
state_frames = []
for files in fnmatch.filter(Files, '*.TXT'):
    dataX = pd.read_csv(os.path.join(state_dir, files), header=None)
    dataX.columns = ['State', 'Sex', 'Year', 'Name', 'Count']
    state_frames.append(dataX)

# pd.concat raises ValueError on an empty list; fall back to an empty frame
# so a directory without matching files still yields an empty dataY.
dataY = pd.concat(state_frames, axis=0) if state_frames else pd.DataFrame()


# Remove exact duplicate rows, then renumber the rows 0..n-1 (the
# concatenated frame otherwise keeps each file's own row numbers).
dataY = dataY.drop_duplicates()
dataY = dataY.reset_index(drop=True)


# Inspect the last rows of the combined table.
dataY.tail()


# Rows where the name "Sue" is recorded with Sex 'M' (at most 50 shown).
sue_males = dataY[dataY.Name.eq("Sue") & dataY.Sex.eq('M')]
sue_males.head(50)


# Summed Count for every (Sex, State, Year, Name) combination; the result
# is a Series with a four-level index in that order.
M_F_pops = dataY.groupby(['Sex', 'State', 'Year', 'Name'])['Count'].sum()
M_F_pops.head()

Sex  State  Year  Name   
F    AK     1910  Anna       10
                  Annie      12
                  Dorothy     5
                  Elsie       6
                  Helen       7
Name: Count, dtype: int64


# Name at position 427 among the ('M', 'MO', 1989) entries; groupby sorts
# its keys, so positions follow the sorted order of the names.
M_F_pops.loc[('M', 'MO', 1989)].index[427]

'Michael'


# Life table, one row per age. thousands=',' makes pandas parse numbers
# written with comma group separators as numeric values.
# NOTE(review): column meanings are taken from the labels below
# (death probability, survivor amount, life expectancy per sex) — confirm
# against the data source.
life_table_columns = [
    'Age',
    'M_death_prob', 'M_amount', 'M_LifeExpect',
    'F_death_prob', 'F_amount', 'F_LifeExpect',
]
dataL = pd.read_csv('data/Life_expectancy.csv', header=None, thousands=',')
dataL.columns = life_table_columns
dataL.tail(30)


# Survivor amount per age, scaled by 100000, for both sexes.
# NOTE(review): 100000 is presumably the life table's starting cohort size,
# which would make the curve a survival probability — confirm.
fig, ax = plt.subplots(figsize=(13, 6))
for column, label in (('M_amount', 'Male'), ('F_amount', 'Female')):
    ax.plot(dataL.Age, dataL[column] / 100000, label=label)

ax.set_title('probability of being alive at a certain age')
ax.set_xlabel('Age')
ax.legend()
ax.grid()
plt.show()


# Distinct values of the Year column, in order of first appearance.
Years = dataY['Year'].unique()
Years

array([1910, 1911, 1912, 1913, 1914, 1915, 1916, 1917, 1918, 1919, 1920,
       1921, 1922, 1923, 1924, 1925, 1926, 1927, 1928, 1929, 1930, 1931,
       1932, 1933, 1934, 1935, 1936, 1937, 1938, 1939, 1940, 1941, 1942,
       1943, 1944, 1945, 1946, 1947, 1948, 1949, 1950, 1951, 1952, 1953,
       1954, 1955, 1956, 1957, 1958, 1959, 1960, 1961, 1962, 1963, 1964,
       1965, 1966, 1967, 1968, 1969, 1970, 1971, 1972, 1973, 1974, 1975,
       1976, 1977, 1978, 1979, 1980, 1981, 1982, 1983, 1984, 1985, 1986,
       1987, 1988, 1989, 1990, 1991, 1992, 1993, 1994, 1995, 1996, 1997,
       1998, 1999, 2000, 2001, 2002, 2003, 2004, 2005, 2006, 2007, 2008,
       2009, 2010, 2011, 2012, 2013, 2014, 2015, 2016, 2017, 2018, 2019])


# For each birth year in Years: survivor amount at the age reached in
# REFERENCE_YEAR, scaled by SURVIVOR_BASE (one list per sex, same order as
# Years).
REFERENCE_YEAR = 2021   # year in which ages are measured (was a bare literal)
SURVIVOR_BASE = 100000  # NOTE(review): presumably the life table's cohort size — confirm

# Look rows up by their Age label rather than by row position. A positional
# lookup only works while row i holds age i, and a birth year later than
# REFERENCE_YEAR would produce a negative position that silently wraps
# around to the oldest ages; a label lookup raises KeyError instead.
survivors_by_age = dataL.set_index('Age')
ages_in_reference_year = [REFERENCE_YEAR - x for x in Years]
M_prob_alive = (survivors_by_age.loc[ages_in_reference_year, 'M_amount'] / SURVIVOR_BASE).tolist()
F_prob_alive = (survivors_by_age.loc[ages_in_reference_year, 'F_amount'] / SURVIVOR_BASE).tolist()


# M_prob_alive / F_prob_alive plotted against the birth years in Years.
fig, ax = plt.subplots(figsize=(13, 6))
for curve, label in ((M_prob_alive, 'Male'), (F_prob_alive, 'Female')):
    ax.plot(Years, curve, label=label)

ax.set_title('probability of being alive with a certain birth year, assuming uniform birth rate')
ax.set_xlabel('Birth Year')
ax.legend()
ax.grid()
plt.show()


# Summed Count per (Sex, Year) over all states and names.
Birth_year = dataY.groupby(['Sex', 'Year'])['Count'].sum()


Birth_year

Sex  Year
F    1910     352089
     1911     372376
     1912     504298
     1913     566973
     1914     696907
              ...   
M    2015    1676595
     2016    1653511
     2017    1604609
     2018    1568678
     2019    1538056
Name: Count, Length: 220, dtype: int64


# Yearly totals from Birth_year, one curve per sex (x axis is the Year index).
fig, ax = plt.subplots(figsize=(13, 6))
for sex, label in (('M', 'Male'), ('F', 'Female')):
    ax.plot(Birth_year[sex], label=label)

ax.set_title('SSN applications')
ax.set_xlabel('Year')
ax.legend()
ax.grid()
plt.show()


# Each sex's fraction of the combined (M + F) yearly total.
total_by_year = Birth_year['M'] + Birth_year['F']

fig, ax = plt.subplots(figsize=(13, 6))
for sex, label in (('M', 'Male'), ('F', 'Female')):
    ax.plot(Birth_year[sex] / total_by_year, label=label)

ax.set_title('SSN applications')
ax.set_xlabel('Year')
ax.legend()
ax.grid()
plt.show()


# Number of years with a female total in Birth_year.
len(Birth_year.loc["F"])

110


# Weight each year's total by the matching *_prob_alive entry, then scale so
# the weights of each sex sum to 1 (a distribution over birth years).
F_weighted = Birth_year["F"] * F_prob_alive
F_data = F_weighted / sum(F_weighted)

M_weighted = Birth_year["M"] * M_prob_alive
M_data = M_weighted / sum(M_weighted)


# Normalized per-sex birth-year weights (M_data, F_data).
fig, ax = plt.subplots(figsize=(13, 6))
for curve, label in ((M_data, 'Male'), (F_data, 'Female')):
    ax.plot(curve, label=label)

ax.set_title('probability of being alive with a certain birth year')
ax.set_xlabel('Birth Year')
ax.legend()
ax.grid()
plt.show()


def Generate_year(Name,Sex):
    """Return the normalized birth-year weights for a given name and sex.

    The per-year name shares from ``get_Name_data`` are multiplied by the
    module-level birth-year weights (``F_data`` when ``Sex`` is 'F',
    ``M_data`` for any other value) and rescaled to sum to 1.
    """
    year_weights = F_data if Sex == 'F' else M_data
    weighted = get_Name_data(Name, Sex) * year_weights
    return weighted / sum(weighted)


def get_Name_data(Name, Sex, data=None, years=None):
    """Return the normalized yearly share of records carrying a given name.

    For every year in ``years`` the share is the summed ``Count`` of
    ``Name`` for ``Sex`` divided by the summed ``Count`` of all names for
    ``Sex`` in that year; years without the name get 0. The shares are then
    scaled so that they sum to 1.

    Args:
        Name: name to look up (exact match against the ``Name`` column).
        Sex: value of the ``Sex`` column to restrict to (e.g. 'F' or 'M').
        data: table with ``Sex``, ``Year``, ``Name`` and ``Count`` columns.
            Defaults to the module-level ``dataY``.
        years: years to report, in output order. Defaults to
            ``M_data.index``.

    Returns:
        numpy.ndarray of floats, one entry per year in ``years``.

    Raises:
        KeyError: if ``Name`` has no records for ``Sex`` in any of ``years``
            (normalizing would otherwise divide by zero).
    """
    if data is None:
        data = dataY
    if years is None:
        years = M_data.index

    # Only the requested sex is needed, so filter before grouping instead of
    # grouping the whole table by (Sex, Year) on every call.
    same_sex = data[data.Sex == Sex]
    totals = same_sex.groupby('Year')['Count'].sum()
    named = same_sex[same_sex.Name == Name].groupby('Year')['Count'].sum()

    # Index-aligned division; reindex puts the result in the order of
    # `years`, and years missing the name (NaN) become a share of 0.
    shares = np.asarray((named / totals).reindex(years).fillna(0.0), dtype=float)

    norm = shares.sum()
    if norm == 0:
        raise KeyError(
            "no records for name %r with sex %r in the requested years"
            % (Name, Sex)
        )
    return shares / norm


# Example: birth-year weights for the male name 'Gavin'.
Generate_year(Name='Gavin', Sex='M')

Year
1910    0.000000
1911    0.000000
1912    0.000000
1913    0.000000
1914    0.000000
          ...   
2015    0.033014
2016    0.028173
2017    0.023703
2018    0.019722
2019    0.017107
Name: Count, Length: 110, dtype: float64


# Compare the birth-year weights of several (name, sex) pairs on one figure.
Names = ('Helen', 'Richard', 'Mary')
Sexs = ('F', "M", "F")
plt.figure(figsize=(13, 6))

# Names and Sexs are parallel tuples: entry i of each belongs together.
for name, sex in zip(Names, Sexs):
    data = Generate_year(name, sex)
    plt.plot(data, label=name)

plt.title('probability of birth year given name and sex')
plt.xlabel('Birth Year')
plt.legend()
plt.grid()
plt.show()

	Name	Sex	Count
0	Jessica	F	27938
1	Ashley	F	26602
2	Emily	F	24380
3	Samantha	F	21643
4	Sarah	F	21388

	State	Sex	Year	Name	Count
6122885	DE	M	2019	River	5
6122886	DE	M	2019	Rocco	5
6122887	DE	M	2019	Shane	5
6122888	DE	M	2019	Syncere	5
6122889	DE	M	2019	Yasir	5

	Age	M_death_prob	M_amount	M_LifeExpect	F_death_prob	F_amount	F_LifeExpect
90	90	0.163689	18303	4.08	0.129706	29685	4.85
91	91	0.181104	15307	3.78	0.144636	25835	4.50
92	92	0.199810	12535	3.50	0.160741	22098	4.18
93	93	0.219765	10030	3.25	0.177971	18546	3.88
94	94	0.240913	7826	3.03	0.196270	15245	3.61
95	95	0.261868	5941	2.83	0.214769	12253	3.37
96	96	0.282225	4385	2.66	0.233174	9622	3.16
97	97	0.301555	3147	2.51	0.251158	7378	2.96
98	98	0.319421	2198	2.37	0.268378	5525	2.79
99	99	0.335392	1496	2.25	0.284481	4042	2.63
100	100	0.352162	994	2.13	0.301550	2892	2.48
101	101	0.369770	644	2.02	0.319643	2020	2.33
102	102	0.388259	406	1.91	0.338821	1374	2.19
103	103	0.407672	248	1.81	0.359151	909	2.06
104	104	0.428055	147	1.71	0.380700	582	1.93
105	105	0.449458	84	1.61	0.403542	361	1.81
106	106	0.471931	46	1.52	0.427754	215	1.69
107	107	0.495527	24	1.43	0.453420	123	1.58
108	108	0.520304	12	1.35	0.480625	67	1.47
109	109	0.546319	6	1.27	0.509462	35	1.37
110	110	0.573635	3	1.19	0.540030	17	1.27
111	111	0.602317	1	1.11	0.572432	8	1.18
112	112	0.632432	0	1.04	0.606778	3	1.09
113	113	0.664054	0	0.97	0.643184	1	1.01
114	114	0.697257	0	0.91	0.681775	0	0.93
115	115	0.732119	0	0.84	0.722682	0	0.86
116	116	0.768725	0	0.78	0.766043	0	0.79
117	117	0.807162	0	0.73	0.807162	0	0.73
118	118	0.847520	0	0.67	0.847520	0	0.67
119	119	0.889896	0	0.62	0.889896	0	0.62

MAT411: Bayesian Data Analysis

Week 14

Names and Dates

Life expectancy table

	State	Sex	Year	Name	Count
0	MO	F	1910	Mary	611
1	MO	F	1910	Helen	313
2	MO	F	1910	Dorothy	270
3	MO	F	1910	Mildred	267
4	MO	F	1910	Ruth	237

	State	Sex	Year	Name	Count
1637340	WI	M	1989	Sue	5
3651429	MN	M	1988	Sue	5
4771445	CA	M	1983	Sue	5
4774849	CA	M	1985	Sue	7
4779197	CA	M	1987	Sue	6
4781400	CA	M	1988	Sue	6
4786621	CA	M	1990	Sue	5
4788574	CA	M	1991	Sue	7
4793665	CA	M	1993	Sue	7
4801811	CA	M	1996	Sue	5