#delimit ;

***********************************************************************************
this is the stata file that makes the answers for problem set 1

january 28, 2014
january 30, 2014
january 31, 2014
february 6, 2014
february 7, 2014
february 10, 2014 ** fixing prime age male to be males
february 18, 2014 ** fixing inflation correction
february 12, 2015 ** adding group average part

ps1_2015v01.do
***********************************************************************************;

* every command from here on must end with a semicolon because of the
  delimit setting on the first line of this file *;

***********************************************************************************
****** 1. ipums-usa data **********************************************************
***********************************************************************************;

******* A. get ready **************************************************************;

*** A.1. start up things ***;

clear all;
pause on;
set more off;

* set todays date;
* dateo is not a built-in command -- it is expected to come from the ado
  directory added just below. NOTE(review): presumably it sets the global
  named date that the output file names later in this file use -- confirm *;
adopath ++ /home/lfbrooks/home/bleah/ado;
dateo;

*** A.2. load data ***;

* switch for which sample we use -- exactly one of the two local lines
  should be active *;
*local sample big;
local sample small;

* load the big sample *;
if "`sample'" == "big" {;

    * 1950 *;
    * program that creates these data is /home/lfbrooks/home/bleah/pppa6022/2014/stataprg/problem_set_1/load1950v01.do;
    use /groups/brooksgrp/pppa6022/2014/1950_census_data/20140128_ipums_1950;

    * 2010 *;
    * program that creates these data is /home/lfbrooks/home/bleah/pppa6022/2014/stataprg/problem_set_1/load2010v01.do;
    append using /groups/brooksgrp/pppa6022/2014/2010_census_data/20140128_ipums_2010;

    * code making the smaller sample *;
    * take a smaller random sample *;
    *gen double rand_samp = runiform();
    *keep if rand_samp > 0.9;
    * save this as a junk dataset to load *;
    *save /groups/brooksgrp/pppa6022/2015/small_census/ipumscen, replace;

};

* load the small sample *;
if "`sample'" == "small" {;
    use /groups/brooksgrp/pppa6022/2015/small_census/ipumscen;
};

***** B. 1(a) mean male wages *************************************************;

*** B.1.
marker for prime-age males ***;

* flag observations with sex == 1 and age from 25 up to but not including 65 *;
gen prime_age_male = 0;
replace prime_age_male = 1 if (age >= 25 & age < 65) & sex == 1;

*** B.2. re-code top coded wages ***;
* the value 999999 is set to missing in both census years *;
replace incwage = . if year == 1950 & incwage == 999999;
replace incwage = . if year == 2010 & incwage == 999999;

*** B.3. find average wages by year ***;

* NOTE(review): B.3 is disabled by the block comment below, so this run does
  not write the nominal-wage table that D.3 later appends *;

/*

* saves the data for later use *;
preserve;

* make a small table with mean and sd of mean of wage *;
sort year prime_age_male;
collapse (mean) incwage=incwage (sem) incwage_se=incwage (count) inc_obs=incwage [aweight=perwt], by(year prime_age_male);

* keep only obs of interest *;
keep if prime_age_male == 1;

* make marker for wage type *;
gen wage_type = "nominal";
list;

* save it *;
save /home/lfbrooks/pppa6022/2015/stataout/problem_set_1/tempavwages_`sample', replace;

****** C. 1(b): on weighting *******************************************;

restore;

*/

* stash the full person-level file so that section D can get back to it.
  the preserve that used to do this sits inside the disabled B.3 block above,
  and a second preserve cannot be wrapped around the preserve used further
  down, so a temporary file is used instead *;
tempfile fulldata;
save `fulldata';

* keep only non-missing wage obs *;
keep if incwage != .;

* keep only prime age men *;
keep if prime_age_male == 1;

* do original wtd average w/o stata weights *;
* each wage is scaled by its share of the yearly weight total times the
  yearly count, so the plain mean of incwage_bit is the weighted mean *;
sort year;
egen year_obs = count(incwage), by(year);
egen perwt_sum = total(perwt), by(year);
gen perwt_wt = perwt/perwt_sum;
gen perwt_wt_byn = perwt_wt * year_obs;
gen incwage_bit = perwt_wt_byn * incwage;
by year: summ incwage_bit;

* original weighted average *;
sort year;
by year: summ incwage [aweight=perwt];

* group weighted averages by stata: wages by education and year *;
preserve;
sort year educ;
collapse (mean) incwage=incwage (count) n_educ=incwage [aweight=perwt], by(year educ);
save /home/lfbrooks/pppa6022/2015/stataout/problem_set_1/group_mn_stata, replace;
list;

* group weighted averages by me *;
restore;
sort year educ;
collapse (mean) incwage_bit (count) n_educ=incwage, by(year educ);
save /home/lfbrooks/pppa6022/2015/stataout/problem_set_1/group_mn_me, replace;
list;

* final average with my weights *;
* after the collapse just above only year, educ, incwage_bit and n_educ are
  left in memory, so the group mean to combine is incwage_bit *;
egen year_obs = total(n_educ), by(year);
gen grp_share = n_educ/year_obs;
summ grp_share;
gen av_input = grp_share*incwage_bit;
egen totav = total(av_input), by(year);
list;

***** D. make wages real *******************************************************;

*** D.1. get back to big dataset and find cpi stuff ***;

* reload the full file stashed before the subsetting in section C *;
use `fulldata', clear;

* values from the BLS are as follows:
    24.98   for 1950
    220.437 for 2010
    234.594 for 2013 *;

* re-scale so 2013 is 1:
    0.1065 for 1950
    0.9396 for 2010
    1      for 2013 *;

* this means that 0.1065 dollars in 1950 is 1 dollar in 2013 *;

* so how much is 1 dollar of 1950 in 2013?
    0.1065 / 1 = 1 / x
    x = 1 / 0.1065 = 9.39
  and in the same way 1 / 0.9396 = 1.064 for 2010 *;

* fix wages. the factors are computed from the CPI values listed above
  instead of being typed in by hand -- the previous hand-typed 2010 factor of
  1.067 did not match 234.594/220.437 = 1.064 *;
local cpi1950 24.98;
local cpi2010 220.437;
local cpi2013 234.594;
gen rl_wage = incwage * (`cpi2013'/`cpi1950') if year == 1950;
replace rl_wage = incwage * (`cpi2013'/`cpi2010') if year == 2010;

*** D.2. 1(c): re-do the average wages by year for real wages ***;

* this preserve is closed by the restore at the start of E.1 *;
preserve;

sort year prime_age_male;
collapse (mean) incwage=rl_wage (sem) incwage_se=rl_wage (count) inc_obs=incwage [aweight=perwt], by(year prime_age_male);

* keep only obs of interest *;
keep if prime_age_male == 1;

* make marker for wage type *;
gen wage_type = "real";

*** D.3. make a table ***;

* add the nominal wage dataset from b *;
* NOTE(review): the file appended here is not written anywhere in the active
  code. the disabled B.3 block saves to a different path (tempavwages_ plus
  the sample name under the 2015 stataout directory) -- confirm which file is
  intended *;
append using /lustre/groups/brooksgrp/tempavwages;

* make the dataset wide so I can do t-tests *;
reshape wide incwage incwage_se inc_obs, i(wage_type) j(year);

* make the t values. the two years are separate samples, so the standard
  error of the difference is the square root of the sum of the squared
  standard errors, not the sum of the standard errors *;
gen t = abs((incwage2010-incwage1950)/sqrt(incwage_se2010^2+incwage_se1950^2));

* output the dataset *;
outsheet using /home/lfbrooks/pppa6022/2014/stataout/problem_set_1/${date}_prob1_ttests_`sample'.txt, replace;
list;

***** E. 1(d) and 1(e) husband vs wife wages ****************************************;

*** E.1.
set up data for regressions ***;

* go back to the person-level data that was preserved before the D.2 collapse *;
restore;

* subset to married couples *;
* keep if married, spouse present *;
keep if marst == 1;

* keep if head of household or spouse *;
keep if relate == 1 | relate == 2;

* make male dummy *;
gen male = 0;
replace male = 1 if sex == 1;

* make 2010 dummy *;
gen d2010 = 0;
replace d2010 = 1 if year == 2010;

* make 2010*male *;
gen d2010_male = d2010*male;

* keep only 65 or younger (and 25 or older) *;
* NOTE(review): the upper bound is inclusive here while the prime_age_male
  flag in B.1 stops at 64 -- confirm this difference is intended *;
keep if age <= 65 & age >= 25;

* make age squared, cubed, to the fourth power *;
gen age2 = age*age;
gen age3 = age2*age;
gen age4 = age3*age;

* make a hh serial number by year (thanks, Drew! for catching this) *;
* the id is stored as a double: its values are close to 2 billion, far above
  the 16,777,216 limit up to which the default float type holds integers
  exactly, so a float id would round neighbouring households onto the same
  value and absorb() would pool them *;
sort year;
by year: summ serial;
gen double serialyr = 1950*1000000+serial if year == 1950;
replace serialyr = 2010*1000000+serial if year == 2010;

* check this sample *;
summ age male incwage;

*** E.2. regressions ***;

*** 1(d) ***;

* estimate wages as a function of age, year, and being the husband *;
regress rl_wage age male d2010 [aweight=perwt];
eststo c1;

* re-estimate with a variety of sensible covariates *;
xi: regress rl_wage age male d2010 i.educ i.race i.metro [aweight=perwt];
eststo c2;

* re-estimate with covariates and family fixed effects *;
xi: areg rl_wage age male i.educ i.race i.metro [aweight=perwt], absorb(serialyr);
eststo c3;

* re-estimate, allowing the main effect to vary *;
xi: areg rl_wage age male d2010_male i.educ i.race i.metro [aweight=perwt], absorb(serialyr);
eststo c4;

*** 1(e): different treatments of age ***;

* parametric non-linear effects of age *;
xi: areg rl_wage age age2 age3 age4 male d2010 d2010_male i.educ i.race i.metro [aweight=perwt], absorb(serialyr);
eststo c5;

* non-parametric non-linear effects of age *;
xi: areg rl_wage i.age male d2010 d2010_male i.educ i.race i.metro [aweight=perwt], absorb(serialyr);
eststo c6;

*** output results ***;
* the star after estout selects every stored estimate (c1 through c6) *;
estout * using "/home/lfbrooks/home/bleah/pppa6022/2014/stataout/problem_set_1/${date}_prob1_regs_`sample'.txt",
    replace
    varwidth(12)
    varlabels(_cons Constant)
    cells(b(star fmt(%12.1f)) se(par fmt(%12.1f)))
    stats(r2 N, fmt(%9.3f %9.0g %9.3f) labels("R-squared" "Obs"));

***********************************************************************************
****** 2. ipums-cps data **********************************************************
***********************************************************************************;

****** A. prep stuff **********************************************************;

* NOTE: all of section 2 is wrapped in the block comment that opens just
  below and is only closed at the very end of the file, so none of it runs *;

/*

clear all;

* set todays date;
adopath ++ /home/lfbrooks/home/bleah/ado;
dateo;

****** B. load data ***********************************************************;

* switch for which sample we use *;
*local sample big;
local sample small;

* load the big sample *;
if "`sample'" == "big" {;

    * load cps data, keeping only variables that are of interest *;
    use /groups/brooksgrp/containerization/current_population_survey/input_201401/20140107_cpsinput;

    * take a smaller random sample *;
    *gen double rand_samp = runiform();
    *keep if rand_samp > 0.9;
    * save this as a junk dataset to load *;
    *save /lustre/groups/brooksgrp/ipumscps, replace;

};

if "`sample'" == "small" {;
    * bring in the smaller data *;
    use /lustre/groups/brooksgrp/ipumscps;
};

****** C. set up data *********************************************************;

** fix topcodes **;
* not in universe *;
replace incwage = . if incwage == 9999999;
* missing *;
replace incwage = . if incwage == 9999998;
* top-coded -- starts in 2011..
i think at 1,000,000 *;
replace incwage = 1000000 if incwage == 9999997;

* NOTE(review): everything from here to the closing comment marker on the
  last line of the file sits inside the block comment opened in section 2.A,
  so none of it runs *;

* marker for employed guys *;
gen emp = 0;
replace emp = 1 if empstat == 10 | empstat == 12 | empstat ==13;

** make fake treatment **;
* make a list of all treated states *;
local trts 26 6 4 35 27 39 51 21 54 29 28 13 19 33 25 23;

* make a treatment variable *;
gen treat=0;

* fix it for the affected states *;
* the label command sits inside the loop, so the same label is re-applied on
  every pass *;
foreach j in `trts' {;
    replace treat=1 if statefip == `j';
    label variable treat "1 if ever in a treated state";
};

* make a linear trend *;
gen trend = year - 1979;

* make an after variable *;
gen after=0;
replace after=1 if year >= 2000;

* make a trend*treatment interaction *;
gen trend_treat = 0;
replace trend_treat = trend * treat * after;
gen trend_treat_before = 0;
replace trend_treat_before = trend if treat == 1;

* make a treated*after variable *;
gen treat_after = treat*after;
label variable treat_after "1 if after the treatment in a treated state";

* make a treated*after*male variable *;
gen male = 0;
replace male = 1 if sex==1;
label variable male "1 if male (from sex)";
gen treat_after_male = treat*after*male;
label variable treat_after_male "1 if after the treatment in a treated state and obs is male";

* make other interactions that we need *;
gen male_after = male*after;
gen male_treat = male*treat;

****** C. summary tables ******************************************************;

* NOTE(review): this is the second heading lettered C in section 2 -- the
  data set-up heading before it is also C *;

*** C.1. diff-in-diff ***;

preserve;
sort treat after;
collapse (mean) incwage (sem) incwage_sd = incwage (count) incwage_obs=incwage if emp == 1, by(treat after);

* create the variance of income *;
* incwage_sd holds the (sem) statistic from the collapse above, so this is
  that statistic squared *;
gen incwage_var = incwage_sd*incwage_sd;

* output to excel *;
outsheet using /home/lfbrooks/home/bleah/pppa6022/2014/stataout/problem_set_1/${date}_prob2c_ttests_`sample'.txt, replace;

*** C.2. triple diff ***;

restore;
preserve;
sort treat after male;
collapse (mean) incwage (sem) incwage_sd = incwage (count) incwage_obs=incwage if emp == 1, by(treat after male);

* create the variance of income *;
gen incwage_var = incwage_sd*incwage_sd;

* output to excel *;
outsheet using /home/lfbrooks/home/bleah/pppa6022/2014/stataout/problem_set_1/${date}_prob2d_ttests_`sample'.txt, replace;

****** D. pictures ************************************************************;

* keep the data around for later *;
restore;
preserve;

* command for making graphs i like *;
graph set eps orientation landscape;

*** D.1. set up data for the picture ***;

* get annual mean wages for treated and untreated states, and also the std error *;
sort year treat;
collapse (mean) incwage (sem) incwage_sd = incwage if emp == 1 & year < 2000, by(year treat);

* reshape to make it wider *;
reshape wide incwage incwage_sd, i(year) j(treat);

* calculate confidence bands *;
* bands are the mean plus or minus 1.96 times the (sem) statistic *;
foreach j in 0 1 {;
    gen cihi`j' = incwage`j' + 1.96*incwage_sd`j';
    label variable cihi`j' "upper confidence bound for incwage when treatment = `j'";
    gen cilo`j' = incwage`j' - 1.96*incwage_sd`j';
    label variable cilo`j' "lower confidence bound for incwage when treatment = `j'";
};

* check this *;
list;

*** D.2. make the average wage over time picture ***;

graph twoway
    (rarea cilo1 cihi1 year, bcolor(gs14))
    (rarea cilo0 cihi0 year, bcolor(gs14))
    (connected incwage0 year, msymbol(o))
    (connected incwage1 year, msymbol(o)),
    ytitle("annual wage")
    legend(off)
    xsize(11) ysize(8.5);
graph export "/home/lfbrooks/home/bleah/pppa6022/2014/statafig/problem_set_1/${date}_mean_wage_wandwo_treatment_`sample'.eps", replace;

****** E. regressions *********************************************************;

* bring back the data *;
restore;

*** E.1. test the parallel pre-trend assumption with a regression (2(a)) ***;

* could have a linear trend, and an interaction term for the treated guys *;
xi: regress incwage trend trend_treat_before i.age i.race i.statefip if emp == 1 & year < 2000;
eststo t1;

* could test whether year effects are equal before *;
xi: regress incwage i.year*treat i.race i.age i.statefip if emp == 1 & year < 2000;

* the loop chains the interaction terms for 1963 through 1999 together with
  equals signs, and the test line then sets the whole chain equal to zero *;
local testvals _IyeaXtre_1963;
forvalues y=1964/1999 {;
    local testvals `testvals' = _IyeaXtre_`y';
};
test `testvals' = 0;
eststo t2;

*** E.2. diff-in-diff regression *****;

xi: regress incwage treat_after i.age i.race i.statefip i.year if emp == 1;
eststo r1;

*** E.3. diff-in-diff-in-diff regression ***;

xi: regress incwage treat_after_male treat_after male_after male_treat i.age i.race i.statefip i.year if emp == 1;
eststo r2;

*** E.4. shrink T to 2, re-do regression from (c) ***;

* make a state/year level dataset *;
* omit years before 1976, b/c some of them have funny states *;
* NOTE(review): the condition below is year > 1976, which also drops 1976
  itself -- confirm against the comment above *;
* this collapse replaces the data in memory and there is no preserve before it *;
sort after statefip;
collapse incwage treat treat_after if emp==1 & year > 1976, by(after statefip);
xi: areg incwage treat after treat_after, absorb(statefip);
eststo r3;

*** E.5. output results ***;

estout * using "/home/lfbrooks/home/bleah/pppa6022/2014/stataout/problem_set_1/${date}_prob2_regs_`sample'.txt",
    replace
    varwidth(12)
    varlabels(_cons Constant)
    cells(b(star fmt(%12.1f)) se(par fmt(%12.1f)))
    stats(r2 N, fmt(%9.3f %9.0g %9.3f) labels("R-squared" "Observations"));

*/