#delimit ;

***********************************************************************************
this is the stata file that makes the answers for problem set 2

february 18, 2014
february 19, 2014
february 20, 2014
february 21, 2014
march 16, 2014
march 19, 2014
march 24, 2014

ps2v07.do

layout
  section 1 (hazard model on nlsy data) is disabled inside a block comment
  section 2 (ipums-census first stage and second stage regressions) is live

external commands used below that are not built in
  dateo   found on the ado path added below and presumably sets the global date
  estout and estadd   user-written table commands
***********************************************************************************;

/*
***********************************************************************************
****** 1. hazard model: nlsy ******************************************************
***********************************************************************************;

******* A. get ready **************************************************************;

*** A.1. start up things ***;

clear all;
pause on;
set more off;

* set todays date;
adopath ++ /home/lfbrooks/home/bleah/ado;
dateo;

*** A.2. load data ***;

use /groups/brooksgrp/national_longitudinal_survey_youth/pppa6022_2014spring/nlsy_to_stata_20140220;

* older version of data *;
*saveold /groups/brooksgrp/national_longitudinal_survey_youth/pppa6022_2014spring/nlsy_to_stata_20140220_old;

***** B. set up for hazard model **************************************************;

*** B.1. clean up data ***;

* re-code urban-rural to be missing if eq 2 *;
replace urban_rural = . if urban_rural == 2;
tab urban_rural;

* get rid of some odd coding with weight *;
replace weight = . if weight == 996;
summ weight, detail;

* make male dummy variable *;
gen male=0;
replace male=1 if gender == 1;

* make a time that starts at zero *;
tab year;
gen cox_time = year - 1978;
tab cox_time;

gen any_kids = 0;
* need the non-missing part, otherwise things get screwed up *;
replace any_kids = 1 if kids >= 1 & kids <= 11;
tab any_kids;
tab year any_kids;

* make a marker for being the last year of no kids *;
gen no_kids_death = 0;
sort id;
egen last_obs = min(year) if any_kids==1, by(id);
egen ever_kids = max(any_kids), by(id);
replace no_kids_death = 1 if year == last_obs & any_kids == 1;
tab year no_kids_death;
tab year ever_kids;
tab no_kids_death;

*** B.1. find when kids appear, and drop after ***;

* drop if its after kids *;
* drop if ever_kids == 1 & year > last_obs;

*** B.2. set up for hazard model ***;

* lets stata know this is hazard model data *;
stset cox_time, failure(no_kids_death) id(id);

****** C. summary stats *************************************************************;

* 1(a) *;
* share of 1979 population that ever has kids *;
summ ever_kids if year == 1979;
* note that you could get the wrong share (12%) by doing this *;
summ kids if year == 1979;

* share of 1979 population that has kids *;
summ any_kids if year == 1979;

* share that have kids by 1990 *;
summ any_kids if year == 1990;

* share with no kids in 2000, with kids in 2002 *;
gen k2000 = any_kids if year == 2000;
egen any_kids_2000 = max(k2000), by(id);
tab any_kids_2000;
summ any_kids if year == 2002 & any_kids_2000 == 0;

* of those with no kids in 2000, but with kids in 2002, what's the gender breakdown? *;
tab gender if any_kids == 1 & any_kids_2000 == 0 & year ==2002;

* ever kids by urban_rural *;
sort urban_rural;
by urban_rural: summ ever_kids if year == 1979;

****** D. pictures ******************************************************************;

* overall survival function *;
sts graph, xsize(11) ysize(8.5)
  xtitle("years from 1978")
  ytitle("share with no kids");
graph export "/home/lfbrooks/home/bleah/pppa6022/2014/statafig/problem_set_2/${date}_survival_function_overall.eps", replace;

* survival function by urban/rural *;
sts graph, by(urban_rural) xsize(11) ysize(8.5)
  legend(label(1 "Rural") label(2 "Urban"))
  xtitle("years from 1978")
  ytitle("share with no kids");
graph export "/home/lfbrooks/home/bleah/pppa6022/2014/statafig/problem_set_2/${date}_survival_function_urban_rural.eps", replace;

****** E. regressions ***************************************************************;

stcox weight urban_rural male;
eststo e1;

* output results *;
estout * using "/home/lfbrooks/home/bleah/pppa6022/2014/stataout/problem_set_2/${date}_cox_reg.txt",
  replace eform varwidth(12) varlabels(_cons Constant)
  cells(b(star fmt(%12.3f)) se(par fmt(%12.4f)))
  stats(r2 N, fmt(%9.3f %9.0g %9.3f) labels("R-squared" "Obs"));
*/

***********************************************************************************
****** 2. ipums-census data *******************************************************
***********************************************************************************;

****** A. prep stuff **********************************************************;

clear all;
pause on;
set more off;

* set todays date;
* the output file name below uses the global date, presumably set by dateo *;
adopath ++ /home/lfbrooks/home/bleah/ado;
dateo;

****** B. load data ***********************************************************;

* switch for which sample we use *;
*local sample big;
local sample small;

* load the big sample *;
if "`sample'" == "big" {;

  * load the full 1980 census ipums extract *;
  use /groups/brooksgrp/census/1980census/ipums/pppa6022_spring2014/c1980_ipums_20140220;

  * take a smaller random sample *;
  *gen double rand_samp = runiform();
  *keep if rand_samp > 0.9;

  * save this as a junk dataset to load *;
  *save /lustre/groups/brooksgrp/ipumscen80, replace;

};

if "`sample'" == "small" {;

  * bring in the smaller data *;
  use /lustre/groups/brooksgrp/ipumscen80;

};

* make old version *;
*saveold /lustre/groups/brooksgrp/ipumscen80_old;

****** C. set up data *********************************************************;

** keep as in A and K **;
* keeps birth years 1930 through 1959 *;
drop if birthyr > 1959;
drop if birthyr < 1930;

* drop women *;
drop if sex == 2;

tab educ;
tab educd;

* make birth decade markers *;
* birth years 1950 to 1959 stay in the data with b_decade equal to 0 *;
gen b_decade = 0;
replace b_decade=1930 if birthyr >= 1930 & birthyr < 1940;
replace b_decade=1940 if birthyr >= 1940 & birthyr < 1950;

* make in smsa marker *;
gen in_smsa = 0;
replace in_smsa = 1 if metro == 2| metro == 3| metro == 4;

* make marital status indicator *;
gen married = 0;
replace married = 1 if marst == 1| marst == 2;

* make quarterly dummies to match a & k's table *;
tab birthq, gen(bqdum);

* make birth year dummies for 1930s *;
* these dummies are missing for anyone outside the 1930s cohort *;
tab birthyr if b_decade == 1930, gen(bydum);

* make log of wages *;
* this is going to effectively drop people with zero wages *;
* this is quite a few observations *;
gen ln_incwage = ln(incwage);

* make interaction of quarter*birth year *;
* eqdum collects the interaction names, each followed by an equals sign, *;
* so that the joint test in section E can be written as one expression *;
local eqdum "";
forvalues q=1/3 {;
  forvalues y=1/9 {;
    gen bdum_q`q'y`y' = bqdum`q'*bydum`y';
    local eqdum "`eqdum' bdum_q`q'y`y' =";
  };
};

* make age squared *;
gen age2 = age*age;

** make education into a continuous variable **;
* NOTE(review): any educ code outside 0 to 11 is left at 0 years rather than missing -- confirm *;
gen yrs_educ = 0;
replace yrs_educ = 0  if educ == 0;
replace yrs_educ = 4  if educ == 1;
replace yrs_educ = 8  if educ == 2;
replace yrs_educ = 9  if educ == 3;
replace yrs_educ = 10 if educ == 4;
replace yrs_educ = 11 if educ == 5;
replace yrs_educ = 12 if educ == 6;
replace yrs_educ = 13 if educ == 7;
replace yrs_educ = 14 if educ == 8;
replace yrs_educ = 15 if educ == 9;
replace yrs_educ = 16 if educ == 10;
replace yrs_educ = 17 if educ == 11;

** other sample limits **;
* NOTE(review): reg_keep is created here but not used by any command below *;
gen reg_keep = 1;
replace reg_keep = 0 if incwage < 0;
* keep only positive weeks worked and positive wage and salary -- but i didnt get weeks worked *;

****** D. Table 1, first two rows ***************************************************;

sort b_decade;
by b_decade: summ yrs_educ;

* 1930s cohort: education on quarter of birth, with a joint F test of the quarter dummies *;
regress yrs_educ bqdum1-bqdum3 if b_decade == 1930;
test bqdum1=bqdum2=bqdum3=0;
estadd scalar f_qs_eq = r(F);
estadd scalar p_qs_eq = r(p);
estimates store t1a;

* 1940s cohort: same specification *;
regress yrs_educ bqdum1-bqdum3 if b_decade == 1940;
test bqdum1=bqdum2=bqdum3=0;
estadd scalar f_qs_eq = r(F);
estadd scalar p_qs_eq = r(p);
estimates store t1b;

****** E. first stage ***************************************************************;

* set covariates *;
local covs "in_smsa married i.region age age2";

* quarters only *;
* fs1nbq is the covariates-only regression, fs1 adds the quarter of birth dummies *;
xi: regress yrs_educ `covs' if b_decade == 1930 & ln_incwage != .;
estimates store fs1nbq;

xi: regress yrs_educ `covs' bqdum1-bqdum3 if b_decade == 1930 & ln_incwage != .;
* joint test that all three instruments are zero, matching the tests in section D *;
test bqdum1=bqdum2=bqdum3=0;
estadd scalar f_qs_eq = r(F);
estadd scalar p_qs_eq = r(p);
* fitted education, used in the by-hand second stage ss1a *;
predict xhat1, xb;
estimates store fs1;

* quarters * year of birth *;
* fs2nbq repeats the covariates-only regression, fs2 adds the quarter*year interactions *;
xi: regress yrs_educ `covs' if b_decade == 1930 & ln_incwage != .;
estimates store fs2nbq;

xi: regress yrs_educ `covs' bdum_q*y* if b_decade == 1930 & ln_incwage != .;
* fitted education, used in the by-hand second stage ss2a *;
predict xhat2, xb;
test `eqdum'=0;
estadd scalar f_qs_eq = r(F);
estadd scalar p_qs_eq = r(p);
estimates store fs2;

****** F. second stage **************************************************************;

* just quarters as instruments *;
* ss1a is the by-hand second stage on fitted education, ss1b is the ivregress equivalent *;
regress ln_incwage `covs' xhat1 if b_decade == 1930 & ln_incwage != .;
estimates store ss1a;

ivregress 2sls ln_incwage `covs' (yrs_educ=bqdum1-bqdum3) if b_decade == 1930 & ln_incwage != .;
estimates store ss1b;

* quarters*years of birth *;
* uses xhat2 and the interaction instruments from first stage fs2 *;
* the earlier version reused xhat1 and bqdum1-bqdum3, so ss2a and ss2b duplicated ss1a and ss1b *;
regress ln_incwage `covs' xhat2 if b_decade == 1930 & ln_incwage != .;
estimates store ss2a;

ivregress 2sls ln_incwage `covs' (yrs_educ=bdum_q*y*) if b_decade == 1930 & ln_incwage != .;
estimates store ss2b;

****** G. output results **************************************************************;

* write every stored estimate to one text table, then clear the stored estimates *;
estout * using "/home/lfbrooks/home/bleah/pppa6022/2014/stataout/problem_set_2/${date}_prob2_ivregs.txt",
  replace varwidth(12) varlabels(_cons Constant)
  cells(b(star fmt(%12.3f)) se(par fmt(%12.3f)))
  stats(r2 N f_qs_eq p_qs_eq,
        fmt(%9.3f %9.0g %9.3f %9.3f)
        labels("R-squared" "Observations" "F test: instruments" "p-value of F test"));

estimates clear;