# delimit;

***********************************************************************************

this is the stata file that makes the answers for problem set 2

february 18, 2014 
february 19, 2014
february 20, 2014
february 21, 2014
march 16, 2014
march 19, 2014
march 24, 2014

ps2v07.do

***********************************************************************************;

/*
***********************************************************************************

****** 1. hazard model: nlsy ******************************************************

***********************************************************************************;


******* A. get ready **************************************************************;

*** A.1. start up things ***;

clear all;
pause on;
set more off;

* set todays date;
adopath ++ /home/lfbrooks/home/bleah/ado;
dateo;

*** A.2. load data ***;

use /groups/brooksgrp/national_longitudinal_survey_youth/pppa6022_2014spring/nlsy_to_stata_20140220;

* older version of data *;
*saveold /groups/brooksgrp/national_longitudinal_survey_youth/pppa6022_2014spring/nlsy_to_stata_20140220_old;


***** B. set up for hazard model **************************************************;

*** B.1. clean up data ***;

* re-code urban-rural to be missing if eq 2 *;
replace urban_rural = . if urban_rural == 2;
tab urban_rural;

* get rid of some odd coding with weight *;
replace weight = . if weight == 996;
summ weight, detail;

* make male dummy variable *;
gen male=0;
replace male=1 if gender == 1;

* make a time that starts at zero *;
tab year;
gen cox_time = year - 1978;
tab cox_time;

gen any_kids = 0;
* need the non-missing part, otherwise things get screwed up *;
replace any_kids = 1 if kids >= 1 & kids <= 11;
tab any_kids;
tab year any_kids;

* make a marker for being the last year of no kids *;
gen no_kids_death = 0;
sort id;
egen last_obs = min(year) if any_kids==1, by(id);
egen ever_kids = max(any_kids), by(id);
replace no_kids_death = 1 if year == last_obs & any_kids == 1;
tab year no_kids_death;
tab year ever_kids;
tab no_kids_death;

*** B.1. find when kids appear, and drop after ***;

* drop if its after kids *;
* drop if ever_kids == 1 & year > last_obs;

*** B.2. set up for hazard model ***;

* lets stata know this is hazard model data *;
stset cox_time, failure(no_kids_death) id(id);

****** C. summary stats *************************************************************;

* 1(a) *;
* share of 1979 population that ever has kids *;
summ ever_kids if year == 1979;

* note that you could get the wrong share (12%) by doing this *;
summ kids if year == 1979;

* share of 1979 population that has kids *;
summ any_kids if year == 1979;

* share that have kids by 1990 *;
summ any_kids if year == 1990;

* share with no kids in 2000, with kids in 2002 *;
gen k2000 = any_kids if year == 2000;
egen any_kids_2000 = max(k2000), by(id);
tab any_kids_2000;
summ any_kids if year == 2002 & any_kids_2000 == 0;

* of those with no kids in 2000, but with kids in 2002, what's the gender breakdown? *;
tab gender if any_kids == 1 & any_kids_2000 == 0 & year ==2002;

* ever kids by urban_rural *;
sort urban_rural;
by urban_rural: summ ever_kids if year == 1979;

****** D. pictures ******************************************************************;

* overall survival function *;
sts graph, 
  xsize(11) ysize(8.5)
  xtitle("years from 1978")
  ytitle("share with no kids");
graph export "/home/lfbrooks/home/bleah/pppa6022/2014/statafig/problem_set_2/${date}_survival_function_overall.eps", replace;

* survival function by urban/rural *;
sts graph, by(urban_rural)
  xsize(11) ysize(8.5)
  legend(label(1 "Rural") label(2 "Urban"))
  xtitle("years from 1978")
  ytitle("share with no kids");
graph export "/home/lfbrooks/home/bleah/pppa6022/2014/statafig/problem_set_2/${date}_survival_function_urban_rural.eps", replace;


****** E. regressions ***************************************************************;

stcox weight urban_rural male;
eststo e1;

* output results *;
estout *
  using "/home/lfbrooks/home/bleah/pppa6022/2014/stataout/problem_set_2/${date}_cox_reg.txt",
  replace
  eform
  varwidth(12) varlabels(_cons Constant)
  cells(b(star fmt(%12.3f)) se(par fmt(%12.4f)))
  stats(r2 N, fmt(%9.3f %9.0g %9.3f) labels("R-squared" "Obs"));
*/


***********************************************************************************

****** 2. ipums-census data *******************************************************

***********************************************************************************;

****** A. prep stuff **********************************************************;

* start from an empty session *;
clear all;

* do not stop for -more- prompts, and let any pause commands take effect *;
set more off;
pause on;

* put the personal ado directory at the front of the search path, then run dateo *;
* NOTE(review): dateo is not defined in this file -- presumably it sets the date global used in the output file names, confirm *;
adopath ++ /home/lfbrooks/home/bleah/ado;
dateo;


****** B. load data ***********************************************************;

* switch for which sample we use: leave exactly one of the two lines below active *;
*local sample big;
local sample small;

* big sample: the full 1980 census ipums extract *;
if "`sample'" == "big" {;

  use /groups/brooksgrp/census/1980census/ipums/pppa6022_spring2014/c1980_ipums_20140220;

  * disabled steps that draw a random subsample of the extract *;
  *gen double rand_samp = runiform();
  *keep if rand_samp > 0.9;

  * disabled step that saves the subsample to the path the small branch loads *;
  *save /lustre/groups/brooksgrp/ipumscen80, replace;
};

* small sample: the previously saved subsample *;
if "`sample'" == "small" {;
  use /lustre/groups/brooksgrp/ipumscen80;
};

* make old version *;
*saveold /lustre/groups/brooksgrp/ipumscen80_old;


****** C. set up data *********************************************************;

** sample limits as in A and K: keep birth years 1930 through 1959 **;
* a missing birthyr fails the first comparison and is dropped there *;
keep if birthyr <= 1959;
keep if birthyr >= 1930;
* drop women *;
keep if sex != 2;

* look at the two education codings *;
tab educ;
tab educd;

* birth decade marker: 1930 or 1940, and 0 for everyone else *;
* NOTE(review): birth years 1950 to 1959 are kept above but get b_decade 0 here -- confirm this is intended *;
gen b_decade = cond(birthyr >= 1930 & birthyr < 1940, 1930,
               cond(birthyr >= 1940 & birthyr < 1950, 1940, 0));

* in smsa marker: 1 when metro is 2, 3 or 4, otherwise 0 *;
gen in_smsa = inlist(metro, 2, 3, 4);

* marital status indicator: 1 when marst is 1 or 2, otherwise 0 *;
gen married = inlist(marst, 1, 2);

* quarter of birth dummies bqdum1, bqdum2, ... to match the a and k table *;
tab birthq, gen(bqdum);

* birth year dummies bydum1, bydum2, ... built from the 1930s cohort only *;
tab birthyr if b_decade == 1930, gen(bydum);

* log of wages *;
* zero wages have no log, so those people become missing and fall out of the wage regressions *;
* this is quite a few observations *;
gen ln_incwage = ln(incwage);

* interactions of quarter dummies 1 to 3 with birth year dummies 1 to 9 *;
* eqdum collects the interaction names, each followed by an equal sign, for the joint test in the first stage *;
local eqdum "";
forvalues qtr = 1/3 {;
  forvalues yr = 1/9 {;
    local inter "bdum_q`qtr'y`yr'";
    gen `inter' = bqdum`qtr'*bydum`yr';
    local eqdum "`eqdum' `inter' =";
  };
};

* age squared *;
gen age2 = age*age;

** education as a continuous variable **;
* educ code 0 and any code not listed stay at 0, code 1 is 4 years, codes 2 to 11 are the code plus 6 *;
gen yrs_educ = 0;
replace yrs_educ = 4 if educ == 1;
forvalues code = 2/11 {;
  replace yrs_educ = `code' + 6 if educ == `code';
};

** other sample limits **;
* reg_keep is 0 only for negative wage income *;
* NOTE(review): reg_keep is not used by any regression in this file *;
gen reg_keep = !(incwage < 0);
* keep only positive weeks worked and positive wage and salary -- but i didnt get weeks worked *;

****** D. Table 1, first two rows ***************************************************;

* mean years of education by birth decade *;
sort b_decade;
by b_decade: summ yrs_educ;

* for each birth decade, regress education on quarter of birth dummies 1 to 3 *;
* the dependent variable is spelled out in full so the regressions do not rely on variable abbreviation *;
* the F statistic and p-value of the joint test that all three dummies are zero are saved with each stored result *;

* men born in the 1930s *;
regress yrs_educ bqdum1-bqdum3 if b_decade == 1930;
test bqdum1=bqdum2=bqdum3=0;
estadd scalar f_qs_eq = r(F);
estadd scalar p_qs_eq = r(p);
estimates store t1a;

* men born in the 1940s *;
regress yrs_educ bqdum1-bqdum3 if b_decade == 1940;
test bqdum1=bqdum2=bqdum3=0;
estadd scalar f_qs_eq = r(F);
estadd scalar p_qs_eq = r(p);
estimates store t1b;


****** E. first stage ***************************************************************;

* set covariates *;
* this local is also used by the second stage regressions *;
local covs "in_smsa married i.region age age2";

* every regression here uses the 1930s cohort with a non-missing log wage *;

* quarters only *;
* baseline without the instruments *;
xi: regress yrs_educ `covs' if b_decade == 1930 & ln_incwage != .;
estimates store fs1nbq;
* with quarter of birth dummies as instruments *;
xi: regress yrs_educ `covs' bqdum1-bqdum3 if b_decade == 1930 & ln_incwage != .;
* joint test that all three quarter dummies are zero, as in the table 1 regressions *;
* without the final =0 the test would only check that quarters 1 to 3 equal each other *;
test bqdum1=bqdum2=bqdum3=0;
estadd scalar f_qs_eq = r(F);
estadd scalar p_qs_eq = r(p);
* fitted education for the by-hand second stage *;
predict xhat1, xb;
estimates store fs1;

* quarters * year of birth *;
* same baseline, stored under a second name *;
xi: regress yrs_educ `covs' if b_decade == 1930 & ln_incwage != .;
estimates store fs2nbq;
* with the quarter by birth year interactions as instruments *;
xi: regress yrs_educ `covs' bdum_q*y* if b_decade == 1930 & ln_incwage != .;
predict xhat2, xb;
* eqdum holds the interaction names joined by equal signs, so this tests that all of them are zero *;
test `eqdum'=0;
estadd scalar f_qs_eq = r(F);
estadd scalar p_qs_eq = r(p);
estimates store fs2;


****** F. second stage **************************************************************;

* each specification is run two ways on the 1930s cohort with a non-missing log wage *;
*   a: by hand, with the fitted education from the first stage as a regressor *;
*      the standard errors of this version are not corrected for the generated regressor *;
*   b: with ivregress 2sls *;

* just quarters as instruments *;
regress ln_incwage `covs' xhat1 if b_decade == 1930 & ln_incwage != .;
estimates store ss1a;
ivregress 2sls ln_incwage `covs' (yrs_educ = bqdum1-bqdum3) if b_decade == 1930 & ln_incwage != .;
estimates store ss1b;

* quarters*years of birth *;
* uses xhat2 and the interaction dummies so that ss2a and ss2b match first stage fs2 *;
regress ln_incwage `covs' xhat2 if b_decade == 1930 & ln_incwage != .;
estimates store ss2a;
ivregress 2sls ln_incwage `covs' (yrs_educ = bdum_q*y*) if b_decade == 1930 & ln_incwage != .;
estimates store ss2b;

****** G. output results **************************************************************;

* write every stored estimate to one text table *;
* cells show coefficients with stars and standard errors in parentheses *;
* the footer shows r-squared, observations, and the saved instrument F test with its p-value *;
* NOTE(review): the date global in the file name is not set in this file -- presumably dateo sets it, confirm *;
estout *
  using "/home/lfbrooks/home/bleah/pppa6022/2014/stataout/problem_set_2/${date}_prob2_ivregs.txt",
  cells(b(star fmt(%12.3f)) se(par fmt(%12.3f)))
  stats(r2 N f_qs_eq p_qs_eq, fmt(%9.3f %9.0g %9.3f %9.3f) labels("R-squared" "Observations" "F test: instruments" "p-value of F test"))
  varwidth(12) varlabels(_cons Constant)
  replace;

* drop the stored estimates now that the table is written *;
estimates clear;