--- title: "Career Outcomes of PhD Astronomers" output: html_document --- ```{r} # Define the confidence levels we will use in this work confidence = 0.95 plotconfidence = 0.67 # Set plotting defaults xlab = 'Years after PhD' mar=c(5.1,4.1,1.1,2.1) # Load the custom analysis/plotting functions setwd("~/AstroPhDs") source('astrophds_func.r') ``` #### Load data; create data frames for different subsets ```{r} setwd("~/AstroPhDs") allphd = read.csv('phdoutcomes.csv') ntotal = nrow(allphd) phd = allphd[allphd$Omit==FALSE,] # remove the omits omitted = allphd[allphd$Omit==TRUE,] # hold onto the omits here phd$Gender = droplevels(phd$Gender) # remove unknown-gender factor level since unknown-gender rows were omitted nunkyear = sum(is.na(phd$JobYear)) n = nrow(phd) # number excluding omits nomit = nrow(omitted) print(paste('Loaded',n,'PhDs (omitted',nomit,')')) TimeToJob = phd$JobYear-phd$PhDYear TimeToJob[TimeToJob < 0] = 0 phd = cbind(phd,TimeToJob) phdfac = phd[phd$Left == 'hired',] # permanent/long-term astro job phdleft = phd[phd$Left == 'left',] # left field phdtemp = phd[phd$Left == 'postdoc' | phd$Left == 'adjunct',] # postdoc/temp phdnotdoc = phd[phd$Left != 'postdoc' & phd$Left != 'adjunct',] # all jobs (astro+nonastro) phdnotdoc$Left = droplevels(phdnotdoc$Left) phdy = phd[is.na(phd$JobYear)==FALSE,] # same as above four, but omitting unknown-year individuals phdfacy = phdfac[is.na(phdfac$JobYear)==FALSE,] phdlefty = phdleft[is.na(phdleft$JobYear)==FALSE,] phdnotdocy = phdnotdoc[is.na(phdnotdoc$JobYear)==FALSE,] # safety step to make sure we don't accidentally use missing-year data for timing analysis phdfac = subset(phdfac, select = -c(TimeToJob)) phdleft = subset(phdleft, select = -c(TimeToJob)) phdnotdoc = subset(phdnotdoc, select = -c(TimeToJob)) ny = nrow(phdy) ``` #### Outcome differences by gender ```{r} otable1 = table(phdnotdoc$Gender, phdnotdoc$JobType) ch1 = chisq.test(otable1) print('') z = signif(ch1$stdres[1,],3) otable1t = cbind(t(otable1),z) # negative z is fewer women than 'expected'; positive is more women otable1t = rbind(otable1t, TEMP=c(table(phdtemp$Gender),NA)) otable1t = rbind(otable1t, OMITTED=c(table(omitted$Gender)[2:3],NA)) pctf = round(100*otable1t[,1]/(otable1t[,1]+otable1t[,2])) pctfunc = round(50*(binomialci(otable1t[,1],otable1t[,1]+otable1t[,2])$upper- binomialci(otable1t[,1],otable1t[,1]+otable1t[,2])$lower)) otable1t = cbind(otable1t,pctf,pctfunc) otable1t[c(3,2,4,1,5,6),] paste(table(omitted$Gender)[1],'with unknown gender') sum(otable1t[1:5,1:2]) # check should = n sum(otable1t[6,1:2])+sum(table(omitted$Gender)[1]) # check should = nomit ``` ```{r} otable2 = table(phdnotdoc$Gender, phdnotdoc$InstClass) otable2 = otable2[,2:ncol(otable2)] # omit blank NA column (a little bit dangerous) ch2 = chisq.test(otable2) print('') z = signif(ch2$stdres[1,],3) otable2t = cbind(t(otable2),z) otable2t = otable2t[c(11,12,10,9,1,6,8,7, 5,2,3,4,13),] #otable2t allastro = colSums(otable2t[1:8,1:2]) allnonastro= colSums(otable2t[9:13,1:2]) otable2t = rbind(otable2t[1:8,],ASTRO=c(allastro,NA),otable2t[9:13,],NONASTRO=c(allnonastro,NA)) pct2f = round(100*otable2t[,1]/(otable2t[,1]+otable2t[,2])) pct2func = round(50*(binomialci(otable2t[,1],otable2t[,1]+otable2t[,2])$upper-binomialci(otable2t[,1],otable2t[,1]+otable2t[,2])$lower)) otable2t = cbind(otable2t,pct2f,pct2func) otable2t ``` #### Chi^2 for career outcome differences ```{r} ch1 # are basic outcomes different? ch2 # are detailed outcomes different? ``` #### Basic statistics on time to permanent job ```{r} ns = tapply(phdfacy$TimeToJob, phdfacy$Gender, length) means = tapply(phdfacy$TimeToJob, phdfacy$Gender, mean) sds = tapply(phdfacy$TimeToJob, phdfacy$Gender, sd) ses = sds/sqrt(ns) medians = tapply(phdfacy$TimeToJob, phdfacy$Gender, median) m = data.frame(N=ns,mean=means,SE=ses,median=medians) # create table of N, mean, median, SE m gap = means[2]-means[1] # simple-statistics M-F hiring-time gap measurement gap = unname(gap) gapunc = sqrt(ses[2]^2+ses[1]^2)*1.96 # assuming confidence is 0.95 gapunc = unname(gapunc) ``` #### Time Differences by Gender and JobType ```{r} meantimes1 = tapply(phdnotdocy$TimeToJob, phdnotdocy$InstClass, mean) meantimes = tapply(phdnotdocy$TimeToJob, list(phdnotdocy$Gender,phdnotdocy$InstClass), mean) setimes = tapply(phdnotdocy$TimeToJob, list(phdnotdocy$Gender,phdnotdocy$InstClass), sd)/sqrt(tapply(phdnotdocy$TimeToJob, list(phdnotdocy$Gender,phdnotdocy$InstClass), length)) gaptimes = meantimes[1,]-meantimes[2,] gapses = sqrt(setimes[1,]^2+setimes[2,]^2) # not a proper t-test, poor for small samples. dttable = t(meantimes) dttable = cbind(dttable, gap=gaptimes) dttable = cbind(dttable, z=gaptimes/gapses) dttable = dttable[c(12,13,11,10,9,8,2,7,3,4,5,6),] dttable ``` #### Histograms ```{r} par(mar=mar) jhist.hire = jointhist.mf(phdy, condition=(phdy$Left=='hired')) # joint-normalize hiring/leaving histograms jhist.left = jointhist.mf(phdy, condition=(phdy$Left=='left')) jhist.limit =jointhist.mf(phdy, condition=(phdy$Left!='hired' & phdy$Left!='left')) jhist.r1 =jointhist.mf(phdy, condition=(phdy$Left=='hired' & (phdy$JobType=='prof-R1' | phdy$JobType=='prof'))) jointhist.mf.plot(jhist.hire, xlab=xlab, ylab='Fraction starting job') jointhist.mf.plot(jhist.left, xlab=xlab, ylab='Fraction leaving field') ``` #### Some statistical tests comparing the postdoc-time distributions ```{r} h = phdnotdocy # Student t-test ttest = t.test(h$TimeToJob[h$Gender=='M'],h$TimeToJob[h$Gender=='F'], conf.level=confidence) ttest # Student t-test with null hypothesis of Flaherty 1.1 year gap measurement ttest.vsF18 = t.test(h$TimeToJob[h$Gender=='M']-1.1,h$TimeToJob[h$Gender=='F'], conf.level=confidence) ttest.vsF18 # Wilcoxon test wtest = wilcox.test(h$TimeToJob[h$Gender=='M'],h$TimeToJob[h$Gender=='F']) wtest # K-S test kstest = ks.test(h$TimeToJob[h$Gender=='M'],h$TimeToJob[h$Gender=='F']) kstest # Anderson-Darling test library(kSamples) adtest = ad.test(h$TimeToJob[h$Gender=='M'],h$TimeToJob[h$Gender=='F']) adtest ``` #### Statistics on leaving the field ```{r} # Basic 2x2 table leavetable = table(phdnotdoc$Gender,phdnotdoc$Left=='left') leavetable # Fisher test (are M/F different in leave/stay fractions?) fisher = fisher.test(leavetable) mbinom = binom.test(leavetable['M',2], apply(leavetable,1,sum)[2], conf.level=confidence) fbinom = binom.test(leavetable['F',2], apply(leavetable,1,sum)[1], conf.level=confidence) fisher ``` #### Survival and hazard analysis for astronomy hiring ```{r} par(mar=mar) library(survival) S = Surv(phdy$TimeToJob, phdy$Left=='hired') survkm = survfit(S~phdy$Gender, conf.int=plotconfidence) coxfit = coxph(S~phdy$Gender) sumcoxfit = summary(coxfit) coxhire = cox.hazard.mf(coxfit, phdy$Gender) hazhire = km.hazard.mf(survkm, conf.level=plotconfidence) sumcoxfit km.plot(survkm, xlab=xlab, ylab='K-M survivorship for astrophysics hiring', xlim=c(0,11)) hazard.mf.plot(coxhire, hazhire, xlab=xlab, ylab='Probability of being hired (if postdoc)') SR1 = Surv(phdy$TimeToJob, phdy$Left=='hired' & (phdy$JobType=='prof-R1' | phdy$JobType=='prof')) coxfitR1 = coxph(SR1~phdy$Gender) coxR1 = cox.hazard.mf(coxfitR1, phdy$Gender) ``` #### Survival and hazard analysis for leaving astronomy ```{r} par(mar=mar) SI = Surv(phdy$TimeToJob, phdy$Left=='left') survkmI = survfit(SI~phdy$Gender, conf.int=plotconfidence) coxfitI = coxph(SI~phdy$Gender) sumcoxfitI = summary(coxfitI) coxleave = cox.hazard.mf(coxfitI, phdy$Gender) hazleave = km.hazard.mf(survkmI, conf.level=plotconfidence) sumcoxfitI km.plot(survkmI, xlab=xlab, ylab='K-M survivorship for leaving astrophysics', xlim=c(0,11)) hazard.mf.plot(coxleave, hazleave, xlab=xlab, ylab='Probability of leaving field (if postdoc)', ylim=c(0,0.25)) ``` #### Cumulative distributions ```{r} par(mar=mar) nmy = sum(phdy$Gender=='M') nfy = sum(phdy$Gender=='F') mfaccum = cumsum(jhist.hire$nmi) ffaccum = cumsum(jhist.hire$nfi) mleavecum = cumsum(jhist.left$nmi) fleavecum = cumsum(jhist.left$nfi) mr1cum = cumsum(jhist.r1$nmi) fr1cum = cumsum(jhist.r1$nfi) names(mfaccum) = jhist.hire$time names(ffaccum) = jhist.hire$time names(mleavecum) = jhist.left$time names(fleavecum) = jhist.left$time names(mr1cum) = jhist.r1$time names(fr1cum) = jhist.r1$time Y = length(mfaccum) mlimitcum = 0 flimitcum = 0 hp = coxhire$difhaz1 lp = coxleave$difhaz1 r1p = coxR1$difhaz1 for (i in 1:(Y-1)) { # assume that current postdocs will be hired/leave and correct appropriately #keep a running tally of the number of "limits" (still postdocs with post-PhD age<=current year) #and fractionally redistribute them to jobs using the hazard function mfaccum[(i+1):Y] = mfaccum[(i+1):Y] + hp[i] * mlimitcum ffaccum[(i+1):Y] = ffaccum[(i+1):Y] + hp[i] * flimitcum mleavecum[(i+1):Y] = mleavecum[(i+1):Y] + lp[i] * mlimitcum fleavecum[(i+1):Y] = fleavecum[(i+1):Y] + lp[i] * flimitcum mr1cum[(i+1):Y] = mr1cum[(i+1):Y] + r1p[i] * mlimitcum fr1cum[(i+1):Y] = fr1cum[(i+1):Y] + r1p[i] * flimitcum mlimitcum = mlimitcum*(1-hp[i]-lp[i]) + jhist.limit$nmi[i] flimitcum = flimitcum*(1-hp[i]-lp[i]) + jhist.limit$nfi[i] } time = jhist.hire$time plot(c(-1),c(-1), xlim=c(-0.02,11.9),ylim=c(-0.002,1.002), xlab=xlab, ylab='Cumulative fraction', yaxs='i',xaxs='i') frachired = (mfaccum+ffaccum)/(nfy+nmy) # includes completeness correction frachiredm = mfaccum/nmy frachiredf = ffaccum/nfy fracleft = (mleavecum+fleavecum)/(nfy+nmy) # includes completeness correction fracleftm = mleavecum/nmy fracleftf = fleavecum/nfy fracr1 = (mr1cum+fr1cum)/(nfy+nmy) # does not yet include completeness correction fracr1m = mr1cum/nmy fracr1f = fr1cum/nfy lines(-1:max(time), c(0,frachiredm),typ='s') lines(-1:max(time), c(1,1-fracleftm),typ='s') lines(-1:max(time), c(0,frachiredf),typ='s', col='red') lines(-1:max(time), c(1,1-fracleftf),typ='s', col='red') text(2,0.5,'postdoc/adjunct') text(7.5,0.9,'left astrophysics') text(9,0.25+0.03,'hired into long-term job') #text(10,0.07,'(professor)', cex=0.85) text(12.15, frachired['12'],signif(frachired['12'],2), cex=0.66, xpd=NA, col='darkgray') text(12.15,1-fracleft['12'], signif(1-fracleft['12'],2), cex=0.66, xpd=NA, col='darkgray') #text(12.15, fracr1['12'],signif(fracr1['12'],2), cex=0.66, xpd=NA, col='darkgray') #lines(-1:max(time), c(0,fracr1m),typ='s',lty=3) # R1 hires only #lines(-1:max(time), c(0,fracr1f),typ='s',lty=3,col='red') ``` #### Summarize statistics of interest for paper ```{r} gtab = table(phd$Gender) gtaby = table(phdy$Gender) out = c('') #out = c(out,paste(' # PhD programs: ', length(unique(phd$PhDInstitute)))) out = c(out,paste(' # total: ', ntotal)) out = c(out,paste(' # omitted: ', nomit)) out = c(out,paste('Total sample size:', n, '( M =',gtab[2], 'F =',gtab[1],')')) out = c(out,paste(' ', round(100*gtab[2]/n,1), '% ', round(100*gtab[1]/n,1), '% ', sep='')) outcome = table(phd$Left) nleftlater = table names(outcome)[names(outcome)=='hired'] = 'astro' out = c(out,paste('Career outcomes: ', outcome['postdoc']+outcome['adjunct'], 'still postdocs/adjuncts')) out = c(out,paste(' ', outcome['astro'], 'found long-term astronomy employment')) #, nlater, 'left after long-term job)')) out = c(out,paste(' ', outcome['left'], 'left astronomy')) pmleave = leavetable['M',2] / apply(leavetable,1,sum)[2] pmleaveci = mbinom$conf.int[1:2] pfleave = leavetable['F',2] / apply(leavetable,1,sum)[1] pfleaveci = fbinom$conf.int[1:2] out = c(out,paste('# of M that leave:', leavetable['M',2], '/', apply(leavetable,1,sum)[2])) out = c(out,paste('# of F that leave:', leavetable['F',2], '/', apply(leavetable,1,sum)[1])) out = c(out,paste('% of M that leave:', round(100*pmleave), '+',round(100*(pmleaveci[2]-pmleave)),'-', round(100*(pmleave - pmleaveci[1])))) out = c(out,paste('% of F that leave:', round(100*pfleave), '+',round(100*(pfleaveci[2]-pfleave)),'-', round(100*(pfleave - pfleaveci[1])))) out = c(out,paste('% of M that stay: ', 100-round(100*pmleave), '-',round(100*(pmleaveci[2]-pmleave)),'+', round(100*(pmleave - pmleaveci[1])))) out = c(out,paste('% of F that stay: ', 100-round(100*pfleave), '-',round(100*(pfleaveci[2]-pfleave)),'+', round(100*(pfleave - pfleaveci[1])))) out = c(out,paste('Fisher significance of M/F leave rates different: p =', signif(fisher$p.value,2))) out = c(out,paste(' Significance of different outcome employer classes:',signif(ch2$p.value,3))) out = c(out,paste(' Significance of different outcome profession classes:',signif(ch1$p.value,3))) out = c(out,'') out = c(out,paste(' # with no year: ', nunkyear)) out = c(out,paste('Sample size w/year:', ny, '( M =',gtaby[2], 'F =',gtaby[1],')')) out = c(out,paste(' ', round(100*gtaby[2]/ny,1), '% ', round(100*gtaby[1]/ny,1),'%', sep='')) out = c(out,paste('Female mean time: ', round(means[1],2), '+/-', round(ses[1],2))) out = c(out,paste(' Male mean time: ', round(means[2],2), '+/-', round(ses[2],2))) out = c(out,paste(' Gap: ', round(gap,2), '+/-', round(gapunc,2))) out = c(out,paste('% hired within 4 years:',signif(100*frachired['4'],3),'%')) out = c(out,paste(' (M:',signif(100*frachiredm['4'],3),'% F:',signif(100*frachiredf['4'],3),'% )')) out = c(out,paste('% hired within 12 years:',signif(100*frachired['12'],3),'%')) out = c(out,paste(' (M:',signif(100*frachiredm['12'],3),'% F:',signif(100*frachiredf['12'],3),'% )')) out = c(out,paste('% left within 12 years:',signif(100*fracleft['12'],3),'%')) out = c(out,paste(' (M:',signif(100*fracleftm['12'],3),'% F:',signif(100*fracleftf['12'],3),'% )')) diff = means[2]-means[1] differr = (ttest$conf.int[2]-ttest$conf.int[1])/2 differr2 = sqrt(ses[1]^2+ses[2]^2) out = c(out,paste(' Difference: ', round(diff,2), '+/-', round(differr,2), ' (p =',signif(ttest$p.value,2),')')) # paste(' Difference: ', round(diff,2), '+/-', round(differr2,2)) out = c(out, 'Significance of M-F postdoc time distribution difference:') out = c(out,paste(' Student t-test: p =', signif(ttest$p.value,2), ' (F18 ruled out at p =',signif(ttest.vsF18$p.value,2),')')) out = c(out,paste(' Wilcoxon test: p =', signif(wtest$p.value,2))) out = c(out,paste(' K-S test: p =', signif(kstest$p.value,2))) out = c(out,paste(' A-D test: p =', signif(adtest$ad[1,3],2))) out = c(out,'') out = c(out,paste('Survival analysis for astronomy hiring:')) out = c(out,paste('Cox H(F/M) = ', round(coxhire$b,2), ' + ', round(coxhire$berr[1],2), ' - ', round(-coxhire$berr[2],2))) out = c(out,paste('Cox significance of H(F/M) != 1: p =', signif(sumcoxfit$coefficients[5],2))) out = c(out,'') out = c(out,'Survival analysis for leaving astronomy:') out = c(out,paste('Cox L(F/M) = ', round(coxleave$b,2), ' + ', round(coxleave$berr[1],2), ' - ', round(-coxleave$berr[2],2))) out = c(out,paste('Cox significance of L(F/M) != 1: p =', signif(sumcoxfitI$coefficients[5],2))) writeLines(out) ```