To evaluate genotype-based prediction approaches, I am using a range of phenotypes measured in the UK Biobank and TEDS samples.
Phenotype:
Below I provide a description of how each phenotype was derived and some descriptive statistics.
The depression phenotype was shared with me by Kylie Glanville.
Preparation of Depression phenotype
######
# Modify depression phenotype file so IDs match genotypic data
######
module add apps/R/3.6.0
R
source('/users/k1806347/brc_scratch/Software/MyGit/GenoPred/config_used/Phenotype_prep.config')
library(data.table)
pheno<-fread(paste0(UKBB_output, '/Phenotype/Depression/ever_depressed_pheno_final.txt'))
fam<-fread(paste0(UKBB_output,'/UKBB/Genotype/Harmonised/UKBB.w_hm3.QCd.AllSNP.chr22.fam')
names(fam)[1:2]<-c('FID','IID')
pheno_fam<-merge(pheno, fam, by=c('IID'))
pheno_new<-pheno_fam[,c('FID.y','IID','Depressed.Ever.Clean'), with=F]
names(pheno_new)[1]<-'FID'
fwrite(pheno_new, paste0(UKBB_output,'/Phenotype/Depression/ever_depressed_pheno_final.UpdateIDs.txt'), sep=' ', na='NA')
q()
n
The Intelligence phenotype is derived from the Fluid.intelligence.score.0.0 (f.20191.0.0) variable.
Preparation of Intelligence phenotype
#####
# Extract intelligence variable, update IDs and calculate descriptives
#####
module add apps/R/3.6.0
R
source('/users/k1806347/brc_scratch/Software/MyGit/GenoPred/config_used/Phenotype_prep.config')
# Read in the field finder to find he code for the phenotype of interest.
field_finder<-read.table(paste0(ukbb_pheno_file,'/ukb18177_glanville_field_finder.txt'), stringsAsFactors=F)
field_finder[grepl('Fluid.intelligence', field_finder$V3),]
# The Fluid intelligence variable has a code of f.20191.0.0 and is called Fluid.intelligence.score.0.0
# Identify which column in the phenotype file contains the relevant code
library(data.table)
pheno_header<-names(data.frame(fread(cmd=paste0('head -n 1 ',ukbb_pheno_file,'/ukb18177_glanville_phenotypes.txt'))))
rel_cols<-c(1,which(grepl('f.20191.0.0',pheno_header)))
# Read in the IID and fluid intelligence columns
pheno<-data.frame(fread(cmd=paste0('cut -f ',paste(rel_cols,collapse=','),' ',ukbb_pheno_file,'/ukb18177_glanville_phenotypes.txt')))
names(pheno)<-c('IID','Fluid.intelligence.score')
# Remove individuals with missing phenotype data
pheno<-pheno[complete.cases(pheno),]
# Calculate descriptive statistics for the phenotype
library(e1071)
descriptives<-data.frame(Phenotype='Fluid.intelligence.score',
Code='f.20127.0.0',
Mean=mean(pheno$Fluid.intelligence.score),
SD=sd(pheno$Fluid.intelligence.score),
Min=min(pheno$Fluid.intelligence.score),
Max=max(pheno$Fluid.intelligence.score),
Skewness=skewness(pheno$Fluid.intelligence.score),
N=dim(pheno)[1])
system(paste0('mkdir ',UKBB_output,'/Phenotype/Intelligence'))
write.table(descriptives,paste0(UKBB_output,'/Phenotype/Intelligence/Descriptives.txt'), row.names=F, quote=F)
# Plot the distribution
png(paste0(UKBB_output,'/Phenotype/Intelligence/Histogram.png'))
hist(pheno$Fluid.intelligence.score)
dev.off()
# Write out the phenotype in PLINK format.
pheno_plink<-data.frame(FID=pheno$IID,
IID=pheno$IID,
Fluid.intelligence.score=pheno$Fluid.intelligence.score)
fwrite(pheno_plink, paste0(UKBB_output,'/Phenotype/Intelligence/UKBB_Fluid.intelligence.score.pheno'), sep='\t')
q()
n
Descriptives of Intelligence phenotype
Phenotype | Code | Mean | SD | Min | Max | Skewness | N |
---|---|---|---|---|---|---|---|
Fluid.intelligence.score | f.20127.0.0 | 5.434 | 2.039 | 0 | 13 | 0.155 | 123656 |
The Height phenotype is derived from the Standing.height.0.0 (f.50.0.0) variable.
Preparation of Height phenotype
#####
# Extract Height variable, update IDs and calculate descriptives
#####
module add apps/R/3.6.0
R
source('/users/k1806347/brc_scratch/Software/MyGit/GenoPred/config_used/Phenotype_prep.config')
# Read in the field finder to find he code for the phenotype of interest.
field_finder<-read.table(paste0(ukbb_pheno_file,'/ukb18177_glanville_field_finder.txt'), stringsAsFactors=F)
field_finder[grepl('height', field_finder$V3),]
# The best height variable has a code of f.50.0.0 and is called Standing.height.0.0
# Identify which column in the phenotype file contains the relevant code
library(data.table)
pheno_header<-names(data.frame(fread(cmd=paste0('head -n 1 ',ukbb_pheno_file,'/ukb18177_glanville_phenotypes.txt'))))
rel_cols<-c(1,which(grepl('f.50.0.0',pheno_header)))
# Read in the IID and fluid intelligence columns
pheno<-data.frame(fread(cmd=paste0('cut -f ',paste(rel_cols,collapse=','),' ',ukbb_pheno_file,'/ukb18177_glanville_phenotypes.txt')))
names(pheno)<-c('IID','Height')
# Remove individuals with missing phenotype data
pheno<-pheno[complete.cases(pheno),]
# Calculate descriptive statistics for the phenotype
library(e1071)
descriptives<-data.frame(Phenotype='Standing.height.0.0',
Code='f.50.0.0',
Mean=mean(pheno$Height),
SD=sd(pheno$Height),
Min=min(pheno$Height),
Max=max(pheno$Height),
Skewness=skewness(pheno$Height),
N=dim(pheno)[1])
system(paste0('mkdir ',UKBB_output,'/Phenotype/Height'))
write.table(descriptives,paste0(UKBB_output,'/Phenotype/Height/Descriptives.txt'), row.names=F, quote=F)
# Plot the distribution
png(paste0(UKBB_output,'/Phenotype/Height/Histogram.png'))
hist(pheno$Height)
dev.off()
# Remove outliers
pheno_no_outlier<-pheno[(pheno$Height < mean(pheno$Height)+(3*sd(pheno$Height))) & (pheno$Height > mean(pheno$Height)-(3*sd(pheno$Height))),]
# Plot the distribution
png(paste0(UKBB_output,'/Phenotype/Height/Histogram_noOutlier.png'))
hist(pheno_no_outlier$Height)
dev.off()
# Write out the phenotype in PLINK format.
pheno_plink<-data.frame(FID=pheno_no_outlier$IID,
IID=pheno_no_outlier$IID,
Height=pheno_no_outlier$Height)
fwrite(pheno_plink, paste0(UKBB_output,'/Phenotype/Height/UKBB_Height.pheno'), sep='\t')
q()
n
Descriptives of Height phenotype
Phenotype | Code | Mean | SD | Min | Max | Skewness | N |
---|---|---|---|---|---|---|---|
Standing.height.0.0 | f.50.0.0 | 168.443 | 9.279 | 75 | 209 | 0.158 | 500004 |
The BMI phenotype is derived from the Body.mass.index.BMI.0.0 (f.21001.0.0) variable.
Preparation of BMI phenotype
#####
# Extract BMI variable, update IDs and calculate descriptives
#####
module add apps/R/3.6.0
R
source('/users/k1806347/brc_scratch/Software/MyGit/GenoPred/config_used/Phenotype_prep.config')
# Read in the field finder to find he code for the phenotype of interest.
field_finder<-read.table(paste0(ukbb_pheno_file,'/ukb18177_glanville_field_finder.txt'), stringsAsFactors=F)
field_finder[grepl('bmi', field_finder$V3),]
# The best height variable has a code of f.21001.0.0 and is called Body.mass.index.BMI.0.0
# Identify which column in the phenotype file contains the relevant code
library(data.table)
pheno_header<-names(data.frame(fread(cmd=paste0('head -n 1 ',ukbb_pheno_file,'/ukb18177_glanville_phenotypes.txt'))))
rel_cols<-c(1,which(grepl('f.21001.0.0',pheno_header)))
# Read in the IID and fluid intelligence columns
pheno<-data.frame(fread(cmd=paste0('cut -f ',paste(rel_cols,collapse=','),' ',ukbb_pheno_file,'/ukb18177_glanville_phenotypes.txt')))
names(pheno)<-c('IID','BMI')
# Remove individuals with missing phenotype data
pheno<-pheno[complete.cases(pheno),]
# Calculate descriptive statistics for the phenotype
library(e1071)
descriptives<-data.frame(Phenotype='Body.mass.index.BMI.0.0',
Code='f.50.0.0',
Mean=mean(pheno$BMI),
SD=sd(pheno$BMI),
Min=min(pheno$BMI),
Max=max(pheno$BMI),
Skewness=skewness(pheno$BMI),
N=dim(pheno)[1])
system(paste0('mkdir ',UKBB_output,'/Phenotype/BMI'))
write.table(descriptives,paste0(UKBB_output,'/Phenotype/BMI/Descriptives.txt'), row.names=F, quote=F)
# Plot the distribution
png(paste0(UKBB_output,'/Phenotype/BMI/Histogram.png'))
hist(pheno$BMI)
dev.off()
# Remove outliers
pheno_no_outlier<-pheno[(pheno$BMI < mean(pheno$BMI)+(3*sd(pheno$BMI))) & (pheno$BMI > mean(pheno$BMI)-(3*sd(pheno$BMI))),]
# Plot the distribution
png(paste0(UKBB_output,'/Phenotype/BMI/Histogram_noOutlier.png'))
hist(pheno_no_outlier$BMI)
dev.off()
# Write out the phenotype in PLINK format.
pheno_plink<-data.frame(FID=pheno_no_outlier$IID,
IID=pheno_no_outlier$IID,
BMI=pheno_no_outlier$BMI)
fwrite(pheno_plink, paste0(UKBB_output,'/Phenotype/BMI/UKBB_BMI.pheno'), sep='\t')
q()
n
Descriptives of BMI phenotype
Phenotype | Code | Mean | SD | Min | Max | Skewness | N |
---|---|---|---|---|---|---|---|
Body.mass.index.BMI.0.0 | f.50.0.0 | 27.433 | 4.803 | 12.121 | 74.684 | 1.096 | 499438 |
T2D phenotypes were shared by Saskia Hagenaars and Anna Fürtjes.
Preparation of CAD phenotype
#####
# Modify CAD phenotype file so IDs match genotypic data
#####
module add apps/R/3.6.0
R
source('/users/k1806347/brc_scratch/Software/MyGit/GenoPred/config_used/Phenotype_prep.config')
library(data.table)
pheno<-fread(paste0(UKBB_output, '/Phenotype/CAD/cad_only_111119.txt'))
names(pheno)[1]<-'IID'
fam<-fread(paste0(UKBB_output,'/Genotype/Harmonised/UKBB.w_hm3.QCd.AllSNP.chr22.fam'))
names(fam)[1:2]<-c('FID','IID')
pheno_fam<-merge(pheno, fam, by=c('IID'))
pheno_new<-pheno_fam[,c('FID','IID','CAD_all'), with=F]
fwrite(pheno_new, paste0(UKBB_output,'/Phenotype/CAD/cad_only_111119.UpdateIDs.txt'), sep=' ', na='NA')
q()
n
T2D phenotypes were shared by Saskia Hagenaars and Anna Fürtjes.
Preparation of T2D phenotype
#####
# Modify T2D phenotype file so IDs match genotypic data
#####
module add apps/R/3.6.0
R
source('/users/k1806347/brc_scratch/Software/MyGit/GenoPred/config_used/Phenotype_prep.config')
library(data.table)
pheno<-fread(paste0(UKBB_output, '/Phenotype/T2D/t2d_only_111119.txt'))
names(pheno)[1]<-'IID'
fam<-fread(paste0(UKBB_output,'/Genotype/Harmonised/UKBB.w_hm3.QCd.AllSNP.chr22.fam'))
names(fam)[1:2]<-c('FID','IID')
pheno_fam<-merge(pheno, fam, by=c('IID'))
pheno_new<-pheno_fam[,c('FID','IID','t2d_all'), with=F]
fwrite(pheno_new, paste0(UKBB_output,'/Phenotype/T2D/t2d_only_111119.UpdateIDs.txt'), sep=' ', na='NA')
q()
n
Phenotypes were shared by Kylie Glanville.
Preparation of Autoimmune disorder phenotype
#####
# Modify Autoimmune Disorder phenotype file so IDs match genotypic data
#####
module add apps/R/3.6.0
R
source('/users/k1806347/brc_scratch/Software/MyGit/GenoPred/config_used/Phenotype_prep.config')
library(data.table)
pheno<-fread(paste0(UKBB_output, '/Phenotype/AutoImmune/new_autoimmune_all'))
pheno$FID<-NULL
fam<-fread(paste0(UKBB_output,'/Genotype/Harmonised/UKBB.w_hm3.QCd.AllSNP.chr22.fam'))
names(fam)[1:2]<-c('FID','IID')
pheno_fam<-merge(pheno, fam, by=c('IID'))
# Extract probable phnotypes only
pheno_new<-pheno_fam[,c('FID','IID',names(pheno_fam)[grepl('_prob', names(pheno_fam))]), with=F]
# Identify individuals that don't have any autoimmune disorders
pheno_new_control<-pheno_new[which(rowSums(pheno_new[,-1:-2]) == 0),]
# Create a pheno file for Crohns MultiScler RheuArth
pheno_name<-c('IBD','MultiScler','RheuArth')
pheno_col<-c('ibd_prob',"ms_prob","arthritis_prob")
for(i in 1:3){
pheno_new_i_case<-pheno_new[pheno_new[[pheno_col[i]]] == 1,]
pheno_clean_i<-rbind(pheno_new_i_case, pheno_new_control)
pheno_clean_i<-pheno_clean_i[,c('FID','IID',pheno_col[i]), with=F]
system(paste0('mkdir ',UKBB_output,'/Phenotype/',pheno_name[i]))
fwrite(pheno_clean_i, paste0(UKBB_output,'/Phenotype/',pheno_name[i],'/UKBB.',pheno_name[i],'.txt'), sep=' ', na='NA', quote=F)
}
q()
n
Preparation of cancer phenotype
#####
# Extract Height variable, update IDs and calculate descriptives
#####
module add apps/R/3.6.0
R
source('/users/k1806347/brc_scratch/Software/MyGit/GenoPred/config_used/Phenotype_prep.config')
# Read in the field finder to find the code for the phenotype of interest.
field_finder<-read.table(paste0(ukbb_pheno_file,'/ukb18177_glanville_field_finder.txt'), stringsAsFactors=F)
field_finder[grepl('Cancer|cancer', field_finder$V3),]
# The self reported cancer variable code is f.20001
# The sex variable code is f.31.0.0
# Identify which column in the phenotype file contains the relevant code
library(data.table)
pheno_header<-names(data.frame(fread(cmd=paste0('head -n 1 ',ukbb_pheno_file,'/ukb18177_glanville_phenotypes.txt'))))
rel_cols<-c(1,which(grepl('f.31.0.0|f.20001',pheno_header)))
# Read in the IID, sex, and cancer columns
pheno<-data.frame(fread(cmd=paste0('cut -f ',paste(rel_cols,collapse=','),' ',ukbb_pheno_file,'/ukb18177_glanville_phenotypes.txt')))
names(pheno)[1:2]<-c('IID','sex')
names(pheno)<-gsub('X..','',names(pheno))
names(pheno)<-gsub('\\.\\.','',names(pheno))
# Update IDs to match genetic data
fam<-fread(paste0(UKBB_output,'/Genotype/Harmonised/UKBB.w_hm3.QCd.AllSNP.chr22.fam'))
names(fam)[1:2]<-c('FID','IID')
pheno<-merge(fam[,c('FID','IID')],pheno, by=c('IID'))
# Identify individuals with prostate cancer ID '1044' in any column
prostate<-NULL
for(i in names(pheno)[-1:-3]){
prostate<-c(prostate, which(pheno[[i]] == '1044'))
prostate<-unique(prostate)
}
pheno$prostate_cancer<-NA
pheno$prostate_cancer[pheno$sex == 1]<-0
pheno$prostate_cancer[prostate]<-1
# Identify individuals with breast cancer ID '1002' in any column
breast<-NULL
for(i in names(pheno)[-1:-3]){
breast<-c(breast, which(pheno[[i]] == '1002'))
breast<-unique(breast)
}
pheno$breast_cancer<-NA
pheno$breast_cancer[pheno$sex == 0]<-0
pheno$breast_cancer[breast]<-1
# Write out the phenotype in PLINK format.
pheno_plink_prostate<-data.frame(FID=pheno$FID,
IID=pheno$IID,
prostate_cancer=pheno$prostate_cancer)
pheno_plink_prostate<-pheno_plink_prostate[complete.cases(pheno_plink_prostate),]
pheno_plink_breast<-data.frame(FID=pheno$FID,
IID=pheno$IID,
breast_cancer=pheno$breast_cancer)
pheno_plink_breast<-pheno_plink_breast[complete.cases(pheno_plink_breast),]
dir.create(paste0(UKBB_output,'/Phenotype/Prostate_Cancer/'))
dir.create(paste0(UKBB_output,'/Phenotype/Breast_Cancer/'))
fwrite(pheno_plink_prostate, paste0(UKBB_output,'/Phenotype/Prostate_Cancer/UKBB_Prostate_Cancer.pheno'), sep='\t')
fwrite(pheno_plink_breast, paste0(UKBB_output,'/Phenotype/Breast_Cancer/UKBB_Breast_Cancer.pheno'), sep='\t')
q()
n
The UK Biobank sample is very large making the computation of the various scores and derivation of the models computationally intensive. We do not need such large samples to compare PRS methods, and it would also be useful to have a non-overlapping subset of UKBiobank which can be used as a reference sample.
Subsetting phenotypic data
#####
# Idenitfy subset of UK biobank with phenotypic data and European ancestry
#####
module add apps/R/3.6.0
R
source('/users/k1806347/brc_scratch/Software/MyGit/GenoPred/config_used/Phenotype_prep.config')
library(data.table)
pheno<-c('Depression','Intelligence','BMI','Height','T2D','CAD','IBD','MultiScler','RheuArth','Breast_Cancer','Prostate_Cancer')
pheno_file<-c('ever_depressed_pheno_final.UpdateIDs.txt','UKBB_Fluid.intelligence.score.UpdateIDs.pheno','UKBB_BMI.score.UpdateIDs.pheno','UKBB_Height.score.UpdateIDs.pheno','t2d_only_111119.UpdateIDs.txt','cad_only_111119.UpdateIDs.txt','UKBB.IBD.txt','UKBB.MultiScler.txt','UKBB.RheuArth.txt','UKBB_Breast_Cancer.pheno','UKBB_Prostate_Cancer.pheno')
pheno_list<-list()
for(i in 1:length(pheno)){
pheno_list[[pheno[i]]]<-fread(paste0(UKBB_output,'/Phenotype/',pheno[i],'/',pheno_file[i]))
}
pheno_all<-Reduce(function(...) merge(..., all=T, by=c('FID','IID')), pheno_list)
# We want to retain all individuals that are cases, and those that have complete data.
pheno_all_2<-pheno_all[complete.cases(pheno_all[,!grepl('cancer',names(pheno_all)),with=F]) | pheno_all$Depressed.Ever.Clean == 1 | pheno_all$t2d_all == 1 | pheno_all$CAD_all == 1 | pheno_all$ibd_prob == 1 | pheno_all$ms_prob == 1 | pheno_all$arthritis_prob == 1 | pheno_all$breast_cancer == 1 | pheno_all$prostate_cancer == 1,]
# Extract individuals with EUR ancestry
EUR<-fread(paste0(UKBB_output,'/Projected_PCs/AllAncestry/UKBB.w_hm3.AllAncestry.EUR.keep'))
pheno_all_2_EUR<-pheno_all_2[(pheno_all_2$IID %in% EUR$V2),]
# Extract individuals that survive QC
keep<-fread('/users/k1806347/brc_scratch/Analyses/PRS_comparison/UKBB_outcomes_for_prediction/ukb18177_glanville_post_qc_id_list.UpdateIDs.fam')
pheno_all_2_EUR<-pheno_all_2_EUR[(pheno_all_2_EUR$IID %in% keep$V2),]
# Calculate the number of individuals surviving genotype QC and ancestry check
sum(EUR$V2 %in% keep$V2) # 379437
# Remove individuals in the UKB reference
UKB_ref<-fread('/users/k1806347/brc_scratch/Data/UKBB/UKBB_ref/genotype/UKBB.noPheno.EUR.10K.chr22.fam')
pheno_all_2_EUR<-pheno_all_2_EUR[!(pheno_all_2_EUR$IID %in% UKB_ref$V2),]
# This seems circular as the phenotype file was used to select UKB individuals in the reference. This is due to the addition of breast cancer and prostate cancer phenotype. Using the original set of reference individuals losses <150 cases for each phenotype and saves recreating UKB reference files.
# There are still >100k individuals. Write out files for each phenotype, retaining individuals with complete data, with a maximum of 50000 individuals for quantitative outcomes.
system(paste0('mkdir ',UKBB_output,'/Phenotype/PRS_comp_subset'))
pheno_N<-NULL
pheno_any<-NULL
for(i in 1:length(pheno)){
pheno_all_2_EUR_i<-pheno_all_2_EUR[,c('FID','IID',names(pheno_all_2_EUR)[2+i]),with=F]
pheno_all_2_EUR_i<-pheno_all_2_EUR_i[complete.cases(pheno_all_2_EUR_i),]
if(sum(pheno_all_2_EUR_i[[3]] == 0 | pheno_all_2_EUR_i[[3]] == 1) == length(pheno_all_2_EUR_i[[3]])){
pheno_all_2_EUR_i_case<-pheno_all_2_EUR_i[pheno_all_2_EUR_i[[3]] == 1,]
pheno_all_2_EUR_i_case<-pheno_all_2_EUR_i_case[1:25000,]
pheno_all_2_EUR_i_case<-pheno_all_2_EUR_i_case[complete.cases(pheno_all_2_EUR_i_case),]
pheno_all_2_EUR_i_con<-pheno_all_2_EUR_i[pheno_all_2_EUR_i[[3]] == 0,]
pheno_all_2_EUR_i_con<-pheno_all_2_EUR_i_con[1:(50000 - dim(pheno_all_2_EUR_i_case)[1]),]
pheno_all_2_EUR_i<-rbind(pheno_all_2_EUR_i_case, pheno_all_2_EUR_i_con)
pheno_N<-rbind(pheno_N,data.frame(Pheno=pheno[i],
N=length(pheno_all_2_EUR_i[[3]]),
N_case=sum(pheno_all_2_EUR_i[[3]] == 1,na.rm=T),
N_con=sum(pheno_all_2_EUR_i[[3]] == 0,na.rm=T)))
} else {
pheno_all_2_EUR_i<-pheno_all_2_EUR_i[1:50000,]
pheno_N<-rbind(pheno_N,data.frame(Pheno=pheno[i],
N=length(pheno_all_2_EUR_i[[3]]),
N_case=NA,
N_con=NA))
}
pheno_any<-rbind(pheno_any,pheno_all_2_EUR_i[,c('FID','IID'),with=F])
pheno_any<-pheno_any[!duplicated(pheno_any),]
write.table(pheno_all_2_EUR_i, paste0(UKBB_output,'/Phenotype/PRS_comp_subset/UKBB.',pheno[i],'.txt'), col.names=T, row.names=F, quote=F)
}
write.csv(pheno_N, paste0(UKBB_output,'/Phenotype/PRS_comp_subset/Pheno_N.csv'), row.names=F, quote=F)
# Calculate the number of individuals across all samples
dim(pheno_any) # 104278