Preprint

This document provides code and key figures and tables for the following preprint:

Citation:
Pain, O. (2025). Leveraging Global Genetics Resources to Enhance Polygenic Prediction Across Ancestrally Diverse Populations. medRxiv. https://doi.org/10.1101/2025.03.27.25324773


Document overview:


Data Preparation


UKB

This section describes the preparation of the UKB data for this study. We first separate UKB participants into ancestral groups (AFR, EAS, and EUR), then prepare phenotype data for traits that overlap with the BBJ and UGR samples, and split the EUR UKB participants into training and testing subsets. GWAS are then performed in the training subset, and PGS are evaluated in the testing subset.


Ancestry inference

We will perform ancestry inference using the GenoPred pipeline, preparing the required configuration files before running the pipeline.

Show code


Create symlinks

We will create symlinks to the imputed UKB genotype data, using the pgen format files restricted to MAF >= 1% and INFO >= 0.4 for computational efficiency. Because this genetic data is not application specific, it does not need to be reprocessed for each application. We therefore use row-number IDs in the .psam file, so that individuals can be connected to application-specific data downstream (see the short sketch after the code below).

mkdir -p /users/k1806347/oliverpainfel/Data/ukb/ukb_symlinks

# pgen and pvar files
for chr in $(seq 1 22);do
  for file in $(echo pgen pvar);do
    ln -s /datasets/ukbiobank/June2017/Imputed/ukb_imp_chr${chr}_v3_MAF1_INFO4.${file} /users/k1806347/oliverpainfel/Data/ukb/ukb_symlinks/ukb_imp_maf1_info4.chr${chr}.${file}
  done
done
# Make a .psam with row-number IDs
n <- 487409
psam <- data.frame(FID = 1:n,
                   IID = 1:n)
names(psam)[1] <- '#FID'
write.table(psam, '/users/k1806347/oliverpainfel/Data/ukb/ukb_symlinks/rownumber.psam', col.names = T, row.names = F, quote = F)
for chr in $(seq 1 22);do
  ln -s /users/k1806347/oliverpainfel/Data/ukb/ukb_symlinks/rownumber.psam /users/k1806347/oliverpainfel/Data/ukb/ukb_symlinks/ukb_imp_maf1_info4.chr${chr}.psam
done
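
For reference, the row-number IDs can be linked back to project-specific IDs using the project's own .psam file. Below is a minimal sketch, assuming the rows of the project .psam are in the same order as the symlinked genotype data (the same mapping is applied in the phenotype extraction code later in this document).

# Minimal sketch: map row-number IDs back to project-specific IDs (eids)
library(data.table)
psam <- fread('/scratch/prj/ukbiobank/recovered/ukb82087/imputed/ukb82087_imp_chr1_MAF1_INFO4_v1.psam')
psam$rn <- 1:nrow(psam)                          # row number = ID used in the symlinked .psam
id_map <- psam[, c('IID', 'rn'), with = FALSE]   # eid <-> row-number lookup table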

Create list of unrelated individuals

library(ukbkings)
library(data.table)

psam<-fread('/scratch/prj/ukbiobank/recovered/ukb82087/imputed/ukb82087_imp_chr1_MAF1_INFO4_v1.psam')
psam$rn<-1:nrow(psam)

project_dir <- "/datasets/ukbiobank/ukb82087"
greedy_related <- "/scratch/prj/ukbiobank/recovered/KCL_Data/Software/tools/GreedyRelated-master-v1.2.1/GreedyRelated"

# Create a list of unrelated individuals irrespective of a phenotype
psam_unrel_all <- psam[!(
  psam$IID %in% bio_gen_related_remove(
    project_dir = project_dir,
    greedy_related = greedy_related,
    thresh = 0.044, # KING kinship threshold (removes ~3rd-degree relatives and closer)
    seed = 1
  )$eid
), ]

dir.create('/users/k1806347/oliverpainfel/Data/ukb/phenotypes')

write.table(psam_unrel_all$IID, '/users/k1806347/oliverpainfel/Data/ukb/phenotypes/unrelated.txt', row.names=F, col.names=F, quote=F)
write.table(psam_unrel_all$rn, '/users/k1806347/oliverpainfel/Data/ukb/phenotypes/unrelated.row_number.txt', row.names=F, col.names=F, quote=F)

Create target_list

mkdir -p /users/k1806347/oliverpainfel/Data/ukb/GenoPred/configs/basic
target_list <- data.frame(
  name='ukb',
  path='/users/k1806347/oliverpainfel/Data/ukb/ukb_symlinks/ukb_imp_maf1_info4',
  type='plink2',
  indiv_report=F,
  unrel='/users/k1806347/oliverpainfel/Data/ukb/phenotypes/unrelated.row_number.txt'
)

write.table(target_list, '/users/k1806347/oliverpainfel/Data/ukb/GenoPred/configs/basic/target_list.txt', col.names=T, row.names=F, quote=F)

Create configfile

# Create config file
conf <- c(
  'outdir: /users/k1806347/oliverpainfel/Data/ukb/GenoPred/output',
  'config_file: /users/k1806347/oliverpainfel/Data/ukb/GenoPred/configs/basic/config.yaml',
  'target_list: /users/k1806347/oliverpainfel/Data/ukb/GenoPred/configs/basic/target_list.txt'
)

write.table(conf, '/users/k1806347/oliverpainfel/Data/ukb/GenoPred/configs/basic/config.yaml', col.names = F, row.names = F, quote = F)

Run pipeline

cd /users/k1806347/oliverpainfel/Software/MyGit/GenoPred/pipeline
git describe --tags
#v2.2.2-213-g2f05853

snakemake \
  --profile slurm \
  --use-conda \
  --configfile=/users/k1806347/oliverpainfel/Data/ukb/GenoPred/configs/basic/config.yaml \
  outlier_detection -n

Phenotype extraction

We will use the same 33 quantitative traits that were used in the PRS-CSx paper (its Supplementary Table 10). We will use ukbkings to extract the phenotypes, remove related individuals, split the EUR participants into training and testing subsets, and adjust the EUR training phenotypes for covariates.

Show code
mkdir /users/k1806347/oliverpainfel/Data/ukb/phenotypes/prscsx
library(ukbkings)
library(dplyr)
library(stringr)
library(data.table)

# create data.frame showing variables used by prscsx
prscsx_fields<-c('30620','30600','30610','30650','30160','21001','21002','30710','30680','4079','30150','30740','30750','30760','50','30030','30020','30780','30120','30050','30060','30040','30130','30140','30080','30010','30700','4080','30690','30860','30870','30000','30730')
prscsx_trait<-c('Alanine aminotransferase','Albumin','Alkaline phosphatase','Aspartate transaminase','Basophil','Body mass index','Body weight','C-reactive protein','Calcium','Diastolic blood pressure','Eosinophil','Glucose','HbA1c','HDL-cholesterol','Height','Hematocrit','Hemoglobin','LDL-cholesterol','Lymphocyte','Mean corpuscular hemoglobin','Mean corpuscular hemoglobin concentration','Mean corpuscular volume','Monocyte','Neutrophil','Platelet','Red blood cell','Serum creatinine','Sytolic blood pressure','Total cholesterol','Total protein','Triglycerides','White blood cell','γ-glutamyl transpeptidase')
prscsx_labels<-c('ALT','ALB','ALP','AST','BAS','BMI','BWT','CRP','Ca','DBP','EOS','GLC','HbA1c','HDL','HT','HCT','HB','LDL','LYM','MCH','MCHC','MCV','MON','NEU','PLT','RBC','CR','SBP','TC','TP','TG','WBC','GGT')

prscsx_dat<-data.frame(
  trait=prscsx_trait,
  labels=prscsx_labels,
  field=prscsx_fields
)

dir.create('/users/k1806347/oliverpainfel/Data/ukb/phenotypes/prscsx')
write.csv(prscsx_dat, '/users/k1806347/oliverpainfel/Data/ukb/phenotypes/prscsx/prscsx_data.csv', row.names = F)
write.table(prscsx_labels, '/users/k1806347/oliverpainfel/Data/ukb/phenotypes/prscsx/prscsx_labels.txt', col.names=F, row.names = F, quote=F)

# Extract outcomes from UKB (project ukb82087)
project_dir <- "/datasets/ukbiobank/ukb82087"

system('rm /users/k1806347/oliverpainfel/Data/ukb/phenotypes/prscsx/prscsx_field_subset.txt')
f <- bio_field(project_dir)
f %>%
    select(field, name) %>%
    filter(str_detect(field, paste(paste0("^", prscsx_dat$field, '-'), collapse='|'))) %>%
    bio_field_add("/users/k1806347/oliverpainfel/Data/ukb/phenotypes/prscsx/prscsx_field_subset.txt")

bio_phen(
    project_dir,
    field = "/users/k1806347/oliverpainfel/Data/ukb/phenotypes/prscsx/prscsx_field_subset.txt",
    out = "/users/k1806347/oliverpainfel/Data/ukb/phenotypes/prscsx/prscsx_field_subset"
)

system("ls -lh /users/k1806347/oliverpainfel/Data/ukb/phenotypes/prscsx/prscsx_field_subset.rds")
df <- readRDS("/users/k1806347/oliverpainfel/Data/ukb/phenotypes/prscsx/prscsx_field_subset.rds")

# Take the first observation of each outcome
library(tidyr)
df_long <- df %>%
  pivot_longer(cols = names(df)[!grepl('eid', names(df))], names_to = "variable", values_to = "outcome") %>%
  drop_na(outcome)
df_long$variable<-gsub('-.*','', df_long$variable)
df_long<-df_long[!duplicated(df_long[,c('eid','variable')]),]

library(data.table)

for(i in 1:nrow(prscsx_dat)){
  tmp <- df_long[df_long$variable == prscsx_dat$field[i],]
  tmp <- data.frame(
    eid = tmp$eid,
    outcome = tmp$outcome
  )
  
  fwrite(
    tmp,
    paste0(
      '/users/k1806347/oliverpainfel/Data/ukb/phenotypes/prscsx/',
      prscsx_dat$label[i],
      '.txt'
    ),
    row.names = F,
    quote = F,
    na = 'NA',
    sep = '\t'
  )
}

# Read in ancestry inference results to determine sample size per population
# Use ancestry information from GenoPred
keep_files<-list.files(path = '/users/k1806347/oliverpainfel/Data/ukb/GenoPred/output/ukb/pcs/within_sample/', pattern = '.keep')

pop_dat<-NULL
for(i in keep_files){
  tmp<-fread(paste0('/users/k1806347/oliverpainfel/Data/ukb/GenoPred/output/ukb/pcs/within_sample/', i))
  names(tmp)<-c('FID','IID')
  tmp$POP<-gsub('.keep','', gsub('ukb.outlier_detection.','',i))
  pop_dat<-rbind(pop_dat, tmp)
}

# Update row number IDs to project specific IDs
psam<-fread('/scratch/prj/ukbiobank/recovered/ukb82087/imputed/ukb82087_imp_chr1_MAF1_INFO4_v1.psam')
psam$rn<-1:nrow(psam)
psam<-psam[,c('IID','rn'), with = F]

pop_dat$FID<-NULL
pop_dat<-merge(pop_dat, psam, by.x='IID', by.y='rn')
pop_dat<-data.frame(
  eid=pop_dat$IID.y,
  POP=pop_dat$POP
)

# Merge ancestry info with phenotype data
df_short <- dcast(df_long, eid ~ variable, value.var = "outcome")
df_short<-merge(df_short, pop_dat, by='eid')

# Remove related individuals
greedy_related <- "/scratch/prj/ukbiobank/recovered/KCL_Data/Software/tools/GreedyRelated-master-v1.2.1/GreedyRelated"
rel<-bio_gen_related_remove(
      project_dir = project_dir,
      greedy_related = greedy_related,
      keep = df_short$eid,
      thresh = 0.044,
      seed = 1
    )$eid

df_short_unrel<-df_short[!(df_short$eid %in% rel),]

n_table<-NULL
for(i in 1:nrow(prscsx_dat)){
  for(j in unique(pop_dat$POP[!is.na(pop_dat$POP)])){
    tmp<-data.frame(
      trait=prscsx_dat$trait[i],
      labels=prscsx_dat$label[i],
      field=prscsx_dat$field[i],
      population=j,
      n=sum(!is.na(df_short[[prscsx_dat$field[i]]][df_short$POP == j])),
      n_unrel=sum(!is.na(df_short_unrel[[prscsx_dat$field[i]]][df_short_unrel$POP == j]))
    )
    n_table<-rbind(n_table, tmp)
  }
}

write.csv(n_table, '/users/k1806347/oliverpainfel/Data/ukb/phenotypes/prscsx/n_table')

# Define training subset for EUR
df_short_unrel_eur<-df_short_unrel[df_short_unrel$POP == 'EUR',]
set.seed(1)
train_size <- floor(0.8 * nrow(df_short_unrel_eur))
train_indices <- sample(seq_len(nrow(df_short_unrel_eur)), size = train_size)

df_short_unrel_eur_train<-df_short_unrel_eur[train_indices,]
df_short_unrel_eur_test<-df_short_unrel_eur[-train_indices,]

n_table_eur<-NULL
for(i in 1:nrow(prscsx_dat)){
  tmp<-data.frame(
    trait=prscsx_dat$trait[i],
    labels=prscsx_dat$label[i],
    field=prscsx_dat$field[i],
    n_train=sum(!is.na(df_short_unrel_eur_train[[prscsx_dat$field[i]]])),
    n_test=sum(!is.na(df_short_unrel_eur_test[[prscsx_dat$field[i]]]))
  )
  n_table_eur<-rbind(n_table_eur, tmp)
}

write.csv(n_table_eur, '/users/k1806347/oliverpainfel/Data/ukb/phenotypes/prscsx/n_table_eur')

df_short_unrel$POP[df_short_unrel$eid %in% df_short_unrel_eur_train$eid]<-'EUR_train'
df_short_unrel$POP[df_short_unrel$eid %in% df_short_unrel_eur_test$eid]<-'EUR_test'

# Output phenotype data for each population
for(i in 1:nrow(prscsx_dat)){
  for(j in unique(df_short_unrel$POP)){
    tmp<-df_short_unrel[df_short_unrel$POP == j,]
    tmp <- data.frame(
      FID = tmp$eid,
      IID = tmp$eid,
      outcome = tmp[[prscsx_dat$field[i]]]
    )
    
    fwrite(
      tmp,
      paste0(
        '/users/k1806347/oliverpainfel/Data/ukb/phenotypes/prscsx/',
        prscsx_dat$label[i],
        '.unrel.', j, '.txt'
      ),
      row.names = F,
      quote = F,
      na = 'NA',
      sep = '\t'
    )
    
    # Write out with row number based IDs
    pheno<-merge(tmp, psam, by='IID')
    pheno<-data.frame(
      FID=pheno$rn,
      IID=pheno$rn,
      outcome=pheno$outcome
    )
  
    fwrite(
      pheno,
      paste0(
        '/users/k1806347/oliverpainfel/Data/ukb/phenotypes/prscsx/',
        prscsx_dat$label[i],
        '.unrel.', j, '.row_number.txt'
      ),
      row.names = F,
      quote = F,
      na = 'NA',
      sep = '\t'
    )
  }
}

# For the EUR training GWAS, normalise and regress covariates
# Use age, sex and PCs as covariates
# Read in PC data released by UKB
qc_dat<-bio_gen_sqc(project_dir)
qc_dat<-qc_dat[,c('eid',paste0('pc',1:20))]
df_short_unrel<-merge(df_short_unrel, qc_dat, by='eid')

# Read in sex and age information
system('rm /users/k1806347/oliverpainfel/Data/ukb/phenotypes/age_sex_field_subset.txt')
f <- bio_field(project_dir)
f %>%
    select(field, name) %>%
    filter(str_detect(field, "^21022-0.0|^31-0.0")) %>%
    bio_field_add("/users/k1806347/oliverpainfel/Data/ukb/phenotypes/age_sex_field_subset.txt")

bio_phen(
    project_dir,
    field = "/users/k1806347/oliverpainfel/Data/ukb/phenotypes/age_sex_field_subset.txt",
    out = "/users/k1806347/oliverpainfel/Data/ukb/phenotypes/age_sex_field_subset"
)

system("ls -lh /users/k1806347/oliverpainfel/Data/ukb/phenotypes/age_sex_field_subset.rds")
df <- readRDS("/users/k1806347/oliverpainfel/Data/ukb/phenotypes/age_sex_field_subset.rds")
names(df)<-gsub('-.*','',names(df))
names(df)[names(df) == '31']<-'sex'
names(df)[names(df) == '21022']<-'age'
df_short_unrel<-merge(df_short_unrel, df, by='eid')

# Within each population, normalise each outcome and regress out covariates
library(RNOmni)
covs<-c(paste0('pc',1:20), 'sex', 'age')
df_short_unrel_eur_train<-df_short_unrel[df_short_unrel$POP == 'EUR_train',]
for(i in 1:nrow(prscsx_dat)){
  tmp<-df_short_unrel_eur_train[!is.na(df_short_unrel_eur_train[[prscsx_dat$field[i]]]),]
  tmp$pheno_norm<-RNOmni::RankNorm(tmp[[prscsx_dat$field[i]]])
  mod<-lm(as.formula(paste0('pheno_norm ~ ', paste(covs, collapse=' + '))), data=tmp)
  tmp$pheno_norm_resid_scale<-as.numeric(scale(resid(mod)))
  tmp<-data.frame(
    FID=tmp$eid,
    IID=tmp$eid,
    outcome=tmp$pheno_norm_resid_scale
  )
  
  fwrite(
    tmp,
    paste0(
      '/users/k1806347/oliverpainfel/Data/ukb/phenotypes/prscsx/',
      prscsx_dat$label[i],
      '.unrel.EUR_train.norm_resid_scale.txt'
    ),
    row.names = F,
    quote = F,
    na = 'NA',
    sep = '\t'
  )
}

# Convert to row number based IDs so it will work with UKB geno data from GenoPred
for(i in 1:nrow(prscsx_dat)){
  pheno<-fread(paste0(
      '/users/k1806347/oliverpainfel/Data/ukb/phenotypes/prscsx/',
      prscsx_dat$label[i],
      '.unrel.EUR_train.norm_resid_scale.txt'
    ))
  
  pheno<-merge(pheno, psam, by='IID')
  pheno<-data.frame(
    FID=pheno$rn,
    IID=pheno$rn,
    outcome=pheno$outcome
  )
  
  fwrite(
    pheno,
    paste0(
      '/users/k1806347/oliverpainfel/Data/ukb/phenotypes/prscsx/',
      prscsx_dat$label[i],
      '.unrel.EUR_train.norm_resid_scale.row_number.txt'
    ),
    row.names = F,
    quote = F,
    na = 'NA',
    sep = '\t'
  )
}

GWAS sumstats

We will generate EUR GWAS using the EUR training subset of UKB. BBJ will be used for EAS GWAS, and UGR will be used for AFR GWAS.


UKB GWAS

Show code
for pheno in $(cat /users/k1806347/oliverpainfel/Data/ukb/phenotypes/prscsx/prscsx_labels.txt); do
  mkdir -p /users/k1806347/oliverpainfel/Data/ukb/gwas/${pheno}
  for chr in $(seq 1 22); do
      sbatch -p neurohack_cpu --wrap="/users/k1806347/oliverpainfel/Software/plink2 \
        --pfile /users/k1806347/oliverpainfel/Data/ukb/GenoPred/output/ukb/geno/ukb.ref.chr${chr} \
        --pheno /users/k1806347/oliverpainfel/Data/ukb/phenotypes/prscsx/${pheno}.unrel.EUR_train.norm_resid_scale.row_number.txt \
        --linear omit-ref cols=+a1freq,+ax \
        --maf 0.01 \
        --geno 0.05 \
        --out /users/k1806347/oliverpainfel/Data/ukb/gwas/${pheno}/ukb.eur_train.${pheno}.chr${chr}"
  done
done

# Once complete, merge results across chromosomes
for pheno in $(cat /users/k1806347/oliverpainfel/Data/ukb/phenotypes/prscsx/prscsx_labels.txt); do
  head -n 1 /users/k1806347/oliverpainfel/Data/ukb/gwas/${pheno}/ukb.eur_train.${pheno}.chr1.outcome.glm.linear > /users/k1806347/oliverpainfel/Data/ukb/gwas/${pheno}/ukb.eur_train.${pheno}.GW.txt
    for chr in $(seq 1 22); do
      tail -n +2 /users/k1806347/oliverpainfel/Data/ukb/gwas/${pheno}/ukb.eur_train.${pheno}.chr${chr}.outcome.glm.linear >> /users/k1806347/oliverpainfel/Data/ukb/gwas/${pheno}/ukb.eur_train.${pheno}.GW.txt
    done
    
    # Remove REF and ALT columns and rename AX column to A2
    cut -f 4,5 --complement /users/k1806347/oliverpainfel/Data/ukb/gwas/${pheno}/ukb.eur_train.${pheno}.GW.txt | awk 'BEGIN{FS=OFS="\t"} NR==1 {$5="A2"} 1' > temp.txt && mv temp.txt /users/k1806347/oliverpainfel/Data/ukb/gwas/${pheno}/ukb.eur_train.${pheno}.GW.txt

    gzip /users/k1806347/oliverpainfel/Data/ukb/gwas/${pheno}/ukb.eur_train.${pheno}.GW.txt
done

# Delete per chromosome files
rm /users/k1806347/oliverpainfel/Data/ukb/gwas/*/*chr*

Download BBJ sumstats

Show code
# Identify wget command for relevant phenotypes
library(data.table)

# Read in BBJ GWAS info from BBJ website
bbj_gwas<-fread('~/oliverpainfel/Data/GWAS_sumstats/BBJ/prscsx/bbj_gwas.csv')

# Map BBJ trait names to those used for UKB
bbj_gwas$bbj_labels <-
  gsub("\\)", '', gsub(".*\\(", '', bbj_gwas$Phenotype))
bbj_gwas$trait <- gsub(" \\(.*", '', bbj_gwas$Phenotype)

bbj_gwas$Category<-NULL
bbj_gwas$Phenotype<-NULL

# Update trait labels to match what was used in prscsx paper
bbj_gwas$trait<-gsub(' count','', bbj_gwas$trait)
bbj_gwas$trait[bbj_gwas$trait == 'G-glutamyl transpeptidase']<-'γ-glutamyl transpeptidase'

# Merge the bbj trait info with the prscsx trait info
prscsx_dat<-fread('/users/k1806347/oliverpainfel/Data/ukb/phenotypes/prscsx/prscsx_data.csv')
prscsx_dat <- merge(bbj_gwas, prscsx_dat, by='trait', all=T)

write.csv(prscsx_dat, '~/oliverpainfel/Data/GWAS_sumstats/BBJ/prscsx/bbj_gwas_prscsx.csv', row.names = F)

# Create column showing what label is used in the wget command
prscsx_dat$wget_label <-
  gsub('.v1.zip', '', gsub('.*hum0197.v3.BBJ.', '', prscsx_dat$wget))

# Write a table showing label matching prscsx info and wget url
write.table(prscsx_dat[, c('labels', 'wget', 'wget_label'), with=F], '~/oliverpainfel/Data/GWAS_sumstats/BBJ/prscsx/bbj_gwas_wget.txt', col.names = F, row.names = F, quote = F)
# wget and unzip sumstats
for pheno in $(cat /users/k1806347/oliverpainfel/Data/ukb/phenotypes/prscsx/prscsx_labels.txt); do
  url=$(awk -v var="$pheno" '$1 == var {print $2}' ~/oliverpainfel/Data/GWAS_sumstats/BBJ/prscsx/bbj_gwas_wget.txt)
  sbatch -p neurohack_cpu --wrap="wget -O /users/k1806347/oliverpainfel/Data/GWAS_sumstats/BBJ/prscsx/${pheno}.zip ${url}
    unzip /users/k1806347/oliverpainfel/Data/GWAS_sumstats/BBJ/prscsx/${pheno}.zip -d /users/k1806347/oliverpainfel/Data/GWAS_sumstats/BBJ/prscsx
    rm /users/k1806347/oliverpainfel/Data/GWAS_sumstats/BBJ/prscsx/${pheno}.zip"
done

# Delete X chromosome sumstats and rename files to be consistent with prscsx sumstat info
for pheno in $(cat /users/k1806347/oliverpainfel/Data/ukb/phenotypes/prscsx/prscsx_labels.txt); do
  wget_label=$(awk -v var="$pheno" '$1 == var {print $3}' ~/oliverpainfel/Data/GWAS_sumstats/BBJ/prscsx/bbj_gwas_wget.txt)
if [ "$pheno" == "HT" ]; then
    mv ~/oliverpainfel/Data/GWAS_sumstats/BBJ/prscsx/hum0197.v3.BBJ.${wget_label}.v1/GWASsummary_Height_Japanese_SakaueKanai2020.auto.txt.gz ~/oliverpainfel/Data/GWAS_sumstats/BBJ/prscsx/bbj.HT.txt.gz
  else
    mv ~/oliverpainfel/Data/GWAS_sumstats/BBJ/prscsx/hum0197.v3.BBJ.${wget_label}.v1/GWASsummary_${wget_label}_Japanese_SakaueKanai2020.auto.txt.gz ~/oliverpainfel/Data/GWAS_sumstats/BBJ/prscsx/bbj.${pheno}.txt.gz
  fi
  rm -r ~/oliverpainfel/Data/GWAS_sumstats/BBJ/prscsx/hum0197.v3.BBJ.${wget_label}.v1
done

# Reformat so the BOLT-LMM infinitesimal P value (P_BOLT_LMM_INF) is used as P by GenoPred
for pheno in $(cat /users/k1806347/oliverpainfel/Data/ukb/phenotypes/prscsx/prscsx_labels.txt); do
sbatch -p neurohack_cpu --wrap="/users/k1806347/oliverpainfel/Software/pigz/pigz -dc ~/oliverpainfel/Data/GWAS_sumstats/BBJ/prscsx/bbj.${pheno}.txt.gz | awk 'BEGIN {OFS=\"\t\"} {print \$2, \$3, \$4, \$6, \$7, \$8, \$9, \$12, \$13, \$15}' | sed '1s/P_BOLT_LMM_INF/P/' | /users/k1806347/oliverpainfel/Software/pigz/pigz -c > ~/oliverpainfel/Data/GWAS_sumstats/BBJ/prscsx/bbj.${pheno}.reformat.txt.gz"
done

Download UGR sumstats

Show code
# Identify wget command for relevant phenotypes
library(data.table)

# Read in UGR GWAS info from GWAS catalogue
ugr_gwas<-fread('~/oliverpainfel/Data/GWAS_sumstats/UGR/ugr_sumstats.csv')

# Map UGR trait names to those used for UKB
ugr_gwas$trait<-gsub(' levels','', ugr_gwas$reportedTrait)
ugr_gwas$trait<-gsub(' count','', ugr_gwas$trait)

ugr_to_prscsx <- c(
  "Aspartate aminotransferase" = "Aspartate transaminase",
  "Bilirubin" = NA,  # No direct match
  "Eosinophils" = "Eosinophil",
  "Gamma glutamyl transferase" = "γ-glutamyl transpeptidase",
  "HDL cholesterol" = "HDL-cholesterol",
  "Hemoglobin A1c" = "HbA1c",
  "Hip circumference" = NA,  # No direct match
  "LDL cholesterol" = "LDL-cholesterol",
  "Red cell distribution width" = NA,  # No direct match
  "Serum albumin" = "Albumin",
  "Serum alkaline phosphatase" = "Alkaline phosphatase",
  "Systolic blood pressure" = "Sytolic blood pressure",
  "Triglyceride" = "Triglycerides",
  "Waist circumference" = NA,  # No direct match
  "Waist-hip ratio" = NA,  # No direct match
  "Weight" = "Body weight"
)

ugr_gwas$trait <- ifelse(ugr_gwas$trait %in% names(ugr_to_prscsx),
                                   ugr_to_prscsx[ugr_gwas$trait],
                                   ugr_gwas$trait)

# Merge the ugr trait info with the prscsx trait info
prscsx_dat<-fread('/users/k1806347/oliverpainfel/Data/ukb/phenotypes/prscsx/prscsx_data.csv')
prscsx_dat <- merge(ugr_gwas, prscsx_dat, by='trait')

write.csv(prscsx_dat, '~/oliverpainfel/Data/GWAS_sumstats/UGR/ugr_sumstats_prscsx.csv', row.names = F)

# Create column indicating wget command
prscsx_dat$wget<-NA
for(i in 1:nrow(prscsx_dat)){
  if(!grepl('.txt', prscsx_dat$wget[i])){
    print(i)
    Sys.sleep(2)
    log<-system(paste0('curl --max-time 10 ', gsub('http:','ftp:', prscsx_dat$summaryStatistics[i]), '/'), intern = T)
    log<-log[grepl('annotated.txt.gz|annotated.txt', log)]
    log<-gsub('.* ','', log)
    prscsx_dat$wget[i]<-paste0(prscsx_dat$summaryStatistics[i], '/', log)
  }
}
# Note this has to be run a few times due to some requests being blocked.
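# Optional: a minimal retry sketch (not used in the original analysis) that
# re-runs the loop above until every wget URL has been filled in. Note it will
# loop indefinitely if a summary statistics URL is permanently unreachable.
while (any(!grepl('.txt', prscsx_dat$wget))) {
  for (i in which(!grepl('.txt', prscsx_dat$wget))) {
    Sys.sleep(2)
    log <- tryCatch(
      suppressWarnings(system(paste0('curl --max-time 10 ', gsub('http:', 'ftp:', prscsx_dat$summaryStatistics[i]), '/'), intern = TRUE)),
      error = function(e) character(0)
    )
    log <- log[grepl('annotated.txt.gz|annotated.txt', log)]
    if (length(log) == 1) {
      prscsx_dat$wget[i] <- paste0(prscsx_dat$summaryStatistics[i], '/', gsub('.* ', '', log))
    }
  }
}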

# Write a table showing label matching prscsx info and wget url
write.table(prscsx_dat[, c('labels', 'wget'), with=F], '~/oliverpainfel/Data/GWAS_sumstats/UGR/ugr_sumstats_wget.txt', col.names = F, row.names = F, quote = F)
# wget and unzip sumstats
for pheno in $(cat ~/oliverpainfel/Data/GWAS_sumstats/UGR/ugr_sumstats_wget.txt | cut -d' ' -f 1); do
  url=$(awk -v var="$pheno" '$1 == var {print $2}' ~/oliverpainfel/Data/GWAS_sumstats/UGR/ugr_sumstats_wget.txt)
  sbatch -p cpu --wrap="wget -O ~/oliverpainfel/Data/GWAS_sumstats/UGR/${pheno}.txt.gz ${url}"
done
library(future.batchtools)
library(furrr)
library(data.table)
ugr_data<-fread('~/oliverpainfel/Data/GWAS_sumstats/UGR/ugr_sumstats_prscsx.csv')

plan(batchtools_slurm(resources = list(
  time = "12:00:00",
  ntasks = 2,
  mem = "10g",
  partition = "neurohack_cpu"
)))

furrr::future_map_dfr(1:nrow(ugr_data), function(i) {
  print(i)
  sumstats <- fread(paste0("~/oliverpainfel/Data/GWAS_sumstats/UGR/", ugr_data$label[i], ".txt.gz"))
  sumstats <- sumstats[, names(sumstats) %in% c("snpid", "pval_fe", "se_fe") | grepl('^beta_|^af_|^no_', names(sumstats)), with=F]

  # Extract CHR, BP, A1, A2 from snpid
  snp_split <- tstrsplit(sumstats$snpid, ":", fixed = TRUE)
  sumstats[, `:=`(CHR = snp_split[[1]], BP = snp_split[[2]], A1 = snp_split[[3]], A2 = snp_split[[4]])]

  # Set no_ and af_ to NA if beta is NA
  cohorts <- gsub('^no_','', names(sumstats)[grepl('^no_', names(sumstats))])
  for (cohort in cohorts) {
    sumstats[[paste0('no_', cohort)]][is.na(sumstats[[paste0('beta_', cohort)]])] <- NA
    sumstats[[paste0('af_', cohort)]][is.na(sumstats[[paste0('beta_', cohort)]])] <- NA
  }

  # Calculate sample size weighted average for allele frequency
  for (cohort in cohorts) {
    sumstats[[paste0('af_', cohort, '_weighted')]] <- sumstats[[paste0('af_', cohort)]] * sumstats[[paste0('no_', cohort)]]
  }
  
  # Calculate total N and frequency
  sumstats[, N := rowSums(.SD, na.rm = TRUE), .SDcols = patterns("^no_")]
  sumstats[, FREQ := rowSums(.SD, na.rm = TRUE) / N, .SDcols = patterns("weighted$")]

  # Rename columns
  setnames(sumstats, old = c('beta_fe', 'se_fe', 'pval_fe'), new = c('BETA', 'SE', 'P'))

  # Select relevant columns and remove rows with missing data
  sumstats <- sumstats[, .(CHR, BP, A1, A2, BETA, SE, P, FREQ, N)]
  sumstats <- sumstats[complete.cases(sumstats)]
  
  fwrite(sumstats, paste0("~/oliverpainfel/Data/GWAS_sumstats/UGR/", ugr_data$label[i], ".reformat.txt.gz"), sep=' ', quote=F, na='NA')
  
})

Heritability and polygenicity estimation

We will estimate SNP-h2 using LD Score regression (LDSC) and the cross-population genetic correlation (rG) using POPCORN. POPCORN can also estimate SNP-h2, but its estimates vary according to the other GWAS included due to differences in SNP overlap, so LDSC is used for SNP-h2. We use the sumstats QC’d by GenoPred. To estimate polygenicity, we will use AVENGEME based on pT+clump score association results, which we will generate using GenoPred.
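
To preview how the AVENGEME step works, below is a minimal sketch of the estimatePolygenicModel() call used later in this document, with made-up inputs (the Z statistics, SNP count, sample sizes, and thresholds are toy values for illustration only).

# Toy illustration of the AVENGEME model (not part of the analysis).
# In the real analysis, p takes the Z statistics of the pT+clump score-phenotype
# associations produced by model_builder, nsnp is the number of clumped SNPs,
# and n gives the discovery GWAS and target sample sizes.
library(avengeme)
toy <- estimatePolygenicModel(
  p = c(10, 12, 13, 14, 15),                # Z statistics at each p-value threshold (toy values)
  nsnp = 60000,                             # number of clumped SNPs (toy value)
  n = c(250000, 50000),                     # discovery N, target N (toy values)
  pupper = c(0, 1e-8, 1e-6, 1e-4, 1e-2, 1), # thresholds bounding each score, with a leading 0
  fixvg2pi02 = TRUE,
  alpha = 0.05
)
toy$vg   # SNP-based heritability estimate with confidence interval
toy$pi0  # proportion of null SNPs; polygenicity is 1 - pi0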


QC GWAS sumstats

Use GenoPred for this.

Show code

Prepare configuration

######
# gwas_list
######

dir.create('/users/k1806347/oliverpainfel/Data/ukb/GenoPred/configs/crosspop')

prscsx_dat<-fread('/users/k1806347/oliverpainfel/Data/ukb/phenotypes/prscsx/prscsx_data.csv')

gwas_list_eur<-data.frame(
  name=paste0(prscsx_dat$labels,'_UKB'),
  path=paste0('/users/k1806347/oliverpainfel/Data/ukb/gwas/',prscsx_dat$labels,'/ukb.eur_train.',prscsx_dat$labels,'.GW.txt.gz'),
  population='EUR',
  n=NA,
  sampling=NA,
  prevalence=NA,
  mean=0,
  sd=1,
  label=paste0('"', prscsx_dat$trait, ' (UKB)"')
)

bbj_info<-fread('~/oliverpainfel/Data/GWAS_sumstats/BBJ/prscsx/bbj_gwas_prscsx.csv')
bbj_info<-bbj_info[bbj_info$labels %in% prscsx_dat$labels,]

gwas_list_eas<-data.frame(
  name=paste0(bbj_info$labels,'_BBJ'),
  path=paste0('/users/k1806347/oliverpainfel/Data/GWAS_sumstats/BBJ/prscsx/bbj.',bbj_info$labels,'.reformat.txt.gz'),
  population='EAS',
  n=as.numeric(gsub(',','',bbj_info$`No. samples`)),
  sampling=NA,
  prevalence=NA,
  mean=0,
  sd=1,
  label=paste0('"', bbj_info$trait, ' (BBJ)"')
)

ugr_data<-fread('~/oliverpainfel/Data/GWAS_sumstats/UGR/ugr_sumstats_prscsx.csv')
ugr_data<-ugr_data[ugr_data$labels %in% prscsx_dat$labels,]

gwas_list_afr<-data.frame(
  name=paste0(ugr_data$labels,'_UGR'),
  path=paste0('/users/k1806347/oliverpainfel/Data/GWAS_sumstats/UGR/',ugr_data$labels,'.reformat.txt.gz'),
  population='AFR',
  n=NA,
  sampling=NA,
  prevalence=NA,
  mean=0,
  sd=1,
  label=paste0('"', ugr_data$trait, ' (UGR)"')
)
gwas_list<-do.call(rbind, list(gwas_list_eur, gwas_list_eas, gwas_list_afr))

# Create file listing phenotypes in common between AFR, EAS and EUR
pheno <- gsub('_.*', '', gwas_list$name)
pheno_intersect <- Reduce(intersect, 
                           list(
                             pheno[gwas_list$population == 'EUR'],
                             pheno[gwas_list$population == 'EAS'],
                             pheno[gwas_list$population == 'AFR']
                             )
                           )

# Restrict gwas_list to intersecting phenotypes
gwas_list<-gwas_list[pheno %in% pheno_intersect,]

write.table(gwas_list, '/users/k1806347/oliverpainfel/Data/ukb/GenoPred/configs/crosspop/gwas_list_all.txt', col.names = T, row.names = F, quote = F)

write.table(pheno_intersect, '/users/k1806347/oliverpainfel/Data/ukb/phenotypes/prscsx/pheno_eur_eas_afr.txt', col.names = F, row.names = F, quote = F)

######
# config
######

config<-c(
  "outdir: /users/k1806347/oliverpainfel/Data/ukb/GenoPred/output",
  "config_file: /users/k1806347/oliverpainfel/Data/ukb/GenoPred/configs/crosspop/config_all.yaml",
  "gwas_list: /users/k1806347/oliverpainfel/Data/ukb/GenoPred/configs/crosspop/gwas_list_all.txt",
  "target_list: /users/k1806347/oliverpainfel/Data/ukb/GenoPred/configs/basic/target_list.txt",
  "pgs_methods: ['ptclump']",
  "cores_prep_pgs: 1",
  "cores_target_pgs: 20"
)

write.table(config, '/users/k1806347/oliverpainfel/Data/ukb/GenoPred/configs/crosspop/config_all.yaml', col.names = F, row.names = F, quote = F)

Run pipeline

snakemake \
  --profile slurm \
  --use-conda \
  --configfile=/users/k1806347/oliverpainfel/Data/ukb/GenoPred/configs/crosspop/config_all.yaml \
  target_pgs -n

Reformat for LDSC and POPCORN

library(data.table)
dir.create('/users/k1806347/oliverpainfel/Analyses/crosspop/popcorn/sumstats', recursive = T)
gwas_list<-fread('/users/k1806347/oliverpainfel/Data/ukb/GenoPred/configs/crosspop/gwas_list_all.txt')

for(i in 1:nrow(gwas_list)){
  if(
    file.exists(
      paste0(
        "/users/k1806347/oliverpainfel/Analyses/crosspop/popcorn/sumstats/",
        gwas_list$name[i], 
        ".sumstats.gz"))){
    next    
  }
  print(i)
  gwas_file <-
    paste0(
      "/users/k1806347/oliverpainfel/Data/ukb/GenoPred/output/reference/gwas_sumstat/",
      gwas_list$name[i],
      "/",
      gwas_list$name[i],
      "-cleaned.gz"
    )
  
  gwas_header <- fread(gwas_file, nrows = 1)
  cols_index <- which(names(gwas_header) %in% c('SNP','A1','A2','BETA','SE','P','N'))
  
  system(
    paste0(
      "zcat ",
      gwas_file,
      " | cut -f ", 
      paste0(cols_index, collapse = ','),
      " | sed -e '1s/BETA/beta/'",
      " | /users/k1806347/oliverpainfel/Software/pigz/pigz -f",
      " > /users/k1806347/oliverpainfel/Analyses/crosspop/popcorn/sumstats/",
      gwas_list$name[i], 
      ".sumstats.gz"
      )
    )
}

LDSC

Show code
conda activate ldsc

for pop in $(echo EUR EAS AFR);do
  if [ "$pop" == "EUR" ]; then
      samp="UKB"
  fi
  if [ "$pop" == "EAS" ]; then
      samp="BBJ"
  fi
  if [ "$pop" == "AFR" ]; then
      samp="UGR"
  fi
  
  for pheno in $(cat /users/k1806347/oliverpainfel/Data/ukb/phenotypes/prscsx/pheno_eur_eas_afr.txt); do
    mkdir -p /users/k1806347/oliverpainfel/Analyses/crosspop/ldsc/sumstats

    sbatch --mem 10G -n 1 -p neurohack_cpu --wrap="/users/k1806347/oliverpainfel/Software/ldsc/munge_sumstats.py \
     --sumstats /users/k1806347/oliverpainfel/Analyses/crosspop/popcorn/sumstats/${pheno}_${samp}.sumstats.gz \
     --out /users/k1806347/oliverpainfel/Analyses/crosspop/ldsc/sumstats/${pheno}_${samp}"

  done
done

for pop in $(echo EUR EAS AFR);do
  if [ "$pop" == "EUR" ]; then
      samp="UKB"
  fi
  if [ "$pop" == "EAS" ]; then
      samp="BBJ"
  fi
  if [ "$pop" == "AFR" ]; then
      samp="UGR"
  fi
  
  for pheno in $(cat /users/k1806347/oliverpainfel/Data/ukb/phenotypes/prscsx/pheno_eur_eas_afr.txt); do
    mkdir -p /users/k1806347/oliverpainfel/Analyses/crosspop/ldsc/results/${pheno}/${pop}

    sbatch --mem 10G -n 1 -p neurohack_cpu --wrap="/users/k1806347/oliverpainfel/Software/ldsc/ldsc.py \
     --h2 /users/k1806347/oliverpainfel/Analyses/crosspop/ldsc/sumstats/${pheno}_${samp}.sumstats.gz \
     --ref-ld /users/k1806347/oliverpainfel/Software/MyGit/GenoPred/pipeline/resources/data/ld_scores/UKBB.${pop}.rsid \
     --w-ld /users/k1806347/oliverpainfel/Software/MyGit/GenoPred/pipeline/resources/data/ld_scores/UKBB.${pop}.rsid \
     --out /users/k1806347/oliverpainfel/Analyses/crosspop/ldsc/results/${pheno}/${pop}/res"
     
  done
done

POPCORN

Show code

Calculate CSCOREs


# Subset the reference data into relevant populations
for pop in $(echo EUR EAS AFR); do
  mkdir -p /users/k1806347/oliverpainfel/Data/POPCORN/1KG/temp
  for chr in $(seq 1 22); do
    /users/k1806347/oliverpainfel/Software/plink2 \
      --pfile /users/k1806347/oliverpainfel/Software/MyGit/GenoPred/pipeline/resources/data/ref/ref.chr${chr} \
      --keep /users/k1806347/oliverpainfel/Software/MyGit/GenoPred/pipeline/resources/data/ref/keep_files/${pop}.keep \
      --make-bed \
      --out /users/k1806347/oliverpainfel/Data/POPCORN/1KG/temp/ref.${pop}.chr${chr}
    done
done

conda activate /scratch/prj/oliverpainfel/recovered/.conda/envs/popcorn
for pop in $(echo EAS AFR); do
  mkdir -p /users/k1806347/oliverpainfel/Data/POPCORN/1KG/EUR_${pop}_CSCORES
  for chr in $(seq 1 22); do
    sbatch --mem 10G -n 1 -p neurohack_cpu --wrap="popcorn \
      compute \
      -v 1 \
      --bfile1 /users/k1806347/oliverpainfel/Data/POPCORN/1KG/temp/ref.EUR.chr${chr} \
      --bfile2 /users/k1806347/oliverpainfel/Data/POPCORN/1KG/temp/ref.${pop}.chr${chr} \
      /users/k1806347/oliverpainfel/Data/POPCORN/1KG/EUR_${pop}_CSCORES/scores_chr${chr}.txt"
  done
done

for pop in $(echo EAS AFR); do
  cat /users/k1806347/oliverpainfel/Data/POPCORN/1KG/EUR_${pop}_CSCORES/scores_chr*.txt > /users/k1806347/oliverpainfel/Data/POPCORN/1KG/EUR_${pop}_CSCORES/scores_all.txt
done

rm -r /users/k1806347/oliverpainfel/Data/POPCORN/1KG/temp
rm /users/k1806347/oliverpainfel/Data/POPCORN/1KG/EUR_*_CSCORES/*chr*.txt

Run POPCORN

conda activate popcorn
for pop in $(echo EAS AFR);do
  if [ "$pop" == "EAS" ]; then
      samp="BBJ"
  fi
  if [ "$pop" == "AFR" ]; then
      samp="UGR"
  fi
  
  for pheno in $(cat /users/k1806347/oliverpainfel/Data/ukb/phenotypes/prscsx/prscsx_labels.txt); do
    mkdir -p /users/k1806347/oliverpainfel/Analyses/crosspop/popcorn/results/${pheno}/EUR_${pop}
    sbatch --mem 10G -n 1 -p neurohack_cpu --wrap="popcorn \
       fit -v 3 \
       --cfile /users/k1806347/oliverpainfel/Data/POPCORN/1KG/EUR_${pop}_CSCORES/scores_all.txt \
       --sfile1 /users/k1806347/oliverpainfel/Analyses/crosspop/popcorn/sumstats/${pheno}_UKB.sumstats.gz \
       --sfile2 /users/k1806347/oliverpainfel/Analyses/crosspop/popcorn/sumstats/${pheno}_${samp}.sumstats.gz \
       --gen_effect \
       /users/k1806347/oliverpainfel/Analyses/crosspop/popcorn/results/${pheno}/EUR_${pop}/rG_gen_effect"
  done
done

Plot LDSC and POPCORN results

Show code
library(data.table)
library(ggplot2)
library(cowplot)

# Read in phenotypes
pheno_intersect <- read.table('/users/k1806347/oliverpainfel/Data/ukb/phenotypes/prscsx/pheno_eur_eas_afr.txt', header=F)$V1

# Plot the heritability estimates
h2_res <- NULL

for(pop in c('AFR','EAS', 'EUR')){
  for(pheno in pheno_intersect){
    log <-
      readLines(
        paste0(
          '/users/k1806347/oliverpainfel/Analyses/crosspop/ldsc/results/',
          pheno,
          '/',
          pop,
          '/res.log'
        )
      )
    
    h2 <- log[grepl('Total Observed scale h2:', log)]
    h2_est <- as.numeric(gsub(' .*','', gsub('Total Observed scale h2: ', '', h2)))
    h2_se <- as.numeric(gsub("\\)",'', gsub(".* \\(", '', h2)))
    int <- log[grepl('Intercept:', log)]
    int_est <- as.numeric(gsub(' .*','', gsub('Intercept: ', '', int)))
    int_se <- as.numeric(gsub("\\)",'', gsub(".* \\(", '', int)))
    lambda <- log[grepl('Lambda GC:', log)]
    lambda <- as.numeric(gsub('.* ','', lambda))
    
    h2_res <- rbind(
      h2_res,
      data.table(
        Population = pop,
        Phenotype = pheno,
        h2_est = h2_est,
        h2_se = h2_se,
        int_est = int_est,
        int_se = int_se,
        lambda = lambda
      )
    )
  }
}

write.csv(h2_res, '/users/k1806347/oliverpainfel/Analyses/crosspop/ldsc/results.csv', row.names = F, quote = F)

png('~/oliverpainfel/Analyses/crosspop/plots/ldsc_h2.png', res = 100, width = 700, height = 300, units = 'px')
ggplot(h2_res, aes(x = Phenotype, y = h2_est, fill = Population)) +
  geom_bar(stat="identity", position=position_dodge(preserve = "single"), width = 0.7) +
  geom_errorbar(aes(ymin=h2_est-h2_se, ymax=h2_est+h2_se), width=.2, position=position_dodge(width = 0.7, preserve = "single")) +
  labs(y="SNP-based Heritability (SE)", fill = NULL) +
  theme_half_open() +
  theme(axis.text.x = element_text(angle = 45, hjust = 1),
        legend.position = "top",
        legend.key.spacing.x = unit(2, "cm"),
        legend.justification = "center") +
  background_grid(major = 'y', minor = 'y')
dev.off()

# Plot rG estimates
rg_res <- NULL
for(pop in c('AFR','EAS')){
  for(pheno in h2_res$Phenotype){
    pop_res_i<-fread(paste0('/users/k1806347/oliverpainfel/Analyses/crosspop/popcorn/results/', pheno, '/EUR_', pop, '/rG_gen_effect'))
    names(pop_res_i) <- c('Test','Estimate','SE','Z','P')
    pop_res_i <- pop_res_i[pop_res_i$Test == 'pge',]
    pop_res_i$Population_1 <- 'EUR'
    pop_res_i$Population_2 <- pop
    pop_res_i$Phenotype <- pheno
    rg_res <- rbind(rg_res, pop_res_i)
  }
}

rg_res$Comparison <- paste0(rg_res$Population_1, ' vs. ', rg_res$Population_2)

write.csv(rg_res, '/users/k1806347/oliverpainfel/Analyses/crosspop/popcorn/results.csv', row.names = F, quote = F)

png('~/oliverpainfel/Analyses/crosspop/plots/popcorn_rg.png', res = 100, width = 700, height = 300, units = 'px')
ggplot(rg_res, aes(x = Phenotype, y = Estimate, fill = Comparison)) +
  geom_bar(stat="identity", position=position_dodge(), width = 0.7) +
  geom_errorbar(aes(ymin=Estimate-SE, ymax=Estimate+SE), width=.2, position=position_dodge(width = 0.7)) +
  labs(y="SNP-based\nGenetic Correlation (SE)", fill = NULL) +
  theme_half_open() +
  theme(axis.text.x = element_text(angle = 45, hjust = 1),
        legend.position = "top",
        legend.key.spacing.x = unit(2, "cm"),
        legend.justification = "center") +
  background_grid(major = 'y', minor = 'y')
dev.off()
Show LDSC SNP-heritability

Show POPCORN genetic correlation estimates


AVENGEME

Show code

Create predictor list

setwd('~/oliverpainfel/Software/MyGit/GenoPred/pipeline/')
source('../functions/misc.R')
source_all('../functions')
library(data.table)

# Get some key variables from config
config<-'/users/k1806347/oliverpainfel/Data/ukb/GenoPred/configs/crosspop/config_all.yaml'
pgs_methods <- read_param(config = config, param = 'pgs_methods', return_obj = F)
outdir <- read_param(config = config, param = 'outdir', return_obj = F)

# Get a list of score files
scores <- list_score_files(config)

# Read in phenotypes
pheno_intersect <- read.table('/users/k1806347/oliverpainfel/Data/ukb/phenotypes/prscsx/pheno_eur_eas_afr.txt', header=F)$V1

# Create predictor list files for each target population (EUR, EAS and AFR)
pop <- c('EUR','EAS','AFR')
for(trait_i in pheno_intersect){
  # Make a group containing both GWAS for each single source method
  # Make a group for each multisource method
  scores_i <- scores[grepl(paste0('^', trait_i, '_'), scores$name),]
  scores_i$group <- scores_i$method
  
  for(pop_i in pop){
    # Select the GWAS sample matching the target population (pop_i)
    if(pop_i == 'EAS'){
      samp_i <- 'BBJ'
    }
    if(pop_i == 'AFR'){
      samp_i <- 'UGR'
    }
    if(pop_i == 'EUR'){
      samp_i <- c('UKB')
    }

    dir.create(
      paste0(
        '/users/k1806347/oliverpainfel/Analyses/crosspop/targ_',
        pop_i,
        '.disc_',
        pop_i,
        '/',
        trait_i
      ),
      recursive = T
    )
    
    scores_i_j <- scores_i[grepl(samp_i, scores_i$name, ignore.case = T),]
    scores_i_j$predictor <- paste0(
      outdir,
      '/ukb/pgs/TRANS/',
      scores_i_j$method,
      '/',
      scores_i_j$name,
      '/ukb-',
      scores_i_j$name,
      '-TRANS.profiles'
    )
    
    predictors_i <- scores_i_j[, c('predictor', 'group'), with=F]
    
    write.table(
      predictors_i,
      paste0(
        '/users/k1806347/oliverpainfel/Analyses/crosspop/targ_',
        pop_i,
        '.disc_',
        pop_i,
        '/',
        trait_i,
        '/predictor_list.ptclump.txt'
      ),
      col.names = T,
      row.names = F,
      quote = F
    )
  }
}

Run model_builder

cd /users/k1806347/oliverpainfel/Software/MyGit/GenoPred/pipeline
conda activate model_builder

for pop in $(echo EUR EAS AFR); do
  if [ "$pop" == "EUR" ]; then
      pop2="EUR_test"
  else
      pop2=$pop
  fi
  
  for pheno in $(cat /users/k1806347/oliverpainfel/Data/ukb/phenotypes/prscsx/pheno_eur_eas_afr.txt); do
    sbatch --mem 5G -n 5 -p neurohack_cpu --wrap="Rscript ../Scripts/model_builder/model_builder.R \
      --outcome /users/k1806347/oliverpainfel/Data/ukb/phenotypes/prscsx/${pheno}.unrel.${pop2}.row_number.txt \
      --predictors /users/k1806347/oliverpainfel/Analyses/crosspop/targ_${pop}.disc_${pop}/${pheno}/predictor_list.ptclump.txt \
      --out /users/k1806347/oliverpainfel/Analyses/crosspop/targ_${pop}.disc_${pop}/${pheno}/res.ptclump \
      --n_core 5 \
      --all_model F \
      --assoc T"
  done
done

Plot pT+clump association results

setwd('/users/k1806347/oliverpainfel/Software/MyGit/GenoPred/pipeline/')

library(data.table)
library(ggplot2)
library(cowplot)

source('../functions/misc.R')
source_all('../functions')

# Read in phenotypes
pheno_intersect <- read.table('/users/k1806347/oliverpainfel/Data/ukb/phenotypes/prscsx/pheno_eur_eas_afr.txt', header=F)$V1

# Read in results
pop = c('EUR','EAS','AFR')
res_all <- NULL
for(pheno_i in pheno_intersect){
  res_i<-NULL
  for(pop_i in pop){
    assoc_i <-
      fread(
        paste0(
          '/users/k1806347/oliverpainfel/Analyses/crosspop/targ_',
          pop_i,
          '.disc_',
          pop_i,
          '/',
          pheno_i,
          '/res.ptclump.assoc.txt'
        )
      )
      assoc_i$Population <- pop_i
      res_i<-rbind(res_i, assoc_i)
  }
  
  res_i$Phenotype <- pheno_i
  res_all<-rbind(res_all, res_i)
}

# Extract pT variable from Predictor
res_all$pT <- gsub('e.','e-', gsub('.*UKB\\.0\\.|.*BBJ\\.0\\.|.*UGR\\.0\\.', '', res_all$Predictor))
res_all$pT <- factor(res_all$pT, levels = unique(res_all$pT))

png('~/oliverpainfel/Analyses/crosspop/plots/ptclump_assoc.png', res = 100, width = 900, height = 500, units = 'px')
ggplot(res_all, aes(x = Phenotype, y = BETA, fill = pT)) +
  geom_hline(yintercept = 0, colour = 'darkgrey') +
  geom_bar(stat="identity", position=position_dodge(preserve = "single"), width = 0.8) +
  geom_errorbar(aes(ymin=BETA-SE, ymax=BETA+SE), width=0, position=position_dodge(width = 0.8, preserve = "single")) +
  labs(y="BETA (SE)") +
  theme_half_open() +
  background_grid() +
  panel_border() +
  theme(axis.text.x = element_text(angle = 45, hjust = 1),
        legend.position = "top",
        legend.key.spacing.x = unit(1, "cm"),
        legend.justification = "center") +
  background_grid(major = 'y', minor = 'y') +
  scale_fill_manual(values = colorRampPalette(c("lightblue", "darkblue"))(length(unique(res_all$pT)))) +
  facet_grid(Population ~.)
dev.off()

Run AVENGEME

setwd('/users/k1806347/oliverpainfel/Software/MyGit/GenoPred/pipeline/')

library(data.table)
library(ggplot2)
library(cowplot)
library(avengeme)

source('../functions/misc.R')
source_all('../functions')

# Get some key variables from config
config<-'/users/k1806347/oliverpainfel/Data/ukb/GenoPred/configs/crosspop/config_all.yaml'
outdir <- read_param(config = config, param = 'outdir', return_obj = F)
gwas_list <- read_param(config = config, param = 'gwas_list', return_obj = T)

# Read in phenotypes
pheno_intersect <- read.table('/users/k1806347/oliverpainfel/Data/ukb/phenotypes/prscsx/pheno_eur_eas_afr.txt', header=F)$V1

pop = c('EUR','EAS','AFR')

mod_res_all <- NULL
for(pop_i in pop){
  for(pheno_i in pheno_intersect){
    gwas_i<-gwas_list$name[gwas_list$population == pop_i & grepl(paste0('^', pheno_i, '_'),  gwas_list$name)]
      
    res_i <-
      fread(
        paste0(
          '/users/k1806347/oliverpainfel/Analyses/crosspop/targ_',
          pop_i,
          '.disc_',
          pop_i,
          '/',
          pheno_i,
          '/res.ptclump.assoc.txt'
        )
      )
    
    res_i$Z <- res_i$BETA / res_i$SE
    
    res_i$pT <- as.numeric(gsub('e.','e-', gsub('.*UKB\\.0\\.|.*BBJ\\.0\\.|.*UGR\\.0\\.', '', res_i$Predictor)))

    nsnp_log <-
      read.table(
        paste0(
          outdir,
          '/reference/pgs_score_files/ptclump/',
          gwas_i,
          '/ref-',
          gwas_i,
          '.NSNP_per_pT'
        ),
        header = T
      )
    
    nsnp<-nsnp_log$NSNP[nrow(nsnp_log)]
    
    disc_N <-
      median(
        fread(
          paste0(
            outdir,
            '/reference/gwas_sumstat/',
            gwas_i,
            '/',
            gwas_i,
            '-cleaned.gz'
          ), nrows = 10000
        )$N
      )
    
    targ_N <- res_i$N[1]
    
    mod_res <- estimatePolygenicModel(
      p = res_i$Z,
      nsnp = nsnp,
      n = c(disc_N, targ_N),
      pupper = c(0, res_i$pT),
      fixvg2pi02 = T,
      alpha = 0.05
    )
    
    mod_res_all <- rbind(
      mod_res_all,
      data.frame(
        Phenotype = pheno_i,
        Population = pop_i,
        GWAS = gwas_i,
        nsnp = nsnp,
        max_r2 = max(res_i$Obs_R2),
        n_disc = disc_N,
        n_targ = targ_N,
        vg_est = mod_res$vg[1],
        vg_lowCI = mod_res$vg[2],
        vg_highCI = mod_res$vg[3],
        pi0_est = mod_res$pi0[1],
        pi0_lowCI = mod_res$pi0[2],
        pi0_highCI = mod_res$pi0[3]
      )
    )
  }
}

dir.create('/users/k1806347/oliverpainfel/Analyses/crosspop/avengeme')
write.csv(mod_res_all, '/users/k1806347/oliverpainfel/Analyses/crosspop/avengeme/results.csv', row.names = F, quote = F)

mod_res_all<-fread('/users/k1806347/oliverpainfel/Analyses/crosspop/avengeme/results.csv')

png('~/oliverpainfel/Analyses/crosspop/plots/avengeme_h2.png', res = 100, width = 900, height = 500, units = 'px')
ggplot(mod_res_all, aes(x = Phenotype, y = vg_est, fill = Population)) +
  geom_bar(stat="identity", position=position_dodge(preserve = "single"), width = 0.7) +
  geom_errorbar(aes(ymin=vg_lowCI, ymax=vg_highCI), width=.2, position=position_dodge(width = 0.7, preserve = "single")) +
  labs(y="SNP-based Heritability (95%CI)", fill = NULL) +
  theme_half_open() +
  theme(axis.text.x = element_text(angle = 45, hjust = 1),
        legend.position = "top",
        legend.key.spacing.x = unit(1, "cm"),
        legend.justification = "center") +
  background_grid(major = 'y', minor = 'y')
dev.off()

png('~/oliverpainfel/Analyses/crosspop/plots/avengeme_polygenicity.png', res = 100, width = 900, height = 500, units = 'px')
ggplot(mod_res_all, aes(x = Phenotype, y = 1 - pi0_est, fill = Population)) +
  geom_bar(stat="identity", position=position_dodge(preserve = "single"), width = 0.7) +
  geom_errorbar(aes(ymin=1 - pi0_lowCI, ymax=1 - pi0_highCI), width=.2, position=position_dodge(width = 0.7, preserve = "single")) +
  labs(y="Proporition non-zero\neffects (95%CI)", fill = NULL) +
  theme_half_open() +
  coord_cartesian(ylim = c(0, 0.15)) + 
  theme(axis.text.x = element_text(angle = 45, hjust = 1),
        legend.position = "top",
        legend.key.spacing.x = unit(1, "cm"),
        legend.justification = "center") +
  background_grid(major = 'y', minor = 'y')
dev.off()

summary(mod_res_all$max_r2)
summary(mod_res_all$max_r2[mod_res_all$Population == 'EUR'])
summary(mod_res_all$max_r2[mod_res_all$Population == 'EAS'])
summary(mod_res_all$max_r2[mod_res_all$Population == 'AFR'])
Show AVENGEME results



Select traits

Here we will identify a list of traits that fulfill our selection criteria, and that represent a range of heritability and polygenicity combinations.

Show code
#########
# Select 10 GWAS for downstream analysis
#########
# Criteria: SNP-h2 >= 0.01 from both AVENGEME and LDSC in all populations
# GWAS are then selected to represent a range of polygenicity and heritability, as estimated in EUR, since these estimates are the most accurate

library(data.table)
library(ggplot2)  # needed for the selected-trait plot below
library(cowplot)  # theme_half_open() and background_grid() used below

# Read in the AVENGEME results
avengeme <- fread('/users/k1806347/oliverpainfel/Analyses/crosspop/avengeme/results.csv')

# Read in the LDSC results
ldsc <- fread('/users/k1806347/oliverpainfel/Analyses/crosspop/ldsc/results.csv')

# Combine results
both <- merge(avengeme, ldsc, by = c('Population','Phenotype'))

# Remove traits with SNP-h2 < 0.01 (from AVENGEME or LDSC) in any population
both_h2 <- both[!(both$Phenotype %in% both$Phenotype[both$vg_est < 0.01 | both$h2_est < 0.01]),]

# Select GWAS representing a range of SNP-h2 and polygenicity, both as estimated by AVENGEME in EUR.
both_eur<-both_h2[both_h2$Population == 'EUR',]
traits_data <- data.frame(trait = both_eur$Phenotype, heritability = both_eur$vg_est, polygenicity = both_eur$pi0_est)

# Number of bins (e.g., dividing into 5 bins each for heritability and polygenicity)
num_bins <- 5

# Create bins
traits_data$her_bin <- cut(traits_data$heritability, breaks = num_bins)
traits_data$poly_bin <- cut(traits_data$polygenicity, breaks = num_bins)

# Split data by unique bin combinations
split_data <- split(traits_data, list(traits_data$her_bin, traits_data$poly_bin), drop = TRUE)

set.seed(1)
# Randomly select one trait from each bin combination
selected_traits <- do.call(rbind, lapply(split_data, function(df) df[sample(nrow(df), 1), ]))

# Limit to 10 traits if more than 10 unique combinations
if (nrow(selected_traits) > 10) {
  selected_traits <- selected_traits[sample(nrow(selected_traits), 10), ]
}

write.table(selected_traits$trait, '/users/k1806347/oliverpainfel/Analyses/crosspop/trait_subset.txt', col.names = F, row.names = F, quote = F)

# Plot max R2 for selected traits
mod_res_all <- fread('/users/k1806347/oliverpainfel/Analyses/crosspop/avengeme/results.csv')
mod_res_all_selected <- mod_res_all[mod_res_all$Phenotype %in% selected_traits$trait,]

ggplot(mod_res_all_selected, aes(x = Phenotype, y = max_r2, fill = Population)) +
  geom_bar(stat="identity", position=position_dodge(preserve = "single"), width = 0.7) +
  labs(y="Max R2") +
  theme_half_open() +
  coord_cartesian(ylim = c(0, 0.15)) + 
  theme(axis.text.x = element_text(angle = 45, hjust = 1)) +
  background_grid(major = 'y', minor = 'y')

# Distribution and summary of the maximum observed R2 across traits and populations
hist(mod_res_all$max_r2)
hist(mod_res_all$max_r2[mod_res_all$Population == 'EUR'])
hist(mod_res_all$max_r2[mod_res_all$Population == 'EAS'])
hist(mod_res_all$max_r2[mod_res_all$Population == 'AFR'])

summary(mod_res_all$max_r2)
summary(mod_res_all$max_r2[mod_res_all$Population == 'EUR'])
summary(mod_res_all$max_r2[mod_res_all$Population == 'EAS'])
summary(mod_res_all$max_r2[mod_res_all$Population == 'AFR'])

round(sqrt(min(mod_res_all$max_r2[mod_res_all$Population == 'EUR'])), 2)
round(sqrt(max(mod_res_all$max_r2[mod_res_all$Population == 'EUR'])), 2)
round(sqrt(min(mod_res_all$max_r2[mod_res_all$Population == 'EAS'])), 2)
round(sqrt(max(mod_res_all$max_r2[mod_res_all$Population == 'EAS'])), 2)
round(sqrt(min(mod_res_all$max_r2[mod_res_all$Population == 'AFR'])), 2)
round(sqrt(max(mod_res_all$max_r2[mod_res_all$Population == 'AFR'])), 2)

GWAS descriptives

Make a table showing GWAS information for the manuscript.

Show code
library(data.table)

#####
# Trait names, labels, and URLs
#####

###
# UKB
###
ukb <- fread('/users/k1806347/oliverpainfel/Data/ukb/phenotypes/prscsx/prscsx_data.csv')
names(ukb) <- c('trait', 'labels','field')
trait_labels <- ukb[, c('trait','labels'), with=F]
ukb<-ukb[, c('trait','field'), with=F]
ukb$sample <- 'UKB'
ukb$population <- 'EUR'
ukb$url<-NA

###
# BBJ
###
bbj <- fread('~/oliverpainfel/Data/GWAS_sumstats/BBJ/prscsx/bbj_gwas_prscsx.csv')
bbj <- bbj[, c('trait', 'wget'), with = F]
names(bbj) <- c('trait', 'url')
bbj$sample <- 'BBJ'
bbj$population <- 'EAS'
bbj$field <- NA

###
# UGR
###
ugr <- fread('~/oliverpainfel/Data/GWAS_sumstats/UGR/ugr_sumstats_prscsx.csv')
ugr <- ugr[, c('trait', 'summaryStatistics'), with = F]
names(ugr) <- c('trait','url')
ugr$sample <- 'UGR'
ugr$population <- 'AFR'
ugr$field <- NA

info_all <- do.call(rbind, list(ukb, bbj, ugr))
info_all<-merge(info_all, trait_labels, by='trait')

#####
# Sample size, SNP-h2 and polygenicity
#####

# Read in the AVENGEME and LDSC results
avengeme <- fread('/users/k1806347/oliverpainfel/Analyses/crosspop/avengeme/results.csv')
ldsc <- fread('/users/k1806347/oliverpainfel/Analyses/crosspop/ldsc/results.csv')
both <- merge(avengeme, ldsc, by = c('Population','Phenotype'))

# Format for descriptives table
both$h2_avengeme<- paste0(
  round(both$vg_est,2), 
  " (95%CI = ", 
  round(both$vg_lowCI, 2), 
  " - " , 
  round(both$vg_highCI, 2), ")")

both$pi0_avengeme <- paste0(
  round(both$pi0_est,2), 
  " (95%CI = ", 
  round(both$pi0_lowCI, 2), 
  " - " , 
  round(both$pi0_highCI, 2), ")")

both$h2_ldsc <- paste0(
  round(both$h2_est,2), 
  " (SE = ", 
  round(both$h2_se, 2), 
  ")")

both$int_ldsc <- paste0(
  round(both$int_est,2), 
  " (SE = ", 
  round(both$int_se, 2), 
  ")")

both<-both[, c('Population','Phenotype','n_disc','n_targ','h2_avengeme','pi0_avengeme','h2_ldsc','int_ldsc','lambda'), with = F]
names(both)[1:2]<-c('population','labels')

info_all <- merge(info_all, both, by = c('labels','population'))
info_all$n_disc<-round(info_all$n_disc, 0)
info_all$n_targ<-round(info_all$n_targ, 0)

info_all<-info_all[, c('labels','trait','population','sample','n_disc','n_targ','h2_avengeme','pi0_avengeme','h2_ldsc','int_ldsc','lambda','field','url'), with=F]
names(info_all) <- c('Trait Label', 'Trait Description', 'Ancestry', 'GWAS Sample', 'GWAS N', 'Target N',"SNP-h2 (AVENGEME)","pi0 (AVENGEME)","SNP-h2 (LDSC)","Intercept (LDSC)",'Lambda', 'UKB Field', 'URL')

# Add in column indicating whether the trait was used in downstream PGS comparison
selected_traits <- fread('/users/k1806347/oliverpainfel/Analyses/crosspop/trait_subset.txt', header=F)$V1

info_all$`Selected` <- info_all$`Trait Label` %in% selected_traits

write.csv(info_all, '/users/k1806347/oliverpainfel/Analyses/crosspop/gwas_descriptives.csv', row.names=F)

# Summarise sample sizes (median, mean, SD) within each GWAS sample for the selected traits
info_all_selected<-info_all[info_all$Selected == T,]
n_dat <- NULL
for(i in unique(info_all_selected$`GWAS Sample`)){
  n_dat <-rbind(
    n_dat,
    data.table(
      sample = i,
      gwas_n_median = round(median(info_all_selected$`GWAS N`[info_all_selected$`GWAS Sample` == i])),
      gwas_n_mean = round(mean(info_all_selected$`GWAS N`[info_all_selected$`GWAS Sample` == i])),
      gwas_n_sd = round(sd(info_all_selected$`GWAS N`[info_all_selected$`GWAS Sample` == i])),
      target_n_median = round(median(info_all_selected$`Target N`[info_all_selected$`GWAS Sample` == i])),
      target_n_mean = round(mean(info_all_selected$`Target N`[info_all_selected$`GWAS Sample` == i])),
      target_n_sd = round(sd(info_all_selected$`Target N`[info_all_selected$`GWAS Sample` == i]))
    )
  )
}
Show descriptives table
Trait Label Trait Description Ancestry GWAS Sample GWAS N Target N SNP-h2 (AVENGEME) pi0 (AVENGEME) SNP-h2 (LDSC) Intercept (LDSC) Lambda UKB Field URL Selected
ALB Albumin AFR UGR 13125 5868 0.14 (95%CI = 0.1 - 0.19) 0.98 (95%CI = 0 - 0.99) 0.07 (SE = 0.05) 1.01 (SE = 0.01) 1.0135 NA http://ftp.ebi.ac.uk/pub/databases/gwas/summary_statistics/GCST009001-GCST010000/GCST009048 FALSE
ALB Albumin EAS BBJ 120539 1780 0.06 (95%CI = 0.04 - 0.07) 0.98 (95%CI = 0.97 - 0.99) 0.06 (SE = 0.01) 1.03 (SE = 0.01) 1.1459 NA https://humandbs.dbcls.jp/files/hum0197/hum0197.v3.BBJ.Alb.v1.zip FALSE
ALB Albumin EUR UKB 268088 67382 0.13 (95%CI = 0.13 - 0.13) 0.94 (95%CI = 0.94 - 0.94) 0.12 (SE = 0.01) 1.13 (SE = 0.02) 1.4281 30600 NA FALSE
ALP Alkaline phosphatase AFR UGR 9322 6328 0.03 (95%CI = 0.02 - 0.06) 1 (95%CI = 1 - 1) 0.09 (SE = 0.07) 1.01 (SE = 0.01) 1.0255 NA http://ftp.ebi.ac.uk/pub/databases/gwas/summary_statistics/GCST009001-GCST010000/GCST009049 FALSE
ALP Alkaline phosphatase EAS BBJ 118886 1955 0.12 (95%CI = 0.1 - 0.14) 0.98 (95%CI = 0.97 - 0.99) 0.12 (SE = 0.03) 1.09 (SE = 0.01) 1.2005 NA https://humandbs.dbcls.jp/files/hum0197/hum0197.v3.BBJ.ALP.v1.zip FALSE
ALP Alkaline phosphatase EUR UKB 292054 73314 0.16 (95%CI = 0.16 - 0.16) 0.94 (95%CI = 0.94 - 0.94) 0.21 (SE = 0.03) 1.15 (SE = 0.02) 1.5144 30610 NA FALSE
ALT Alanine aminotransferase AFR UGR 9401 6326 0.07 (95%CI = 0.03 - 0.12) 1 (95%CI = 1 - 1) 0.11 (SE = 0.07) 1 (SE = 0.01) 1.0165 NA http://ftp.ebi.ac.uk/pub/databases/gwas/summary_statistics/GCST009001-GCST010000/GCST009047 FALSE
ALT Alanine aminotransferase EAS BBJ 150545 1954 0.03 (95%CI = 0.02 - 0.04) 0.99 (95%CI = 0.98 - 1) 0.06 (SE = 0.01) 1.06 (SE = 0.01) 1.1459 NA https://humandbs.dbcls.jp/files/hum0197/hum0197.v3.BBJ.ALT.v1.zip FALSE
ALT Alanine aminotransferase EUR UKB 291932 73278 0.09 (95%CI = 0.08 - 0.09) 0.95 (95%CI = 0.95 - 0.95) 0.11 (SE = 0.01) 1.09 (SE = 0.02) 1.4069 30620 NA FALSE
AST Aspartate transaminase AFR UGR 8995 6289 0.03 (95%CI = 0 - 0.09) 1 (95%CI = 0 - 1) 0.12 (SE = 0.07) 1.01 (SE = 0.01) 1.0165 NA http://ftp.ebi.ac.uk/pub/databases/gwas/summary_statistics/GCST009001-GCST010000/GCST009046 FALSE
AST Aspartate transaminase EAS BBJ 150068 1940 0.01 (95%CI = 0 - 0.02) 1 (95%CI = 0.99 - 1) 0.07 (SE = 0.01) 1.06 (SE = 0.01) 1.1459 NA https://humandbs.dbcls.jp/files/hum0197/hum0197.v3.BBJ.AST.v1.zip FALSE
AST Aspartate transaminase EUR UKB 291016 73049 0.08 (95%CI = 0.07 - 0.08) 0.95 (95%CI = 0.95 - 0.95) 0.12 (SE = 0.01) 1.11 (SE = 0.02) 1.3964 30650 NA FALSE
BAS Basophil AFR UGR 2681 6353 0 (95%CI = 0 - 0.02) 1 (95%CI = 0.99 - 1) -0.37 (SE = 0.25) 1.02 (SE = 0.01) 1.0046 NA http://ftp.ebi.ac.uk/pub/databases/gwas/summary_statistics/GCST009001-GCST010000/GCST009039 FALSE
BAS Basophil EAS BBJ 91908 1997 0 (95%CI = 0 - 0.02) 1 (95%CI = 0.81 - 1) 0.07 (SE = 0.02) 1.06 (SE = 0.01) 1.0957 NA https://humandbs.dbcls.jp/files/hum0197/hum0197.v3.BBJ.BAS.v1.zip FALSE
BAS Basophil EUR UKB 296570 74442 0.04 (95%CI = 0.04 - 0.04) 0.98 (95%CI = 0.98 - 0.98) 0.04 (SE = 0) 1.06 (SE = 0.01) 1.1811 30160 NA FALSE
BMI Body mass index AFR UGR 13976 6646 0.13 (95%CI = 0.09 - 0.18) 0.97 (95%CI = 0 - 0.99) 0.11 (SE = 0.05) 1.02 (SE = 0.01) 1.0315 NA http://ftp.ebi.ac.uk/pub/databases/gwas/summary_statistics/GCST009001-GCST010000/GCST009057 TRUE
BMI Body mass index EAS BBJ 163835 2046 0.17 (95%CI = 0.15 - 0.18) 0.91 (95%CI = 0.88 - 0.93) 0.18 (SE = 0.01) 1.14 (SE = 0.02) 1.4926 NA https://humandbs.dbcls.jp/files/hum0197/hum0197.v3.BBJ.BMI.v1.zip TRUE
BMI Body mass index EUR UKB 304506 76393 0.22 (95%CI = 0.21 - 0.22) 0.85 (95%CI = 0.85 - 0.86) 0.23 (SE = 0.01) 1.14 (SE = 0.02) 2.0302 21001 NA TRUE
BWT Body weight AFR UGR 14005 6659 0.11 (95%CI = 0.08 - 0.15) 0.99 (95%CI = 0.92 - 0.99) 0.13 (SE = 0.05) 1.01 (SE = 0.01) 1.0255 NA http://ftp.ebi.ac.uk/pub/databases/gwas/summary_statistics/GCST009001-GCST010000/GCST009056 TRUE
BWT Body weight EAS BBJ 165419 2046 0.14 (95%CI = 0.12 - 0.16) 0.91 (95%CI = 0.89 - 0.94) 0.22 (SE = 0.01) 1.17 (SE = 0.02) 1.6259 NA https://humandbs.dbcls.jp/files/hum0197/hum0197.v3.BBJ.BW.v1.zip TRUE
BWT Body weight EUR UKB 304622 76415 0.2 (95%CI = 0.2 - 0.2) 0.87 (95%CI = 0.87 - 0.88) 0.25 (SE = 0.01) 1.16 (SE = 0.02) 2.0556 21002 NA TRUE
DBP Diastolic blood pressure AFR UGR 13618 6658 0.23 (95%CI = 0.19 - 0.27) 0.96 (95%CI = 0.74 - 0.98) -0.02 (SE = 0.05) 1.02 (SE = 0.01) 1.0165 NA http://ftp.ebi.ac.uk/pub/databases/gwas/summary_statistics/GCST009001-GCST010000/GCST009052 FALSE
DBP Diastolic blood pressure EAS BBJ 145515 1976 0.09 (95%CI = 0.07 - 0.1) 0.93 (95%CI = 0.89 - 0.96) 0.05 (SE = 0.01) 1.07 (SE = 0.01) 1.1459 NA https://humandbs.dbcls.jp/files/hum0197/hum0197.v3.BBJ.DBP.v1.zip FALSE
DBP Diastolic blood pressure EUR UKB 289698 72640 0.13 (95%CI = 0.12 - 0.13) 0.89 (95%CI = 0.89 - 0.9) 0.12 (SE = 0.01) 1.08 (SE = 0.02) 1.5548 4079 NA FALSE
EOS Eosinophil AFR UGR 2671 6353 0 (95%CI = 0 - 0.02) 1 (95%CI = 0.99 - 1) -0.26 (SE = 0.25) 1.02 (SE = 0.01) 1.0195 NA http://ftp.ebi.ac.uk/pub/databases/gwas/summary_statistics/GCST009001-GCST010000/GCST009041 FALSE
EOS Eosinophil EAS BBJ 93063 1997 0.08 (95%CI = 0.07 - 0.1) 0.98 (95%CI = 0.97 - 0.99) 0.09 (SE = 0.01) 1.06 (SE = 0.01) 1.1459 NA https://humandbs.dbcls.jp/files/hum0197/hum0197.v3.BBJ.EOS.v1.zip FALSE
EOS Eosinophil EUR UKB 296570 74442 0.15 (95%CI = 0.15 - 0.16) 0.93 (95%CI = 0.93 - 0.94) 0.19 (SE = 0.02) 1.17 (SE = 0.02) 1.4962 30150 NA FALSE
GGT γ-glutamyl transpeptidase AFR UGR 8995 6322 0.05 (95%CI = 0.03 - 0.08) 1 (95%CI = 1 - 1) 0.23 (SE = 0.07) 0.99 (SE = 0.01) 1.0075 NA http://ftp.ebi.ac.uk/pub/databases/gwas/summary_statistics/GCST009001-GCST010000/GCST009050 FALSE
GGT γ-glutamyl transpeptidase EAS BBJ 133471 1952 0.08 (95%CI = 0.06 - 0.09) 0.97 (95%CI = 0.95 - 0.98) 0.15 (SE = 0.05) 1.07 (SE = 0.01) 1.2005 NA https://humandbs.dbcls.jp/files/hum0197/hum0197.v3.BBJ.GGT.v1.zip FALSE
GGT γ-glutamyl transpeptidase EUR UKB 291885 73270 0.1 (95%CI = 0.1 - 0.1) 0.95 (95%CI = 0.95 - 0.95) 0.18 (SE = 0.01) 1.16 (SE = 0.02) 1.5696 30730 NA FALSE
HB Hemoglobin AFR UGR 2741 6375 0.25 (95%CI = 0.16 - 0.36) 0.97 (95%CI = 0 - 1) 0.22 (SE = 0.25) 1 (SE = 0.01) 1.0105 NA http://ftp.ebi.ac.uk/pub/databases/gwas/summary_statistics/GCST009001-GCST010000/GCST009034 TRUE
HB Hemoglobin EAS BBJ 152447 1999 0.06 (95%CI = 0.05 - 0.08) 0.95 (95%CI = 0.89 - 0.97) 0.07 (SE = 0.01) 1.09 (SE = 0.01) 1.2005 NA https://humandbs.dbcls.jp/files/hum0197/hum0197.v3.BBJ.Hb.v1.zip TRUE
HB Hemoglobin EUR UKB 297083 74575 0.13 (95%CI = 0.13 - 0.13) 0.92 (95%CI = 0.92 - 0.92) 0.15 (SE = 0.01) 1.16 (SE = 0.02) 1.5217 30020 NA TRUE
HCT Hematocrit AFR UGR 2744 6375 0.26 (95%CI = 0.16 - 0.36) 0.88 (95%CI = 0 - 1) 0.18 (SE = 0.26) 1.01 (SE = 0.01) 1.0105 NA http://ftp.ebi.ac.uk/pub/databases/gwas/summary_statistics/GCST009001-GCST010000/GCST009033 FALSE
HCT Hematocrit EAS BBJ 153015 1999 0.05 (95%CI = 0.03 - 0.06) 0.97 (95%CI = 0.94 - 0.98) 0.07 (SE = 0.01) 1.09 (SE = 0.01) 1.2005 NA https://humandbs.dbcls.jp/files/hum0197/hum0197.v3.BBJ.Ht.v1.zip FALSE
HCT Hematocrit EUR UKB 297084 74575 0.13 (95%CI = 0.13 - 0.13) 0.92 (95%CI = 0.92 - 0.93) 0.14 (SE = 0.01) 1.16 (SE = 0.02) 1.5035 30030 NA FALSE
HDL HDL-cholesterol AFR UGR 13114 5863 0.06 (95%CI = 0.04 - 0.09) 1 (95%CI = 1 - 1) 0.02 (SE = 0.05) 1.02 (SE = 0.01) 1.0225 NA http://ftp.ebi.ac.uk/pub/databases/gwas/summary_statistics/GCST009001-GCST010000/GCST009044 TRUE
HDL HDL-cholesterol EAS BBJ 74970 1783 0.15 (95%CI = 0.13 - 0.17) 0.98 (95%CI = 0.97 - 0.99) 0.17 (SE = 0.03) 1.09 (SE = 0.01) 1.2005 NA https://humandbs.dbcls.jp/files/hum0197/hum0197.v3.BBJ.HDLC.v1.zip TRUE
HDL HDL-cholesterol EUR UKB 267973 67346 0.2 (95%CI = 0.2 - 0.2) 0.92 (95%CI = 0.92 - 0.93) 0.23 (SE = 0.02) 1.15 (SE = 0.02) 1.6524 30760 NA TRUE
HT Height AFR UGR 14126 6658 0.23 (95%CI = 0.19 - 0.26) 0.98 (95%CI = 0.96 - 0.99) 0.12 (SE = 0.06) 1.02 (SE = 0.01) 1.0345 NA http://ftp.ebi.ac.uk/pub/databases/gwas/summary_statistics/GCST009001-GCST010000/GCST009055 TRUE
HT Height EAS BBJ 165056 2048 0.2 (95%CI = 0.19 - 0.22) 0.93 (95%CI = 0.92 - 0.95) 0.41 (SE = 0.02) 1.33 (SE = 0.03) 1.7648 NA https://humandbs.dbcls.jp/files/hum0197/hum0197.v3.BBJ.Hei.v1.zip TRUE
HT Height EUR UKB 304826 76470 0.24 (95%CI = 0.24 - 0.25) 0.91 (95%CI = 0.91 - 0.91) 0.45 (SE = 0.02) 1.39 (SE = 0.03) 2.3412 50 NA TRUE
HbA1c HbA1c AFR UGR 6116 5405 0 (95%CI = 0 - 0.05) 1 (95%CI = 0.9 - 1) 0.42 (SE = 0.15) 1.01 (SE = 0.01) 1.0405 NA http://ftp.ebi.ac.uk/pub/databases/gwas/summary_statistics/GCST009001-GCST010000/GCST009054 FALSE
HbA1c HbA1c EAS BBJ 71221 1949 0.08 (95%CI = 0.06 - 0.1) 0.98 (95%CI = 0.96 - 0.99) 0.07 (SE = 0.01) 1.05 (SE = 0.01) 1.0957 NA https://humandbs.dbcls.jp/files/hum0197/hum0197.v3.BBJ.HbA1c.v1.zip FALSE
HbA1c HbA1c EUR UKB 292134 73182 0.14 (95%CI = 0.13 - 0.14) 0.93 (95%CI = 0.93 - 0.94) 0.2 (SE = 0.02) 1.17 (SE = 0.02) 1.5659 30750 NA FALSE
LDL LDL-cholesterol AFR UGR 13086 6313 0.07 (95%CI = 0.07 - 0.08) 1 (95%CI = 1 - 1) 0.03 (SE = 0.05) 1.03 (SE = 0.01) 1.0285 NA http://ftp.ebi.ac.uk/pub/databases/gwas/summary_statistics/GCST009001-GCST010000/GCST009043 FALSE
LDL LDL-cholesterol EAS BBJ 72866 1953 0.05 (95%CI = 0.03 - 0.06) 1 (95%CI = 0.99 - 1) 0.07 (SE = 0.01) 1.05 (SE = 0.01) 1.0957 NA https://humandbs.dbcls.jp/files/hum0197/hum0197.v3.BBJ.LDLC.v1.zip FALSE
LDL LDL-cholesterol EUR UKB 291538 73172 0.12 (95%CI = 0.11 - 0.12) 0.97 (95%CI = 0.97 - 0.97) 0.1 (SE = 0.01) 1.06 (SE = 0.01) 1.2697 30780 NA FALSE
LYM Lymphocyte AFR UGR 2681 6353 0 (95%CI = 0 - 0.06) 1 (95%CI = 0 - 1) 0.08 (SE = 0.28) 1.01 (SE = 0.01) 1.0135 NA http://ftp.ebi.ac.uk/pub/databases/gwas/summary_statistics/GCST009001-GCST010000/GCST009037 FALSE
LYM Lymphocyte EAS BBJ 95717 1997 0.16 (95%CI = 0.14 - 0.17) 0.94 (95%CI = 0.91 - 0.96) 0.08 (SE = 0.01) 1.05 (SE = 0.01) 1.1459 NA https://humandbs.dbcls.jp/files/hum0197/hum0197.v3.BBJ.LYM.v1.zip FALSE
LYM Lymphocyte EUR UKB 296570 74442 0.08 (95%CI = 0.08 - 0.08) 0.94 (95%CI = 0.94 - 0.94) 0.18 (SE = 0.01) 1.18 (SE = 0.02) 1.6070 30120 NA FALSE
MCH Mean corpuscular hemoglobin AFR UGR 2742 6375 0.21 (95%CI = 0.18 - 0.27) 1 (95%CI = 1 - 1) 0.26 (SE = 0.22) 1.01 (SE = 0.01) 1.0225 NA http://ftp.ebi.ac.uk/pub/databases/gwas/summary_statistics/GCST009001-GCST010000/GCST009063 FALSE
MCH Mean corpuscular hemoglobin EAS BBJ 128028 1999 0.12 (95%CI = 0.1 - 0.13) 0.98 (95%CI = 0.97 - 0.98) 0.19 (SE = 0.03) 1.11 (SE = 0.02) 1.2005 NA https://humandbs.dbcls.jp/files/hum0197/hum0197.v3.BBJ.MCH.v1.zip FALSE
MCH Mean corpuscular hemoglobin EUR UKB 297080 74575 0.19 (95%CI = 0.18 - 0.19) 0.94 (95%CI = 0.94 - 0.95) 0.25 (SE = 0.03) 1.12 (SE = 0.02) 1.4781 30050 NA FALSE
MCHC Mean corpuscular hemoglobin concentration AFR UGR 2744 6375 0.05 (95%CI = 0.04 - 0.16) 1 (95%CI = 1 - 1) 0.01 (SE = 0.23) 1.01 (SE = 0.01) 1.0225 NA http://ftp.ebi.ac.uk/pub/databases/gwas/summary_statistics/GCST009001-GCST010000/GCST009064 TRUE
MCHC Mean corpuscular hemoglobin concentration EAS BBJ 135482 1999 0.04 (95%CI = 0.03 - 0.05) 0.99 (95%CI = 0.98 - 1) 0.07 (SE = 0.01) 1.06 (SE = 0.01) 1.1459 NA https://humandbs.dbcls.jp/files/hum0197/hum0197.v3.BBJ.MCHC.v1.zip TRUE
MCHC Mean corpuscular hemoglobin concentration EUR UKB 297079 74573 0.05 (95%CI = 0.05 - 0.05) 0.97 (95%CI = 0.97 - 0.97) 0.06 (SE = 0.01) 1.04 (SE = 0.01) 1.1843 30060 NA TRUE
MCV Mean corpuscular volume AFR UGR 2742 6375 0.23 (95%CI = 0.18 - 0.29) 1 (95%CI = 1 - 1) 0.41 (SE = 0.23) 1.01 (SE = 0.01) 1.0225 NA http://ftp.ebi.ac.uk/pub/databases/gwas/summary_statistics/GCST009001-GCST010000/GCST009065 FALSE
MCV Mean corpuscular volume EAS BBJ 129832 1999 0.12 (95%CI = 0.1 - 0.13) 0.98 (95%CI = 0.97 - 0.98) 0.22 (SE = 0.03) 1.1 (SE = 0.02) 1.2531 NA https://humandbs.dbcls.jp/files/hum0197/hum0197.v3.BBJ.MCV.v1.zip FALSE
MCV Mean corpuscular volume EUR UKB 297083 74574 0.22 (95%CI = 0.21 - 0.22) 0.94 (95%CI = 0.93 - 0.94) 0.25 (SE = 0.03) 1.13 (SE = 0.02) 1.5144 30040 NA FALSE
MON Monocyte AFR UGR 2681 6353 0.15 (95%CI = 0.06 - 0.24) 1 (95%CI = 0 - 1) -0.25 (SE = 0.22) 1.02 (SE = 0.01) 1.0135 NA http://ftp.ebi.ac.uk/pub/databases/gwas/summary_statistics/GCST009001-GCST010000/GCST009038 FALSE
MON Monocyte EAS BBJ 95119 1997 0.1 (95%CI = 0.09 - 0.12) 0.98 (95%CI = 0.97 - 0.99) 0.08 (SE = 0.01) 1.09 (SE = 0.01) 1.1459 NA https://humandbs.dbcls.jp/files/hum0197/hum0197.v3.BBJ.MON.v1.zip FALSE
MON Monocyte EUR UKB 296570 74442 0.13 (95%CI = 0.13 - 0.14) 0.95 (95%CI = 0.95 - 0.95) 0.19 (SE = 0.02) 1.2 (SE = 0.02) 1.5144 30130 NA FALSE
NEU Neutrophil AFR UGR 2671 6353 0.11 (95%CI = 0.1 - 0.15) 1 (95%CI = 1 - 1) 0.1 (SE = 0.23) 1.01 (SE = 0.01) 1.0105 NA http://ftp.ebi.ac.uk/pub/databases/gwas/summary_statistics/GCST009001-GCST010000/GCST009040 TRUE
NEU Neutrophil EAS BBJ 82810 1997 0.07 (95%CI = 0.05 - 0.09) 0.99 (95%CI = 0.98 - 0.99) 0.11 (SE = 0.01) 1.05 (SE = 0.01) 1.1459 NA https://humandbs.dbcls.jp/files/hum0197/hum0197.v3.BBJ.NEU.v1.zip TRUE
NEU Neutrophil EUR UKB 296570 74442 0.14 (95%CI = 0.14 - 0.15) 0.93 (95%CI = 0.93 - 0.94) 0.15 (SE = 0.01) 1.15 (SE = 0.02) 1.5401 30140 NA TRUE
PLT Platelet AFR UGR 2723 6375 0.11 (95%CI = 0.02 - 0.21) 0.99 (95%CI = 0 - 1) 0.26 (SE = 0.25) 1.01 (SE = 0.01) 1.0225 NA http://ftp.ebi.ac.uk/pub/databases/gwas/summary_statistics/GCST009001-GCST010000/GCST009036 TRUE
PLT Platelet EAS BBJ 148623 1999 0.15 (95%CI = 0.14 - 0.17) 0.95 (95%CI = 0.94 - 0.96) 0.18 (SE = 0.02) 1.13 (SE = 0.01) 1.3101 NA https://humandbs.dbcls.jp/files/hum0197/hum0197.v3.BBJ.PLT.v1.zip TRUE
PLT Platelet EUR UKB 297082 74575 0.24 (95%CI = 0.24 - 0.24) 0.94 (95%CI = 0.93 - 0.94) 0.27 (SE = 0.02) 1.2 (SE = 0.02) 1.6108 30080 NA TRUE
RBC Red blood cell AFR UGR 2744 6375 0.32 (95%CI = 0.24 - 0.42) 0.99 (95%CI = 0.97 - 1) 0.54 (SE = 0.24) 1 (SE = 0.01) 1.0165 NA http://ftp.ebi.ac.uk/pub/databases/gwas/summary_statistics/GCST009001-GCST010000/GCST009062 FALSE
RBC Red blood cell EAS BBJ 153512 1999 0.08 (95%CI = 0.07 - 0.1) 0.97 (95%CI = 0.96 - 0.98) 0.12 (SE = 0.01) 1.12 (SE = 0.02) 1.2531 NA https://humandbs.dbcls.jp/files/hum0197/hum0197.v3.BBJ.RBC.v1.zip FALSE
RBC Red blood cell EUR UKB 297084 74575 0.17 (95%CI = 0.17 - 0.18) 0.93 (95%CI = 0.93 - 0.93) 0.2 (SE = 0.02) 1.19 (SE = 0.02) 1.6108 30010 NA FALSE
SBP Systolic blood pressure AFR UGR 13613 6658 0.17 (95%CI = 0.13 - 0.22) 0.94 (95%CI = 0 - 0.98) 0.06 (SE = 0.06) 1.02 (SE = 0.01) 1.0285 NA http://ftp.ebi.ac.uk/pub/databases/gwas/summary_statistics/GCST009001-GCST010000/GCST009053 TRUE
SBP Systolic blood pressure EAS BBJ 145505 1976 0.11 (95%CI = 0.09 - 0.12) 0.94 (95%CI = 0.9 - 0.96) 0.07 (SE = 0.01) 1.08 (SE = 0.01) 1.2005 NA https://humandbs.dbcls.jp/files/hum0197/hum0197.v3.BBJ.SBP.v1.zip TRUE
SBP Systolic blood pressure EUR UKB 289696 72639 0.12 (95%CI = 0.12 - 0.12) 0.89 (95%CI = 0.88 - 0.9) 0.13 (SE = 0.01) 1.1 (SE = 0.01) 1.5995 4080 NA TRUE
TC Total cholesterol AFR UGR 13116 6324 0.05 (95%CI = 0.05 - 0.06) 1 (95%CI = 1 - 1) 0.02 (SE = 0.05) 1.03 (SE = 0.01) 1.0285 NA http://ftp.ebi.ac.uk/pub/databases/gwas/summary_statistics/GCST009001-GCST010000/GCST009042 TRUE
TC Total cholesterol EAS BBJ 135808 1955 0.08 (95%CI = 0.07 - 0.09) 0.99 (95%CI = 0.99 - 1) 0.07 (SE = 0.01) 1.07 (SE = 0.01) 1.1459 NA https://humandbs.dbcls.jp/files/hum0197/hum0197.v3.BBJ.TC.v1.zip TRUE
TC Total cholesterol EUR UKB 292046 73305 0.12 (95%CI = 0.12 - 0.12) 0.96 (95%CI = 0.96 - 0.97) 0.11 (SE = 0.01) 1.07 (SE = 0.02) 1.3068 30690 NA TRUE
TG Triglycerides AFR UGR 13115 6323 0.11 (95%CI = 0.08 - 0.15) 1 (95%CI = 0.99 - 1) 0.08 (SE = 0.05) 1.02 (SE = 0.01) 1.0315 NA http://ftp.ebi.ac.uk/pub/databases/gwas/summary_statistics/GCST009001-GCST010000/GCST009045 FALSE
TG Triglycerides EAS BBJ 111667 1954 0.07 (95%CI = 0.06 - 0.09) 0.99 (95%CI = 0.99 - 1) 0.13 (SE = 0.03) 1.05 (SE = 0.01) 1.1459 NA https://humandbs.dbcls.jp/files/hum0197/hum0197.v3.BBJ.TG.v1.zip FALSE
TG Triglycerides EUR UKB 291813 73241 0.16 (95%CI = 0.16 - 0.16) 0.94 (95%CI = 0.93 - 0.94) 0.18 (SE = 0.02) 1.14 (SE = 0.02) 1.5696 30870 NA FALSE
WBC White blood cell AFR UGR 2741 6375 0.03 (95%CI = 0.03 - 0.05) 1 (95%CI = 1 - 1) 0.03 (SE = 0.26) 1.01 (SE = 0.01) 1.0105 NA http://ftp.ebi.ac.uk/pub/databases/gwas/summary_statistics/GCST009001-GCST010000/GCST009061 FALSE
WBC White blood cell EAS BBJ 154355 1999 0.12 (95%CI = 0.11 - 0.14) 0.93 (95%CI = 0.91 - 0.95) 0.11 (SE = 0.01) 1.1 (SE = 0.01) 1.2531 NA https://humandbs.dbcls.jp/files/hum0197/hum0197.v3.BBJ.WBC.v1.zip FALSE
WBC White blood cell EUR UKB 297079 74575 0.14 (95%CI = 0.14 - 0.14) 0.92 (95%CI = 0.92 - 0.92) 0.17 (SE = 0.01) 1.18 (SE = 0.02) 1.6334 30000 NA FALSE

Main analysis


PGS calculation

We will do this using GenoPred.

Show code

Prepare configuration

######
# gwas_list
######

library(data.table)

# Subset original gwas_list to include selected traits
gwas_list<-fread('/users/k1806347/oliverpainfel/Data/ukb/GenoPred/configs/crosspop/gwas_list_all.txt')
pheno<-gsub('_.*','', gwas_list$name)
selected_traits<-fread('/users/k1806347/oliverpainfel/Analyses/crosspop/trait_subset.txt', header=F)$V1
gwas_list<-gwas_list[pheno %in% selected_traits,]
gwas_list$label<-paste0('"', gwas_list$label, '"')

write.table(
  gwas_list, '/users/k1806347/oliverpainfel/Data/ukb/GenoPred/configs/crosspop/gwas_list.txt', 
  col.names = T, 
  row.names = F, 
  quote = F)

######
# gwas_groups
######

gwas_groups_eas<-data.frame(
  name=paste0(selected_traits, '_UKB_BBJ'),
  gwas=sapply(selected_traits, function(x) paste0(x,'_UKB,',x,'_BBJ')),
  label=paste0('"', selected_traits, " (UKB+BBJ)", '"')
)

gwas_groups_afr<-data.frame(
  name=paste0(selected_traits, '_UKB_UGR'),
  gwas=sapply(selected_traits, function(x) paste0(x,'_UKB,',x,'_UGR')),
  label=paste0('"', selected_traits, " (UKB+UGR)", '"')
)

gwas_groups<-rbind(gwas_groups_eas, gwas_groups_afr)

write.table(gwas_groups, '/users/k1806347/oliverpainfel/Data/ukb/GenoPred/configs/crosspop/gwas_groups.txt', col.names = T, row.names = F, quote = F)

######
# config
######

config<-c(
  "outdir: /users/k1806347/oliverpainfel/Data/ukb/GenoPred/output",
  "config_file: /users/k1806347/oliverpainfel/Data/ukb/GenoPred/configs/crosspop/config.yaml",
  "gwas_list: /users/k1806347/oliverpainfel/Data/ukb/GenoPred/configs/crosspop/gwas_list.txt",
  "target_list: /users/k1806347/oliverpainfel/Data/ukb/GenoPred/configs/basic/target_list.txt",
  "gwas_groups: /users/k1806347/oliverpainfel/Data/ukb/GenoPred/configs/crosspop/gwas_groups.txt",
  "pgs_methods: ['ptclump','quickprs','dbslmm','lassosum','megaprs','prscs','ldpred2','sbayesrc','prscsx','xwing']",
  "leopard_methods: ['ptclump','quickprs','dbslmm','lassosum','megaprs','prscs','ldpred2','sbayesrc']",
  "cores_prep_pgs: 10", # xwing run with 20 cores
  "cores_target_pgs: 50",
  "ldpred2_inference: F",
  "ldpred2_ldref: /users/k1806347/oliverpainfel/Data/hgdp_1kg/ldpred2/hm3",
  "quickprs_ldref: /users/k1806347/oliverpainfel/Data/hgdp_1kg/quickprs/hm3",
  "quickprs_multi_ldref: /users/k1806347/oliverpainfel/Data/hgdp_1kg/quickprs/hm3_subset",
  "sbayesrc_ldref: /users/k1806347/oliverpainfel/Data/hgdp_1kg/sbayesrc/hm3"
)

write.table(config, '/users/k1806347/oliverpainfel/Data/ukb/GenoPred/configs/crosspop/config.yaml', col.names = F, row.names = F, quote = F)

Run pipeline

snakemake \
  --profile slurm \
  --use-conda \
  --configfile=/users/k1806347/oliverpainfel/Data/ukb/GenoPred/configs/crosspop/config.yaml \
  target_pgs  -n

Note: The LD reference data for SBayesRC, LDpred2, QuickPRS, and QuickPRS+LEOPARD can be downloaded using the links below:


PGS evaluation

Let's use the model builder script, which implements nested 10-fold cross-validation. The set-up is similar to the previous paper: we evaluate a model containing the best PGS selected by 10-fold cross-validation, a model containing the PGS selected by pseudovalidation (if available), and an elastic net model containing all PGS from a given method. We will need to update the model builder script to achieve this.

We want to see:

- Performance of pseudo and top1 models for single-source methods
- Performance of pseudo and top1 models for multi-source methods
- Performance of multi-source methods:
  - Using crossval for tuning step 1 and 2
  - Using pseudoval for tuning step 1 and 2
  - Using pseudoval for tuning step 1 and crossval for tuning step 2

To achieve this, we will need to define groups of predictors for step 1 modelling, and groups that should then be linearly combined. A minimal sketch of the nested cross-validation set-up is shown below, followed by the full code.
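For intuition, here is a minimal sketch of the nested cross-validation idea, using simulated data and the glmnet package (an illustrative assumption only; the analysis itself uses the model_builder script run further below). The inner 10-fold cross-validation tunes the elastic net within the training folds, and performance is assessed using predictions made only in the held-out outer folds.

library(glmnet)

# Simulated outcome and a matrix of five PGS as predictors (toy data)
set.seed(1)
n <- 1000
x <- matrix(rnorm(n * 5), ncol = 5, dimnames = list(NULL, paste0('pgs', 1:5)))
y <- as.vector(x %*% rnorm(5, sd = 0.2) + rnorm(n))

# Outer 10-fold assignment
outer_folds <- sample(rep(1:10, length.out = n))
oof_pred <- rep(NA_real_, n)

for (k in 1:10) {
  train <- outer_folds != k
  # Inner 10-fold CV tunes the elastic net penalty within the training folds only
  fit <- cv.glmnet(x[train, ], y[train], alpha = 0.5, nfolds = 10)
  oof_pred[!train] <- predict(fit, newx = x[!train, ], s = 'lambda.min')
}

# Out-of-sample correlation between observed and predicted phenotype
cor(y, oof_pred)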

Show code

Create predictor list

setwd('~/oliverpainfel/Software/MyGit/GenoPred/pipeline/')
source('../functions/misc.R')
source_all('../functions')
library(data.table)

# Get some key variables from config
config<-'/users/k1806347/oliverpainfel/Data/ukb/GenoPred/configs/crosspop/config.yaml'
pgs_methods <- read_param(config = config, param = 'pgs_methods', return_obj = F)
outdir <- read_param(config = config, param = 'outdir', return_obj = F)

# Read in list of outcomes 
selected_traits<-fread('/users/k1806347/oliverpainfel/Analyses/crosspop/trait_subset.txt', header=F)$V1

# Get a list of score files
scores <- list_score_files(config)

# Create files for EAS and AFR targets
targ_pop <- c('EUR','EAS','AFR')
for(trait_i in selected_traits){
  scores_i <- scores[grepl(trait_i, scores$name),]
  scores_i$multi <- scores_i$method
  
  for(targ_pop_i in targ_pop){
    if(targ_pop_i == 'EAS'){
      disc_pop <- 'BBJ'
    }
    if(targ_pop_i == 'AFR'){
      disc_pop <- 'UGR'
    }
    if(targ_pop_i == 'EUR'){
      disc_pop <- c('BBJ','UGR')
    }
    
    for(disc_pop_j in disc_pop){
      if(disc_pop_j == 'BBJ'){
        disc_pop_j_2 <- 'EAS'
      }
      if(disc_pop_j == 'UGR'){
        disc_pop_j_2 <- 'AFR'
      }

      dir.create(
        paste0(
          '/users/k1806347/oliverpainfel/Analyses/crosspop/targ_',
          targ_pop_i,
          '.disc_EUR_',
          disc_pop_j_2,
          '/',
          trait_i
        ),
        recursive = T
      )
      
      scores_i_j <- scores_i[
        (grepl('UKB$', scores_i$name, ignore.case = F) | 
         grepl(paste0(disc_pop_j, '$'), scores_i$name, ignore.case = T)),]

      # Insert path to score file
      scores_i_j$predictor <- paste0(
        outdir,
        '/ukb/pgs/TRANS/',
        scores_i_j$method,
        '/',
        scores_i_j$name,
        '/ukb-',
        scores_i_j$name,
        '-TRANS.profiles'
      )
      
      ####
      # Make groups single source methods
      ####
      
      scores_i_j_single_top1 <-
        scores_i_j[!(scores_i_j$method %in% pgs_group_methods) &
                     !grepl('_multi$', scores_i_j$method), ]

      # Create top1 column indicating the predictors from which top1 models should be derived
      scores_i_j_single_top1$top1[grepl('UKB', scores_i_j_single_top1$name, ignore.case = F)] <- 'EUR'
      scores_i_j_single_top1$top1[grepl(disc_pop_j, scores_i_j_single_top1$name, ignore.case = F)] <- disc_pop_j_2
      
      ####
      # Make groups containing pseudo scores for single source methods
      ####

      # Extract the pseudo score for each method and specify as a separate group
      for(i in 1:nrow(scores_i_j_single_top1)) {
        param <- find_pseudo(
          config = config,
          gwas = scores_i_j_single_top1$name[i],
          pgs_method = scores_i_j_single_top1$method[i],
          target_pop = targ_pop_i
        )
        
        score_header <-
          fread(scores_i_j_single_top1$predictor[i], nrows = 1)
        score_cols <-
          which(names(score_header) %in% c('FID', 'IID', paste0(scores_i_j_single_top1$name[i], '_', param)))
        
        system(
          paste0(
            "cut -d' ' -f ", 
            paste0(score_cols, collapse=','),
            " ", 
            scores_i_j_single_top1$predictor[i], 
            " > ", 
            gsub('.profiles',
                 paste0('.', targ_pop_i, '_pseudo.profiles'),
                 scores_i_j_single_top1$predictor[i])
          )
        )
      }
      
      scores_i_j_single_pseudo <- scores_i_j_single_top1
      scores_i_j_single_pseudo$multi <- paste0(scores_i_j_single_pseudo$multi, '.pseudo')

      scores_i_j_single_pseudo$predictor <- gsub('.profiles', 
                                    paste0('.', targ_pop_i, '_pseudo.profiles'),
                                    scores_i_j_single_pseudo$predictor)

      ####
      # Make groups for multi-single-source pseudo scores
      ####
      
      scores_i_j_multi_single_pseudo <- scores_i_j[grepl('_multi$', scores_i_j$method),]

      # Extract the pseudo score for each method and specify as a separate group
      for(i in 1:nrow(scores_i_j_multi_single_pseudo)) {
        param <- find_pseudo(
          config = config,
          gwas = scores_i_j_multi_single_pseudo$name[i],
          pgs_method = scores_i_j_multi_single_pseudo$method[i],
          target_pop = targ_pop_i
        )
        
        score_header <-
          fread(scores_i_j_multi_single_pseudo$predictor[i], nrows = 1)
        score_cols <-
          which(names(score_header) %in% c('FID', 'IID', paste0(scores_i_j_multi_single_pseudo$name[i], '_', param)))
        
        system(
          paste0(
            "cut -d' ' -f ", 
            paste0(score_cols, collapse=','),
            " ", 
            scores_i_j_multi_single_pseudo$predictor[i], 
            " > ", 
            gsub('.profiles',
                 paste0('.', targ_pop_i, '_pseudo.profiles'),
                 scores_i_j_multi_single_pseudo$predictor[i])
          )
        )
      }
      
      scores_i_j_multi_single_pseudo$multi <- paste0(scores_i_j_multi_single_pseudo$multi, '.pseudo')

      scores_i_j_multi_single_pseudo$predictor <- gsub('.profiles', 
                                    paste0('.', targ_pop_i, '_pseudo.profiles'),
                                    scores_i_j_multi_single_pseudo$predictor)
      
      scores_i_j_multi_single_pseudo$top1<-paste0('EUR_', disc_pop_j_2)

      ####
      # Make groups for the Multi-Source methods
      ####
      
      scores_i_j_multi <- scores_i_j[(scores_i_j$method %in% pgs_group_methods),]

      # Split top1 scores by target population
      # This doesn't apply to xwing because it only has pop-specific pseudo scores
      scores_i_j_multi_top1<-NULL
      for(i in which(scores_i_j_multi$method %in% c('prscsx'))){
        score_header<-fread(scores_i_j_multi$predictor[i], nrow = 1)
        
        for(pop in c('EUR', disc_pop_j_2)){
          
          if(scores_i_j_multi$method[i] == 'prscsx'){
            score_cols <-
              which(grepl(paste0('^FID$|^IID$|_', pop, '_phi'), names(score_header)))
          }
          if(scores_i_j_multi$method[i] == 'xwing'){
            score_cols <-
              which(grepl(paste0('^FID$|^IID$|_targ_', pop, '_pst'), names(score_header)))
          }
          
          system(
            paste0(
              "cut -d' ' -f ", 
              paste0(score_cols, collapse=','),
              " ", 
              scores_i_j_multi$predictor[i], 
              " > ", 
              gsub('.profiles',
                   paste0('.', pop, '_grid.profiles'),
                   scores_i_j_multi$predictor[i])
            )
          )
          
          tmp <- scores_i_j_multi[i,]
          tmp$multi <- paste0(tmp$multi, '.grid')
          tmp$top1 <- pop
          tmp$predictor <-
              gsub('.profiles',
                   paste0('.', pop, '_grid.profiles'),
                   scores_i_j_multi$predictor[i])
          
          scores_i_j_multi_top1 <- rbind(scores_i_j_multi_top1, tmp)
        }
      }

      # Split pop-specific pseudo scores by target population
      scores_i_j_multi_pop_pseudo<-NULL
      for(i in 1:nrow(scores_i_j_multi)){
        score_header<-fread(scores_i_j_multi$predictor[i], nrow = 1)
        
        for(pop in c('EUR', disc_pop_j_2)){
          if(scores_i_j_multi$method[i] == 'prscsx'){
            score_cols <-
              which(grepl(paste0('^FID$|^IID$|_', pop, '_phi_auto'), names(score_header)))
          }
          if(scores_i_j_multi$method[i] == 'xwing'){
            score_cols <-
              which(grepl(paste0('^FID$|^IID$|_targ_', pop, '_pst_', pop), names(score_header)))
          }
          
          system(
            paste0(
              "cut -d' ' -f ", 
              paste0(score_cols, collapse=','),
              " ", 
              scores_i_j_multi$predictor[i], 
              " > ", 
              gsub('.profiles',
                   paste0('.', pop, '_pseudo.profiles'),
                   scores_i_j_multi$predictor[i])
            )
          )
          
          tmp <- scores_i_j_multi[i,]
          tmp$multi <- paste0(tmp$multi, '.pop_pseudo')
          tmp$top1 <- pop
          tmp$predictor <-
              gsub('.profiles',
                   paste0('.', pop, '_pseudo.profiles'),
                   scores_i_j_multi$predictor[i])
          
          scores_i_j_multi_pop_pseudo <- rbind(scores_i_j_multi_pop_pseudo, tmp)
        }
      }
      
      # Create pseudo score for multi-source methods
      scores_i_j_multi_pseudo<-NULL
      for(i in 1:nrow(scores_i_j_multi)) {
        param <- find_pseudo(
          config = config,
          gwas = scores_i_j_multi$name[i],
          pgs_method = scores_i_j_multi$method[i],
          target_pop = targ_pop_i
        )
        
        score_header <-
          fread(scores_i_j_multi$predictor[i], nrows = 1)
        score_cols <-
          which(names(score_header) %in% c('FID', 'IID', paste0(scores_i_j_multi$name[i], '_', param)))

        system(
          paste0(
            "cut -d' ' -f ", 
            paste0(score_cols, collapse=','),
            " ", 
            scores_i_j_multi$predictor[i], 
            " > ", 
            gsub('.profiles',
                 paste0('.pseudo.targ_', targ_pop_i,'.profiles'),
                 scores_i_j_multi$predictor[i])
          )
        )
        
        tmp <- scores_i_j_multi[i,]
        tmp$multi <- paste0(tmp$multi, '.pseudo')
        tmp$top1 <- paste0('EUR_', disc_pop_j_2)
        tmp$predictor <-
            gsub('.profiles',
                 paste0('.pseudo.targ_', targ_pop_i,'.profiles'),
                 scores_i_j_multi$predictor[i])
        
        scores_i_j_multi_pseudo <- rbind(scores_i_j_multi_pseudo, tmp)
      }
      
      ####
      # Combine the different predictor groups
      ####
      predictors_i<- do.call(rbind, list(
        scores_i_j_single_top1, 
        scores_i_j_single_pseudo, 
        scores_i_j_multi_single_pseudo,
        scores_i_j_multi_top1,
        scores_i_j_multi_pop_pseudo,
        scores_i_j_multi_pseudo
      ))
      
      predictors_i <- predictors_i[, c('predictor', 'multi','top1'), with=F]
      
      ####
      # Make a group that combines all population-specific PGS
      ####
      
      predictors_i_all <- predictors_i[predictors_i$top1 %in% c('EUR','AFR','EAS'),]
      predictors_i_all$multi <- 'all'
      predictors_i<-rbind(predictors_i, predictors_i_all)
      
      write.table(
        predictors_i,
        paste0(
          '/users/k1806347/oliverpainfel/Analyses/crosspop/targ_',
          targ_pop_i,
          '.disc_EUR_',
          disc_pop_j_2,
          '/',
          trait_i,
          '/predictor_list.txt'
        ),
        col.names = T,
        row.names = F,
        quote = F
      )
    }
  }
}

Run model_builder

cd /users/k1806347/oliverpainfel/Software/MyGit/GenoPred/pipeline
conda activate model_builder

#rm /users/k1806347/oliverpainfel/Analyses/crosspop/targ_*.disc_EUR_*/*/res*

for targ_pop in $(echo EUR EAS AFR); do
  if [ "$targ_pop" == "EUR" ]; then
      targ_pop2="EUR_test"
  else
      targ_pop2=$targ_pop
  fi
  
  if [ "$targ_pop" == "EUR" ]; then
    disc_pop=$(echo EAS AFR)
  fi
  
  if [ "$targ_pop" == "EAS" ]; then
    disc_pop="EAS"
  fi
  
  if [ "$targ_pop" == "AFR" ]; then
    disc_pop="AFR"
  fi
  
  for disc_pop_i in ${disc_pop}; do
    for pheno in $(cat /users/k1806347/oliverpainfel/Analyses/crosspop/trait_subset.txt); do
      if [ ! -f "/users/k1806347/oliverpainfel/Analyses/crosspop/targ_${targ_pop}.disc_EUR_${disc_pop_i}/${pheno}/res.pred_comp.txt" ]; then
        sbatch --mem 10G -n 5 --exclude=erc-hpc-comp058 -p neurohack_cpu,interruptible_cpu -t 1:00:00 --wrap="Rscript ../Scripts/model_builder/model_builder_top1.R \
          --outcome /users/k1806347/oliverpainfel/Data/ukb/phenotypes/prscsx/${pheno}.unrel.${targ_pop2}.row_number.txt \
          --predictors /users/k1806347/oliverpainfel/Analyses/crosspop/targ_${targ_pop}.disc_EUR_${disc_pop_i}/${pheno}/predictor_list.txt \
          --out /users/k1806347/oliverpainfel/Analyses/crosspop/targ_${targ_pop}.disc_EUR_${disc_pop_i}/${pheno}/res \
          --n_core 5"
      fi
    done
  done
done

Plot results

setwd('/users/k1806347/oliverpainfel/Software/MyGit/GenoPred/pipeline/')

library(data.table)
library(ggplot2)
library(cowplot)

source('../functions/misc.R')
source_all('../functions')

# Read in list of outcomes 
selected_traits<-fread('/users/k1806347/oliverpainfel/Analyses/crosspop/trait_subset.txt', header=F)$V1
info_all <- fread('~/oliverpainfel/Analyses/crosspop/gwas_descriptives.csv')

# Calculate correlation between all phenotypes in each target population
cors <- list()
for(pop_i in c('EUR','EAS','AFR','CSA','AMR')){
  if(pop_i == 'EUR'){
    pop_i_2 <- 'EUR_test'
  } else {
    pop_i_2 <- pop_i
  }
  pheno_pop_i <- list()
  for(pheno_i in selected_traits){
    pheno_pop_i[[pheno_i]] <- fread(paste0('/users/k1806347/oliverpainfel/Data/ukb/phenotypes/prscsx/', pheno_i, '.unrel.', pop_i_2, '.row_number.txt'))
    names(pheno_pop_i[[pheno_i]])[3] <- pheno_i
  }
  
  pheno_pop_i_merged <- Reduce(function(x, y) merge(x, y, all = TRUE, by = c('FID','IID')), pheno_pop_i)

  cors_i <- abs(cor(as.matrix(pheno_pop_i_merged[,-1:-2, with=F]), use='p'))
  cors[[pop_i]] <- cors_i
}

# Read in results
targ_pop = c('EUR','EAS','AFR')
res_eval <- list()
for(pheno_i in selected_traits){
  res_eval_i<-NULL
  for(targ_pop_i in targ_pop){
    if(targ_pop_i == 'EAS'){
      disc_pop <- 'EAS'
    }
    if(targ_pop_i == 'AFR'){
      disc_pop <- 'AFR'
    }
    if(targ_pop_i == 'EUR'){
      disc_pop <- c('EAS','AFR')
    }
    for(disc_pop_i in disc_pop){
      eval_i <-
        fread(
          paste0(
            '/users/k1806347/oliverpainfel/Analyses/crosspop/',
            'targ_',
            targ_pop_i,
            '.disc_EUR_',
            disc_pop_i,
            '/',
            pheno_i,
            '/res.pred_eval.txt'
          )
        )
      eval_i$Target<-targ_pop_i
      eval_i$gwas_group<-paste0('EUR+', disc_pop_i)
      res_eval_i<-rbind(res_eval_i, eval_i)
    }
  }
  
  res_eval_i$Method<-sub('\\..*','',res_eval_i$Group)
  res_eval_i$Method<-gsub('-.*','', res_eval_i$Method)
  
  res_eval_i$Model[grepl('top1$', res_eval_i$Group) &
                   !grepl('pseudo', res_eval_i$Group)]<-'IndivTune'
  res_eval_i$Model[grepl('top1$', res_eval_i$Group) &
                   grepl('pseudo', res_eval_i$Group)]<-'SumStatTune'
  res_eval_i$Model[grepl('multi$', res_eval_i$Group) &
                   !grepl('pseudo', res_eval_i$Group)]<-'Multi-IndivTune'
  res_eval_i$Model[grepl('multi$', res_eval_i$Group) &
                   grepl('pseudo', res_eval_i$Group)]<-'Multi-SumStatTune'
  
  res_eval_i$Model[grepl('_multi', res_eval_i$Group)]<-'SumStatTune'
  res_eval_i$Model[res_eval_i$Group == 'prscsx.pseudo.multi']<-'SumStatTune'
  res_eval_i$Model[res_eval_i$Group == 'xwing.pseudo.multi']<-'SumStatTune'
  
  res_eval_i$Source<-ifelse(
    res_eval_i$Method %in% pgs_group_methods | grepl('_multi$', res_eval_i$Method) | 
    !grepl('EUR|EAS|AFR', res_eval_i$Group), 'Multi', 'Single')
  
  res_eval_i$Discovery[grepl('EUR', res_eval_i$Group)] <- 'EUR'
  res_eval_i$Discovery[grepl('EAS', res_eval_i$Group)] <- 'EAS'
  res_eval_i$Discovery[grepl('AFR', res_eval_i$Group)] <- 'AFR'
  res_eval_i$Discovery[res_eval_i$Source == 'Multi'] <- res_eval_i$gwas_group[res_eval_i$Source == 'Multi']
  
  res_eval_i$Method<-factor(res_eval_i$Method, levels=unique(res_eval_i$Method))
  res_eval_i$Model<-factor(res_eval_i$Model, levels=c('IndivTune','SumStatTune','Multi-IndivTune','Multi-SumStatTune'))
  res_eval_i$Discovery<-factor(res_eval_i$Discovery, levels=c('AFR','EAS','EUR','EUR+AFR','EUR+EAS'))

  # Remove IndivTune and Multi-IndivTune model for groups that contain one score (aka QuickPRS and SBayesRC)
  res_eval_i <- res_eval_i[
    !(res_eval_i$Method %in%  c('quickprs','sbayesrc') & 
      res_eval_i$Model %in% c('IndivTune','Multi-IndivTune')),]
  
  # Remove pseudo model for methods that don't really have one 
  res_eval_i <- res_eval_i[
    !(res_eval_i$Method %in%  c('ptclump','ptclump_multi') & 
      res_eval_i$Model %in% c('SumStatTune','Multi-SumStatTune')),]

  # Remove top1 models for *-Multi, PRS-CSx, X-wing
  res_eval_i <- res_eval_i[
    !((res_eval_i$Method %in%  c('prscsx', 'xwing') | grepl('_multi$', res_eval_i$Method)) & 
      grepl('top1', res_eval_i$Group)),]
  
  # Remove any duplicate models
  res_eval_i <- res_eval_i[!duplicated(res_eval_i[, c(
    "Target", "Method", "Model", "Source", "Discovery","gwas_group"
  )]),]
  
  res_eval[[pheno_i]]<-res_eval_i
  
}

# Create vector defining the order of methods in plots
model_order <- c("DBSLMM", "lassosum", "LDpred2", "MegaPRS", "PRS-CS", "pT+clump", "QuickPRS", "SBayesRC", "DBSLMM-multi", "lassosum-multi", "LDpred2-multi", "MegaPRS-multi", "PRS-CS-multi", "pT+clump-multi", "QuickPRS-multi", "SBayesRC-multi", "PRS-CSx", "X-Wing", "All") 

res_eval_simp <- NULL
for(pheno_i in selected_traits){
  tmp <- res_eval[[pheno_i]]
  tmp$Trait <- pheno_i
  
  # Insert nice PGS method names
  tmp <- merge(tmp, pgs_method_labels, by.x = 'Method', by.y = 'method', all.x = T)
  tmp$label[is.na(tmp$label)] <- 'All'
  tmp$label[grepl('Multi', tmp$Model) & !(tmp$Method %in% pgs_group_methods) & tmp$label != 'All'] <- paste0(tmp$label[grepl('Multi', tmp$Model) & !(tmp$Method %in% pgs_group_methods) & tmp$label != 'All'], '-multi')
  tmp$label <- factor(tmp$label, levels = model_order)
  
  # Simplify result to either SumStatTune or IndivTune
  tmp$Model[tmp$Model != 'SumStatTune'] <- 'IndivTune'
  tmp$Model[tmp$Model == 'SumStatTune'] <- 'SumStatTune'
  tmp <- tmp[!duplicated(tmp[, c('label','Target','Discovery','Model'), with=F]),]
  
  res_eval_simp <- rbind(res_eval_simp, tmp)
}

# Count the number of traits each method is best
tmp <- res_eval_simp[res_eval_simp$label != 'All',]
best_groups <-
  do.call(rbind, by(tmp, list(
    tmp$Target,
    tmp$gwas_group,
    tmp$Trait
  ), function(subset) {
    subset[which.max(subset$R),]  # Select row with max R
  }))

best_counts <- as.data.frame(table(paste0(best_groups$label,':', best_groups$Model), best_groups$gwas_group, best_groups$Target))

# Rename columns
colnames(best_counts) <- c("label", "gwas_group", "Target", "count")
best_counts$Model<-gsub('.*:','',best_counts$label)
best_counts$label<-gsub(':.*','',best_counts$label)
best_counts$label <- factor(best_counts$label, levels = model_order)

# Remove zero counts to declutter the plot
best_counts <- best_counts[best_counts$count > 0, ]

# Create the plot
ggplot(best_counts[best_counts$Target != 'EUR',], aes(x = label, y = count, fill = Model)) +
  geom_bar(stat = "identity", position = "dodge") +
  facet_wrap(~ Target, scales = 'free_x') +
  theme_half_open() +
  background_grid(major = 'y', minor = 'y') + 
  panel_border() + 
  labs(
    title = "Number of times each method is the best",
    x = "Method",
    y = "Count",
    fill = "GWAS Group"
  ) +
  theme(axis.text.x = element_text(angle = 45, hjust = 1))

#############################
# Identify best methods that improved prediction over next best method by 2% for any trait
# Filter out 'All' from the data
tmp <- res_eval_simp[res_eval_simp$label != 'All',]

# Identify the best method for each trait, but only if it improves by >2%
best_groups <- do.call(rbind, by(tmp, list(tmp$Target, tmp$gwas_group, tmp$Trait), function(subset) {
  if (nrow(subset) > 1) {
    # Sort by R in descending order
    subset <- subset[order(-subset$R), ]
    # Check if the best method is more than 2% better than the second best
    if ((subset$R[1] - subset$R[2]) / subset$R[2] > 0.02) {
      return(subset[1, ])  # Return the best method if criteria met
    } 
  } else {
    return(subset[1, ])  # Handle cases with only one method
  }
  return(NULL)  # Return NULL if criteria not met
}))

# Create a count table with label and model combined
best_counts <- as.data.frame(table(paste0(best_groups$label,':', best_groups$Model), 
                                   best_groups$gwas_group, best_groups$Target))

# Rename columns
colnames(best_counts) <- c("label", "gwas_group", "Target", "count")
best_counts$Model <- gsub('.*:', '', best_counts$label)
best_counts$label <- gsub(':.*', '', best_counts$label)
best_counts$label <- factor(best_counts$label, levels = model_order)

# Remove zero counts to declutter the plot
best_counts <- best_counts[best_counts$count > 0, ]

# Create the plot
library(ggplot2)
ggplot(best_counts[best_counts$Target != 'EUR',], aes(x = label, y = count, fill = Model)) +
  geom_bar(stat = "identity", position = "dodge") +
  facet_wrap(~ Target, scales = 'free_x') +
  theme_minimal() +
  labs(
    title = "Number of times each method is the best (with >2% improvement)",
    x = "Method",
    y = "Count",
    fill = "Model"
  ) +
  theme(axis.text.x = element_text(angle = 45, hjust = 1))


#############################


# Plot results for each phenotype separately
dir.create('~/oliverpainfel/Analyses/crosspop/plots')

for(pheno_i in selected_traits){
  tmp <- res_eval_simp[res_eval_simp$Trait == pheno_i,]
  #tmp <- tmp[tmp$Target != 'EUR',]
  tmp$Discovery_clean <- as.character(tmp$Discovery)
  tmp$Discovery_clean <- paste0(tmp$Discovery_clean, ' GWAS')
  tmp$Target <- paste0(tmp$Target, ' Target')

  png(paste0('~/oliverpainfel/Analyses/crosspop/plots/', pheno_i,'.png'), res=300, width = 3400, height = 2000, units = 'px')
  plot_tmp<-ggplot(tmp, aes(x=label, y=R , fill = Model)) +
    geom_errorbar(aes(ymin = R - SE, ymax = R + SE),
                  width = 0,
                  position = position_dodge(width = 1)) +
    geom_point(stat="identity", position=position_dodge(1), size=3, shape=23) +
    geom_vline(xintercept = seq(1.5, length(unique(tmp$label))), linetype="dotted") +
    labs(y = "R (SE)", x=NULL, fill = NULL, title = info_all$`Trait Description`[info_all$`Trait Label` == pheno_i]) +
    facet_grid(Target ~ Discovery_clean, scales='free', space = 'free_x') +
    theme_half_open() +
    background_grid(major = 'y', minor = 'y') + 
    panel_border() + 
    theme(axis.text.x = element_text(angle = 45, vjust = 1, hjust=1),
          legend.position = "top",
          legend.key.spacing.x = unit(1, "cm"),
          legend.justification = "center")
  print(plot_tmp)
  dev.off()
}

####
# Average results across phenotypes
####

library(MAd)

# Average R across phenotypes
meta_res_eval <- NULL
for(targ_pop_i in targ_pop){
  if(targ_pop_i == 'EAS'){
    disc_pop <- 'EAS'
  }
  if(targ_pop_i == 'AFR'){
    disc_pop <- 'AFR'
  }
  if(targ_pop_i == 'EUR'){
    disc_pop <- c('EAS','AFR')
  }
  for(disc_pop_i in disc_pop){
  
    # Subset res_eval for each scenario
    res_eval_i <- do.call(rbind, lapply(seq_along(res_eval), function(i) {
      x <- res_eval[[i]]
      x$pheno <- names(res_eval)[i]
      x <- x[x$Target == targ_pop_i]
      x <- x[x$gwas_group == paste0('EUR+', disc_pop_i)]
    }))
    
    # Average results for each test across phenotypes
    # Use MAd to account for correlation between them
    res_eval_i$Sample<-'A'
  
    for(group_i in unique(res_eval_i$Group)){
      res_eval_group_i <- res_eval_i[res_eval_i$Group == group_i,]
      missing_pheno <-
        colnames(cors[[targ_pop_i]])[!(colnames(cors[[targ_pop_i]]) %in% unique(res_eval_group_i$pheno))]
      
      if (!all(colnames(cors[[targ_pop_i]]) %in% unique(res_eval_group_i$pheno))) {
        print(paste0(
          'results missing for ',
          targ_pop_i,
          ' ',
          group_i,
          ' ',
          paste0(missing_pheno, collapse = ' ')
        ))
      }
      
      cors_i <- cors[[targ_pop_i]][unique(res_eval_group_i$pheno), unique(res_eval_group_i$pheno)]
      
      meta_res_eval_i <-
        agg(
          id = Sample,
          es = R,
          var = SE ^ 2,
          cor = cors_i,
          method = "BHHR",
          mod = NULL,
          data = res_eval_group_i
        )
      
      tmp <- data.table(Group = group_i,
                        Method = res_eval_group_i$Method[1],
                        Model = res_eval_group_i$Model[1],
                        Source = res_eval_group_i$Source[1],
                        Discovery = res_eval_group_i$Discovery[1],
                        gwas_group = res_eval_group_i$gwas_group[1],
                        Target = targ_pop_i,
                        R = meta_res_eval_i$es,
                        SE = sqrt(meta_res_eval_i$var))
      
      meta_res_eval <- rbind(meta_res_eval, tmp)
    }
  }
}

meta_res_eval$Model<-factor(meta_res_eval$Model, levels=c('IndivTune','SumStatTune','Multi-IndivTune','Multi-SumStatTune'))
meta_res_eval$Discovery<-factor(meta_res_eval$Discovery, levels=c('AFR','EAS','EUR','EUR+AFR','EUR+EAS'))

write.csv(meta_res_eval, '~/oliverpainfel/Analyses/crosspop/r_eval.csv', row.names = F)

# Plot average performance across phenotypes for AFR and EAS targets
tmp <- meta_res_eval
tmp <- tmp[tmp$Target != 'EUR',]
tmp <- merge(tmp, pgs_method_labels, by.x = 'Method', by.y = 'method', all.x = T)
tmp$label[is.na(tmp$label)] <- 'All'
tmp$label[grepl('Multi', tmp$Model) & !(tmp$Method %in% pgs_group_methods) & tmp$label != 'All'] <- paste0(tmp$label[grepl('Multi', tmp$Model) & !(tmp$Method %in% pgs_group_methods) & tmp$label != 'All'], '-multi')
tmp$label <- factor(tmp$label, levels = model_order)
tmp$Discovery_clean <- as.character(tmp$Discovery)
tmp$Discovery_clean[tmp$Discovery == 'EUR'] <- 'EUR GWAS'
tmp$Discovery_clean[tmp$Discovery != 'EUR' & tmp$Source == 'Single'] <- 'Target-matched GWAS'
tmp$Discovery_clean[tmp$Discovery != 'EUR' & tmp$Source == 'Multi'] <- 'Both'
tmp$Discovery_clean <- factor(tmp$Discovery_clean, 
                              levels = c('Target-matched GWAS',
                                         'EUR GWAS',
                                         'Both'))
tmp$Target <- paste0(tmp$Target, ' Target')
tmp$Model[tmp$Model != 'SumStatTune'] <- 'IndivTune'
tmp$Model[tmp$Model == 'SumStatTune'] <- 'SumStatTune'
tmp <- tmp[!duplicated(tmp[, c('label','Target','Discovery_clean','Model'), with=F]),]

png(paste0('~/oliverpainfel/Analyses/crosspop/plots/average_r.png'), res=300, width = 3200, height = 2000, units = 'px')
ggplot(tmp, aes(x=label, y=R , fill = Model)) +
    geom_errorbar(aes(ymin = R - SE, ymax = R + SE),
                  width = 0,
                  position = position_dodge(width = 1)) +
    geom_point(stat="identity", position=position_dodge(1), size=3, shape=23) +
    geom_vline(xintercept = seq(1.5, length(unique(tmp$label))), linetype="dotted") +
    labs(y = "R (SE)", x='Method') +
    facet_grid(Target ~ Discovery_clean, scales='free', space = 'free_x') +
    theme_half_open() +
    background_grid(major = 'y', minor = 'y') + 
    panel_border() + 
    theme(axis.text.x = element_text(angle = 45, vjust = 1, hjust=1),
          legend.position = "top",
          legend.key.spacing.x = unit(1, "cm"),
          legend.justification = "center")
dev.off()

# Plot average performance across phenotypes for EUR using AFR or EAS GWAS
tmp <- meta_res_eval
tmp <- tmp[tmp$Target == 'EUR',]
tmp <- merge(tmp, pgs_method_labels, by.x = 'Method', by.y = 'method', all.x = T)
tmp$label[is.na(tmp$label)] <- 'All'
tmp$label[grepl('Multi', tmp$Model) & !(tmp$Method %in% pgs_group_methods) & tmp$label != 'All'] <- paste0(tmp$label[grepl('Multi', tmp$Model) & !(tmp$Method %in% pgs_group_methods) & tmp$label != 'All'], '-multi')
tmp$label <- factor(tmp$label, levels = model_order)
tmp$Discovery_clean <- as.character(tmp$Discovery)
tmp$Discovery_clean <- paste0(tmp$Discovery_clean, ' GWAS')
tmp$Target <- paste0(tmp$Target, ' Target')
tmp$Model[tmp$Model != 'SumStatTune'] <- 'IndivTune'
tmp$Model[tmp$Model == 'SumStatTune'] <- 'SumStatTune'
tmp <- tmp[!duplicated(tmp[, c('label','Target','Discovery_clean','Model'), with=F]),]

png(paste0('~/oliverpainfel/Analyses/crosspop/plots/average_r_eur.png'), res=300, width = 4000, height = 1500, units = 'px')
ggplot(tmp, aes(x=label, y=R , fill = Model)) +
    geom_errorbar(aes(ymin = R - SE, ymax = R + SE),
                  width = 0,
                  position = position_dodge(width = 1)) +
    geom_point(stat="identity", position=position_dodge(1), size=3, shape=23) +
    geom_vline(xintercept = seq(1.5, length(unique(tmp$label))), linetype="dotted") +
    labs(y = "R (SE)", x='Method') +
    facet_grid(Target ~ Discovery_clean, scales='free', space = 'free_x') +
    theme_half_open() +
    background_grid(major = 'y', minor = 'y') + 
    panel_border() + 
    theme(axis.text.x = element_text(angle = 45, vjust = 1, hjust=1),
          legend.position = "top",
          legend.key.spacing.x = unit(1, "cm"),
          legend.justification = "center")
dev.off()

# Plot performance of -multi models trained using LEOPARD vs using indiv-level data
tmp <- meta_res_eval
tmp <- tmp[tmp$Target != 'EUR',]
tmp <- merge(tmp, pgs_method_labels, by.x = 'Method', by.y = 'method')
tmp$label[grepl('Multi', tmp$Model) & !(tmp$Method %in% pgs_group_methods)] <- paste0(tmp$label[grepl('Multi', tmp$Model) & !(tmp$Method %in% pgs_group_methods)], '-multi')
tmp$label <- factor(tmp$label, levels = unique(tmp$label[order(!(grepl('Multi', tmp$label)), tmp$label)]))
tmp<-tmp[grepl('multi', tmp$label),]
tmp <- tmp[tmp$Model != 'Multi-IndivTune',]
tmp$Model<-as.character(tmp$Model)
tmp$Model[tmp$Model != 'SumStatTune']<-'IndivTune'
tmp$Model[tmp$Model == 'SumStatTune']<-'LEOPARD'
tmp$Target <- paste0(tmp$Target, ' Target')

png(paste0('~/oliverpainfel/Analyses/crosspop/plots/average_r_leopard.png'), res=300, width = 1500, height = 2000, units = 'px')
ggplot(tmp, aes(x=label, y=R , fill = Model)) +
    geom_errorbar(aes(ymin = R - SE, ymax = R + SE),
                  width = 0,
                  position = position_dodge(width = 1)) +
    geom_point(stat="identity", position=position_dodge(1), size=3, shape=23) +
    geom_vline(xintercept = seq(1.5, length(unique(tmp$label))), linetype="dotted") +
    labs(y = "R (SE)", x='Method') +
    facet_grid(Target ~., scales='free', space = 'free_x') +
    theme_half_open() +
    background_grid(major = 'y', minor = 'y') + 
    panel_border() + 
    theme(axis.text.x = element_text(angle = 45, vjust = 1, hjust=1),
          legend.position = "top",
          legend.key.spacing.x = unit(1, "cm"),
          legend.justification = "center")
dev.off()

# Make simplified plot
# Just show performance for IndivTune (or SumStatTune where that is the only option), remove the 'All' model, and keep models using both GWAS.
tmp <- meta_res_eval
tmp <- tmp[tmp$Target != 'EUR',]
tmp <- tmp[tmp$Method != 'all',]
tmp <- tmp[tmp$Source  == 'Multi',]
tmp <- merge(tmp, pgs_method_labels, by.x = 'Method', by.y = 'method', all.x = T)
tmp$label[grepl('Multi', tmp$Model) & !(tmp$Method %in% pgs_group_methods) & tmp$label != 'All'] <- paste0(tmp$label[grepl('Multi', tmp$Model) & !(tmp$Method %in% pgs_group_methods) & tmp$label != 'All'], '-multi')
tmp$label <- factor(tmp$label, levels = model_order)
tmp$Discovery_clean <- as.character(tmp$Discovery)
tmp$Discovery_clean[tmp$Discovery == 'EUR'] <- 'EUR GWAS'
tmp$Discovery_clean[tmp$Discovery != 'EUR' & tmp$Source == 'Single'] <- 'Target-matched GWAS'
tmp$Discovery_clean[tmp$Discovery != 'EUR' & tmp$Source == 'Multi'] <- 'Both'
tmp$Discovery_clean <- factor(tmp$Discovery_clean, 
                              levels = c('Target-matched GWAS',
                                         'EUR GWAS',
                                         'Both'))
tmp$Target <- paste0(tmp$Target, ' Target')
tmp$Model[tmp$Model != 'SumStatTune'] <- 'IndivTune'
tmp$Model[tmp$Model == 'SumStatTune'] <- 'SumStatTune'
tmp <- tmp[!duplicated(tmp[, c('label','Target','Discovery_clean','Model'), with=F]),]
tmp<-tmp[tmp$Model == 'IndivTune',]

png(paste0('~/oliverpainfel/Analyses/crosspop/plots/average_r_simple.png'), res=300, width = 3200, height = 2000, units = 'px')
ggplot(tmp, aes(x=label, y=R)) +
    geom_errorbar(aes(ymin = R - SE, ymax = R + SE),
                  width = 0,
                  position = position_dodge(width = 1)) +
    geom_point(stat="identity", position=position_dodge(1), size=3, shape=23, fill = 'black') +
    geom_vline(xintercept = seq(1.5, length(unique(tmp$label))), linetype="dotted") +
    labs(y = "R (SE)", x='Method') +
    facet_grid(Target ~ ., scales='free', space = 'free_x') +
    theme_half_open() +
    background_grid(major = 'y', minor = 'y') + 
    panel_border() + 
    theme(axis.text.x = element_text(angle = 45, vjust = 1, hjust=1))
dev.off()

tmp<-tmp[tmp$Method %in% c('ldpred2','prscsx','xwing'),]
png(paste0('~/oliverpainfel/Analyses/crosspop/plots/average_r_simple_ldpred2.png'), res=300, width = 500, height = 500, units = 'px')
ggplot(tmp, aes(x=label, y=R)) +
    geom_errorbar(aes(ymin = R - SE, ymax = R + SE),
                  width = 0,
                  position = position_dodge(width = 1)) +
  #  geom_point(stat="identity", position=position_dodge(1), fill = '#3399FF') +
    geom_point(stat="identity", position=position_dodge(1), size=3, shape=23, fill = '#3399FF') +
    geom_vline(xintercept = seq(1.5, length(unique(tmp$label))), linetype="dotted") +
    labs(y = "R (SE)", x='Method') +
    facet_grid(Target ~ ., scales='free', space = 'free_x') +
    theme_half_open() +
    background_grid(major = 'y', minor = 'y') + 
    panel_border() + 
    theme(axis.text.x = element_text(angle = 45, vjust = 1, hjust=1))
dev.off()


####
# Create heatmap showing difference between all methods and models
####

# Create a function to mirror pred_comp results
mirror_comp<-function(x){
  x_sym <- x
  x_sym$Model_1 <- x$Model_2
  x_sym$Model_2 <- x$Model_1
  x_sym$Model_1_R <- x$Model_2_R
  x_sym$Model_2_R <- x$Model_1_R
  x_sym$R_diff <- -x_sym$R_diff
  x_mirrored <- rbind(x, x_sym)
  x_diag<-data.frame(
      Model_1=unique(x_mirrored$Model_1),
      Model_2=unique(x_mirrored$Model_1),
      Model_1_R=x_mirrored$Model_1_R,
      Model_2_R=x_mirrored$Model_1_R,
      R_diff=NA,
      R_diff_pval=NA
    )
  x_comp<-rbind(x_mirrored, x_diag)
  return(x_comp)
}
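# Toy illustration (hypothetical values, for intuition only) of what
# mirror_comp() returns: each A-vs-B comparison is duplicated as B-vs-A with
# the sign of R_diff flipped, and diagonal rows with NA R_diff are appended,
# so the comparisons can be cast into a square heatmap.
# mirror_comp(data.frame(Model_1 = 'methodA', Model_2 = 'methodB',
#                        Model_1_R = 0.30, Model_2_R = 0.25,
#                        R_diff = 0.05, R_diff_pval = 0.04))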
  
# Read in results
targ_pop=c('EUR','EAS','AFR')
res_comp <- list()
for(pheno_i in selected_traits){
  res_comp_i<-NULL
  for(targ_pop_i in targ_pop){
    if(targ_pop_i == 'EAS'){
      disc_pop <- 'EAS'
    }
    if(targ_pop_i == 'AFR'){
      disc_pop <- 'AFR'
    }
    if(targ_pop_i == 'EUR'){
      disc_pop <- c('EAS','AFR')
    }
    for(disc_pop_i in disc_pop){
      comp_i <-
        fread(
          paste0(
            '/users/k1806347/oliverpainfel/Analyses/crosspop/',
            'targ_',
            targ_pop_i,
            '.disc_EUR_',
            disc_pop_i,
            '/',
            pheno_i,
            '/res.pred_comp.txt'
          )
        )
      comp_i<-mirror_comp(comp_i)
      comp_i$Target<-targ_pop_i
      comp_i$gwas_group<-paste0('EUR+', disc_pop_i)
      res_comp_i<-rbind(res_comp_i, comp_i)
    }
  }
  
  res_comp[[pheno_i]]<-res_comp_i
}

res_comp_all <- do.call(rbind, lapply(names(res_comp), function(name) {
  x <- res_comp[[name]]
  x$pheno <- name  # Add a new column with the name of the element
  x  # Return the updated dataframe
}))

# Annotate tests to get order correct
res_comp_all$Method1<-sub('\\..*','',res_comp_all$Model_1)
res_comp_all$Method1<-gsub('-.*','', res_comp_all$Method1)
res_comp_all$Method2<-sub('\\..*','',res_comp_all$Model_2)
res_comp_all$Method2<-gsub('-.*','', res_comp_all$Method2)

find_model<-function(x){
  mod <- x
  mod[grepl('top1$', x) & !grepl('pseudo', x)] <- 'IndivTune'
  mod[grepl('top1$', x) & grepl('pseudo', x)] <- 'SumStatTune'
  mod[grepl('multi$', x) & !grepl('pseudo', x)] <- 'Multi-IndivTune'
  mod[grepl('multi$', x) & grepl('pseudo', x)] <- 'Multi-SumStatTune'
  mod[grepl('_multi', x)] <- 'SumStatTune'
  mod[x == 'prscsx.pseudo.multi'] <- 'SumStatTune'
  mod[x == 'xwing.pseudo.multi'] <- 'SumStatTune'
  
  return(mod)
}

res_comp_all$Model1<-find_model(res_comp_all$Model_1)
res_comp_all$Model2<-find_model(res_comp_all$Model_2)

res_comp_all$Source1<-ifelse(res_comp_all$Method1 %in% pgs_group_methods | grepl('_multi$', res_comp_all$Method1) | !grepl('AFR|EAS|EUR', res_comp_all$Model_1), 'Multi', 'Single')
res_comp_all$Source2<-ifelse(res_comp_all$Method2 %in% pgs_group_methods | grepl('_multi$', res_comp_all$Method2) | !grepl('AFR|EAS|EUR', res_comp_all$Model_2), 'Multi', 'Single')
  
for(i in c('EUR','EAS','AFR')){
  res_comp_all$Discovery1[grepl(i, res_comp_all$Model_1)] <- i
  res_comp_all$Discovery2[grepl(i, res_comp_all$Model_2)] <- i
}
res_comp_all$Discovery1[res_comp_all$Source1 == 'Multi'] <- res_comp_all$gwas_group[res_comp_all$Source1 == 'Multi']
res_comp_all$Discovery2[res_comp_all$Source2 == 'Multi'] <- res_comp_all$gwas_group[res_comp_all$Source2 == 'Multi']

res_comp_all$Method1<-factor(res_comp_all$Method1, levels=unique(res_comp_all$Method1))
res_comp_all$Method2<-factor(res_comp_all$Method2, levels=unique(res_comp_all$Method2))
res_comp_all$Model1<-factor(res_comp_all$Model1, levels=c('IndivTune','SumStatTune','Multi-IndivTune','Multi-SumStatTune'))
res_comp_all$Model2<-factor(res_comp_all$Model2, levels=c('IndivTune','SumStatTune','Multi-IndivTune','Multi-SumStatTune'))
res_comp_all$Discovery1<-factor(res_comp_all$Discovery1, levels=rev(c('AFR','EAS','EUR','EUR+AFR','EUR+EAS')))
res_comp_all$Discovery2<-factor(res_comp_all$Discovery2, levels=c('AFR','EAS','EUR','EUR+AFR','EUR+EAS'))

# Remove IndivTune and Multi-IndivTune model for groups that contain one score (aka QuickPRS and SBayesRC)
res_comp_all <- res_comp_all[
!(res_comp_all$Method1 %in%  c('quickprs','sbayesrc') & 
  res_comp_all$Model1 %in% c('IndivTune','Multi-IndivTune')),]
res_comp_all <- res_comp_all[
!(res_comp_all$Method2 %in%  c('quickprs','sbayesrc') & 
  res_comp_all$Model2 %in% c('IndivTune','Multi-IndivTune')),]

# Remove pseudo model for methods that don't really have one 
res_comp_all <- res_comp_all[
!(res_comp_all$Method1 %in%  c('ptclump','ptclump_multi') & 
  res_comp_all$Model1 %in% c('SumStatTune','Multi-SumStatTune')),]
res_comp_all <- res_comp_all[
!(res_comp_all$Method2 %in%  c('ptclump','ptclump_multi') & 
  res_comp_all$Model2 %in% c('SumStatTune','Multi-SumStatTune')),]

# Remove top1 models for *-multi, PRS-CSx, and X-Wing
res_comp_all <- res_comp_all[
!(grepl('prscsx|xwing|_multi', res_comp_all$Method1) & 
  grepl('top1', res_comp_all$Model_1)),]
res_comp_all <- res_comp_all[
!(grepl('prscsx|xwing|_multi', res_comp_all$Method2) & 
  grepl('top1', res_comp_all$Model_2)),]

# Remove any duplicate comparisons
res_comp_all <- res_comp_all[!duplicated(res_comp_all[, c("Target", "gwas_group", "Method1", "Model1", "Source1", "Discovery1", "Method2", "Model2", "Source2", "Discovery2",'pheno')]),]

res_comp_all$r_diff_rel <- res_comp_all$R_diff / res_comp_all$Model_2_R

# Calculate relative improvement for ldpred2-multi vs ldpred2 as example
tmp_ldpred2 <- res_comp_all[res_comp_all$Model_1 == 'ldpred2.multi' & 
                    grepl('ldpred2-', res_comp_all$Model_2) &
                    res_comp_all$Target == 'AFR',]
tmp_ldpred2 <- tmp_ldpred2[order(-tmp_ldpred2$Model_2_R),]
tmp_ldpred2 <- tmp_ldpred2[!duplicated(tmp_ldpred2$pheno),]
round(min(tmp_ldpred2$r_diff_rel)*100, 1)
round(max(tmp_ldpred2$r_diff_rel)*100, 1)

tmp_ldpred2 <- res_comp_all[res_comp_all$Model_1 == 'ldpred2.multi' & 
                    grepl('ldpred2-', res_comp_all$Model_2) &
                    res_comp_all$Target == 'EAS',]
tmp_ldpred2 <- tmp_ldpred2[order(-tmp_ldpred2$Model_2_R),]
tmp_ldpred2 <- tmp_ldpred2[!duplicated(tmp_ldpred2$pheno),]
round(min(tmp_ldpred2$r_diff_rel)*100, 1)
round(max(tmp_ldpred2$r_diff_rel)*100, 1)

# Calculate relative improvement for sbayesrc-multi vs sbayesrc in EUR target as example
tmp_sbayesrc <- res_comp_all[res_comp_all$Model_1 == 'sbayesrc.pseudo.multi' & 
                    grepl('sbayesrc.pseudo-', res_comp_all$Model_2) &
                    res_comp_all$Target == 'EUR' &
                    res_comp_all$Discovery1 == 'EUR+EAS' &
                    res_comp_all$Discovery2 == 'EUR',]
tmp_sbayesrc <- tmp_sbayesrc[order(-tmp_sbayesrc$Model_2_R),]
round(min(tmp_sbayesrc$r_diff_rel)*100, 1)
round(max(tmp_sbayesrc$r_diff_rel)*100, 1)

tmp_sbayesrc <- res_comp_all[res_comp_all$Model_1 == 'sbayesrc.pseudo.multi' & 
                    grepl('sbayesrc.pseudo-', res_comp_all$Model_2) &
                    res_comp_all$Target == 'EUR' &
                    res_comp_all$Discovery1 == 'EUR+AFR' &
                    res_comp_all$Discovery2 == 'EUR',]
tmp_sbayesrc <- tmp_sbayesrc[order(-tmp_sbayesrc$Model_2_R),]
round(min(tmp_sbayesrc$r_diff_rel)*100, 1)
round(max(tmp_sbayesrc$r_diff_rel)*100, 1)

#####
# Export a csv containing difference results for all traits
#####
# Simplify to contain only IndivTune or SumStatTune result
tmp <- res_comp_all
tmp <- merge(tmp, pgs_method_labels, by.x = 'Method1', by.y = 'method', all.x = T)
tmp$label[is.na(tmp$label)] <- 'All'
names(tmp)[names(tmp) == 'label'] <- 'label1'
tmp <- merge(tmp, pgs_method_labels, by.x = 'Method2', by.y = 'method', all.x = T)
tmp$label[is.na(tmp$label)] <- 'All'
names(tmp)[names(tmp) == 'label'] <- 'label2'

tmp$label1[grepl('Multi', tmp$Model1) & !(tmp$Method1 %in% pgs_group_methods) & tmp$label1 != 'All'] <- paste0(tmp$label1[grepl('Multi', tmp$Model1) & !(tmp$Method1 %in% pgs_group_methods) & tmp$label1 != 'All'], '-multi')
tmp$label2[grepl('Multi', tmp$Model2) & !(tmp$Method2 %in% pgs_group_methods) & tmp$label2 != 'All'] <- paste0(tmp$label2[grepl('Multi', tmp$Model2) & !(tmp$Method2 %in% pgs_group_methods) & tmp$label2 != 'All'], '-multi')

tmp$Model1[tmp$Model1 != 'SumStatTune'] <- 'IndivTune'
tmp$Model1[tmp$Model1 == 'SumStatTune'] <- 'SumStatTune'
tmp$Model2[tmp$Model2 != 'SumStatTune'] <- 'IndivTune'
tmp$Model2[tmp$Model2 == 'SumStatTune'] <- 'SumStatTune'

tmp<-tmp[tmp$Model_1 %in% res_eval_simp$Group,]
tmp<-tmp[tmp$Model_2 %in% res_eval_simp$Group,]

tmp$`Model 1` <- paste0(tmp$label1, ' - ', tmp$Model1, ' - ', tmp$Discovery1)
tmp$`Model 2` <- paste0(tmp$label2, ' - ', tmp$Model2, ' - ', tmp$Discovery2)

tmp <- tmp[, c('Target', 'pheno', 'Model 1', 'Model 2', 'Model_1_R', 'Model_2_R', 'R_diff', 'R_diff_pval'), with=F]
names(tmp) <- c('Target', 'Trait','Model 1', 'Model 2', "R (Model 1)", "R (Model 2)", "R difference (Model 1 R - Model 2 R)", "R difference p-value")

tmp<-tmp[order(tmp$Target, tmp$Trait, tmp$`Model 1`, tmp$`Model 2`),]
tmp$`R difference (Model 1 R - Model 2 R)` <- round(tmp$`R difference (Model 1 R - Model 2 R)`, 3)
tmp$`R (Model 1)` <- round(tmp$`R (Model 1)`, 3)
tmp$`R (Model 2)` <- round(tmp$`R (Model 2)`, 3)

write.csv(tmp, '~/oliverpainfel/Analyses/crosspop/r_diff.csv', row.names=F)

###########

library(MAd)

# Average R differences across phenotypes
meta_res_comp <- NULL
for(targ_pop_i in targ_pop){
  if(targ_pop_i == 'EAS'){
    disc_pop <- 'EAS'
  }
  if(targ_pop_i == 'AFR'){
    disc_pop <- 'AFR'
  }
  if(targ_pop_i == 'EUR'){
    disc_pop <- c('EAS','AFR')
  }
  for(disc_pop_i in disc_pop){
  
    # Subset res_comp for each scenario
    res_comp_i <- res_comp_all[res_comp_all$Target == targ_pop_i & res_comp_all$gwas_group == paste0('EUR+', disc_pop_i)]
  
    # Calculate diff SE based on p-value
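    # The SE is back-calculated from the two-sided p-value: z = qnorm(p/2) gives a
    # negative z-score, so SE = |R_diff / z|. For example, R_diff = 0.02 with
    # p = 0.05 gives z of about -1.96 and an SE of about 0.0102.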
    res_comp_i$R_diff_pval[res_comp_i$R_diff == 0] <- 1-0.001
    res_comp_i$R_diff_pval[res_comp_i$R_diff_pval == 1]<-1-0.001
    res_comp_i$R_diff_z<-qnorm(res_comp_i$R_diff_pval/2)
    res_comp_i$R_diff_SE<-abs(res_comp_i$R_diff/res_comp_i$R_diff_z)
        
    # Average results for each test across phenotypes
    # Use MAd to account for correlation between them
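    # MAd::agg() with method = "BHHR" pools the per-phenotype R differences into a
    # single effect size and variance, using the phenotype correlation matrix
    # (cor = cors_i) to account for the overlap between outcomes in the same sample.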
    res_comp_i$Sample<-'A'
    res_comp_i$Group <- paste0(res_comp_i$Model_1, '_vs_', res_comp_i$Model_2)
  
    for(group_i in unique(res_comp_i$Group)){
      res_comp_group_i <- res_comp_i[res_comp_i$Group == group_i,]
      cors_i <- cors[[targ_pop_i]][unique(res_comp_group_i$pheno), unique(res_comp_group_i$pheno)]
      
      if(res_comp_group_i$Model_1[1] != res_comp_group_i$Model_2[1]){
        
        meta_res_comp_i <-
          agg(
            id = Sample,
            es = R_diff,
            var = R_diff_SE ^ 2,
            cor = cors_i,
            method = "BHHR",
            mod = NULL,
            data = res_comp_group_i
          )
        
        tmp <- res_comp_group_i[1,]
        tmp$pheno <- NULL
        tmp$Model_1_R <-
          meta_res_eval$R[meta_res_eval$Group == tmp$Model_1 &
                            meta_res_eval$Target == targ_pop_i &
                            meta_res_eval$gwas_group == paste0('EUR+', disc_pop_i)]
        tmp$Model_2_R <-
          meta_res_eval$R[meta_res_eval$Group == tmp$Model_2 &
                            meta_res_eval$Target == targ_pop_i &
                            meta_res_eval$gwas_group == paste0('EUR+', disc_pop_i)]
        tmp$R_diff <- meta_res_comp_i$es
        tmp$R_diff_SE <- sqrt(meta_res_comp_i$var)
        tmp$R_diff_z <- tmp$R_diff / tmp$R_diff_SE
        tmp$R_diff_p <- 2*pnorm(-abs(tmp$R_diff_z))
      } else {
        tmp <- res_comp_group_i[1,]
        tmp$pheno <- NULL
        tmp$R_diff <- NA
        tmp$R_diff_SE <- NA
        tmp$R_diff_z <- NA
        tmp$R_diff_p <- NA
      }
      meta_res_comp <- rbind(meta_res_comp, tmp)
    }
  }
}

meta_res_comp$R_diff_perc <- meta_res_comp$R_diff / meta_res_comp$Model_2_R
  
# Extract average improvement for ldpred2-multi vs ldpred2 as example
tmp_ldpred2 <- meta_res_comp[meta_res_comp$Model_1 == 'ldpred2.multi' & 
                    grepl('ldpred2-', meta_res_comp$Model_2) &
                    meta_res_comp$Target == 'AFR',]
tmp_ldpred2 <- tmp_ldpred2[order(-tmp_ldpred2$Model_2_R),]
round(min(tmp_ldpred2$R_diff_perc)*100, 1)

tmp_ldpred2 <- meta_res_comp[meta_res_comp$Model_1 == 'ldpred2.multi' & 
                    grepl('ldpred2-', meta_res_comp$Model_2) &
                    meta_res_comp$Target == 'EAS',]
tmp_ldpred2 <- tmp_ldpred2[order(-tmp_ldpred2$Model_2_R),]
round(min(tmp_ldpred2$R_diff_perc)*100, 1)

# Extract average improvement for sbayesrc-multi vs sbayesrc in EUR as example
tmp_sbayesrc <- meta_res_comp[meta_res_comp$Model_1 == 'sbayesrc.pseudo.multi' & 
                    grepl('sbayesrc.pseudo', meta_res_comp$Model_2) &
                    meta_res_comp$Target == 'EUR' &
                    meta_res_comp$Discovery1 == 'EUR+AFR' &
                    meta_res_comp$Discovery2 == 'EUR',]
round(tmp_sbayesrc$R_diff_perc*100, 1)

tmp_sbayesrc <- meta_res_comp[meta_res_comp$Model_1 == 'sbayesrc.pseudo.multi' & 
                    grepl('sbayesrc.pseudo', meta_res_comp$Model_2) &
                    meta_res_comp$Target == 'EUR' &
                    meta_res_comp$Discovery1 == 'EUR+EAS' &
                    meta_res_comp$Discovery2 == 'EUR',]
round(tmp_sbayesrc$R_diff_perc*100, 1)

# Extract average improvement of the All model over SBayesRC (both using the EUR GWAS) in AFR and EAS targets
tmp_sbayesrc <- meta_res_comp[meta_res_comp$Model_2 == 'sbayesrc.pseudo-EUR.top1' &
                    meta_res_comp$Model_1 == 'all-EUR.top1' &
                    meta_res_comp$Target == 'AFR',]
round(tmp_sbayesrc$R_diff_perc*100, 1)
tmp_sbayesrc$R_diff_p

tmp_sbayesrc <- meta_res_comp[meta_res_comp$Model_2 == 'sbayesrc.pseudo-EUR.top1' &
                    meta_res_comp$Model_1 == 'all-EUR.top1' &
                    meta_res_comp$Target == 'EAS',]
round(tmp_sbayesrc$R_diff_perc*100, 1)
tmp_sbayesrc$R_diff_p


# Compare QuickPRS-Multi vs QuickPRS to evaluate LEOPARD performance
tmp_quickprs <- meta_res_comp[meta_res_comp$Model_1 == 'quickprs_multi.pseudo.multi' & 
                                meta_res_comp$Model_2 == 'quickprs.pseudo.multi' &
                    meta_res_comp$Target == 'AFR',]
round(min(tmp_quickprs$R_diff_perc)*100, 1)

tmp_quickprs <- meta_res_comp[meta_res_comp$Model_1 == 'quickprs_multi.pseudo.multi' & 
                                meta_res_comp$Model_2 == 'quickprs.pseudo.multi' &
                    meta_res_comp$Target == 'EAS',]
round(min(tmp_quickprs$R_diff_perc)*100, 1)

# Compare all.multi method to next best method
tmp_all <- meta_res_comp[meta_res_comp$Model_1 == 'all.multi' &
                    meta_res_comp$Target == 'AFR' &
                    meta_res_comp$Source2 == 'Multi',]
tmp_all <- tmp_all[order(tmp_all$R_diff),]
tmp_all <- tmp_all[1,]
round(tmp_all$R_diff_perc*100, 1)
tmp_all$R_diff_p

tmp_all <- meta_res_comp[meta_res_comp$Model_1 == 'all.multi' &
                    meta_res_comp$Target == 'EAS' &
                    meta_res_comp$Source2 == 'Multi',]
tmp_all <- tmp_all[order(tmp_all$R_diff),]
tmp_all <- tmp_all[1,]
round(tmp_all$R_diff_perc*100, 1)
tmp_all$R_diff_p

# Compare the All model using only the target-matched GWAS to the next best method
tmp_all <- meta_res_comp[meta_res_comp$Model_1 == 'all-AFR.top1' &
                    meta_res_comp$Target == 'AFR' &
                    meta_res_comp$Discovery1 == 'AFR' &
                    meta_res_comp$Discovery2 == 'AFR',]
tmp_all <- tmp_all[order(tmp_all$R_diff),]
tmp_all <- tmp_all[1,]
round(tmp_all$R_diff_perc*100, 1)
tmp_all$R_diff_p

tmp_all <- meta_res_comp[meta_res_comp$Model_1 == 'all-EAS.top1' &
                    meta_res_comp$Target == 'EAS' &
                    meta_res_comp$Discovery1 == 'EAS' &
                    meta_res_comp$Discovery2 == 'EAS',]
tmp_all <- tmp_all[order(tmp_all$R_diff),]
tmp_all <- tmp_all[1,]
round(tmp_all$R_diff_perc*100, 1)
tmp_all$R_diff_p

#####
# Export a csv containing average difference results across traits
#####
# Simplify to contain only IndivTune or SumStatTune result
tmp <- meta_res_comp
tmp <- merge(tmp, pgs_method_labels, by.x = 'Method1', by.y = 'method', all.x = T)
tmp$label[is.na(tmp$label)] <- 'All'
names(tmp)[names(tmp) == 'label'] <- 'label1'
tmp <- merge(tmp, pgs_method_labels, by.x = 'Method2', by.y = 'method', all.x = T)
tmp$label[is.na(tmp$label)] <- 'All'
names(tmp)[names(tmp) == 'label'] <- 'label2'

tmp$label1[grepl('Multi', tmp$Model1) & !(tmp$Method1 %in% pgs_group_methods) & tmp$label1 != 'All'] <- paste0(tmp$label1[grepl('Multi', tmp$Model1) & !(tmp$Method1 %in% pgs_group_methods) & tmp$label1 != 'All'], '-multi')
tmp$label2[grepl('Multi', tmp$Model2) & !(tmp$Method2 %in% pgs_group_methods) & tmp$label2 != 'All'] <- paste0(tmp$label2[grepl('Multi', tmp$Model2) & !(tmp$Method2 %in% pgs_group_methods) & tmp$label2 != 'All'], '-multi')

tmp$Model1[tmp$Model1 != 'SumStatTune'] <- 'IndivTune'
tmp$Model1[tmp$Model1 == 'SumStatTune'] <- 'SumStatTune'
tmp$Model2[tmp$Model2 != 'SumStatTune'] <- 'IndivTune'
tmp$Model2[tmp$Model2 == 'SumStatTune'] <- 'SumStatTune'

tmp<-tmp[tmp$Model_1 %in% res_eval_simp$Group,]
tmp<-tmp[tmp$Model_2 %in% res_eval_simp$Group,]

tmp$`Model 1` <- paste0(tmp$label1, ' - ', tmp$Model1, ' - ', tmp$Discovery1)
tmp$`Model 2` <- paste0(tmp$label2, ' - ', tmp$Model2, ' - ', tmp$Discovery2)

tmp$`Percentage change (R difference / Model 2 R)` <- paste0(round(tmp$R_diff_perc * 100, 1), '%')

tmp <- tmp[, c('Target', 'Model 1', 'Model 2', 'Model_1_R', 'Model_2_R', 'R_diff',"Percentage change (R difference / Model 2 R)", 'R_diff_p'), with=F]
names(tmp) <- c('Target','Model 1', 'Model 2', "R (Model 1)", "R (Model 2)", "R difference (Model 1 R - Model 2 R)", "Percentage change (R difference / Model 2 R)", "R difference p-value")

tmp<-tmp[order(tmp$Target, tmp$`Model 1`, tmp$`Model 2`),]
tmp$`R difference (Model 1 R - Model 2 R)` <- round(tmp$`R difference (Model 1 R - Model 2 R)`, 3)
tmp$`R (Model 1)` <- round(tmp$`R (Model 1)`, 3)
tmp$`R (Model 2)` <- round(tmp$`R (Model 2)`, 3)

write.csv(tmp, '~/oliverpainfel/Analyses/crosspop/r_diff_average.csv', row.names=F)

############

# Bin the average R differences into categories for plotting
meta_res_comp$R_diff_catagory <- cut(
    meta_res_comp$R_diff,
    breaks = c(-Inf, -0.08, -0.025, -0.002, 0.002, 0.025, 0.08, Inf),
    labels = c('< -0.08', '-0.08 - -0.025', '-0.025 - -0.002', '-0.002 - 0.002', '0.002 - 0.025', '0.025 - 0.08', '> 0.08'),
    right = FALSE
)
meta_res_comp$R_diff_catagory <- factor(meta_res_comp$R_diff_catagory, levels = rev(levels(meta_res_comp$R_diff_catagory)))

# Assign significance stars
meta_res_comp$indep_star<-' '
meta_res_comp$indep_star[meta_res_comp$R_diff_p < 0.05]<-'*'
meta_res_comp$indep_star[meta_res_comp$R_diff_p < 1e-3]<-'**'
# meta_res_comp$indep_star[meta_res_comp$R_diff_p < 1e-6]<-'***'

meta_res_comp<-meta_res_comp[order(meta_res_comp$Discovery1, meta_res_comp$Discovery2, meta_res_comp$Method1),]

for(targ_pop_i in targ_pop){
  if(targ_pop_i == 'EAS'){
    disc_pop <- 'EAS'
  }
  if(targ_pop_i == 'AFR'){
    disc_pop <- 'AFR'
  }
  if(targ_pop_i == 'EUR'){
    disc_pop <- c('EAS','AFR')
  }
  for(disc_pop_i in disc_pop){

    tmp <- meta_res_comp[meta_res_comp$Target == targ_pop_i, ]

    tmp <- merge(tmp, pgs_method_labels, by.x = 'Method1', by.y = 'method', all.x = T)
    tmp$label[is.na(tmp$label)] <- 'All'
    names(tmp)[names(tmp) == 'label'] <- 'label1'
    tmp <- merge(tmp, pgs_method_labels, by.x = 'Method2', by.y = 'method', all.x = T)
    tmp$label[is.na(tmp$label)] <- 'All'
    names(tmp)[names(tmp) == 'label'] <- 'label2'
    
    tmp$label1[grepl('Multi', tmp$Model1) & !(tmp$Method1 %in% pgs_group_methods) & tmp$label1 != 'All'] <- paste0(tmp$label1[grepl('Multi', tmp$Model1) & !(tmp$Method1 %in% pgs_group_methods) & tmp$label1 != 'All'], '-multi')
    tmp$label2[grepl('Multi', tmp$Model2) & !(tmp$Method2 %in% pgs_group_methods) & tmp$label2 != 'All'] <- paste0(tmp$label2[grepl('Multi', tmp$Model2) & !(tmp$Method2 %in% pgs_group_methods) & tmp$label2 != 'All'], '-multi')
    
    tmp$Model1[tmp$Model1 != 'SumStatTune'] <- 'IndivTune'
    tmp$Model1[tmp$Model1 == 'SumStatTune'] <- 'SumStatTune'
    tmp$Model2[tmp$Model2 != 'SumStatTune'] <- 'IndivTune'
    tmp$Model2[tmp$Model2 == 'SumStatTune'] <- 'SumStatTune'
    
    tmp<-tmp[tmp$Model_1 %in% res_eval_simp$Group,]
    tmp<-tmp[tmp$Model_2 %in% res_eval_simp$Group,]

    tmp$label1 <- factor(tmp$label1, levels = model_order)
    tmp$label2 <- factor(tmp$label2, levels = model_order)

    tmp<-tmp[order(tmp$label1, tmp$label2),]
    
    tmp$label1 <- paste0(tmp$label1," (", ifelse(tmp$Model1 == 'SumStatTune', 'ST', 'IT'), ")")
    tmp$label2 <- paste0(tmp$label2," (", ifelse(tmp$Model2 == 'SumStatTune', 'ST', 'IT'), ")")
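    # ST = SumStatTune, IT = IndivTune (abbreviations used in the axis labels)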

    tmp$label1 <- factor(tmp$label1, levels = unique(tmp$label1))
    tmp$label2 <- factor(tmp$label2, levels = unique(tmp$label2))
    
    tmp <- tmp[tmp$gwas_group == paste0('EUR+', disc_pop_i), ]
    
    plot_tmp <- ggplot(data = tmp, aes(label2, label1, fill = R_diff_catagory)) +
      geom_tile(color = "white", show.legend = TRUE) +
      labs(y = 'Test', x = 'Comparison', fill = 'R difference', title = paste0('Target: ', targ_pop_i)) +
      facet_grid(Discovery1 ~ Discovery2, scales = 'free', space = 'free', switch="both") +
      geom_text(
        data = tmp,
        aes(label2, label1, label = indep_star),
        color = "black",
        size = 4,
        angle = 0,
        vjust = 0.8
      ) +
      scale_fill_brewer(
        breaks = levels(tmp$R_diff_catagory),
        palette = "RdBu",
        drop = F,
        na.value = 'grey'
      ) +
      theme_half_open() +
      background_grid() +
      panel_border() +
      theme(axis.text.x = element_text(
        angle = 45,
        vjust = 1,
        hjust = 1
      ))
    
    png(paste0('~/oliverpainfel/Analyses/crosspop/plots/average_r_diff.Discovery_EUR_', disc_pop_i,'.Target_', targ_pop_i, '.png'), res=300, width = 4400, height = 3200, units = 'px')
      print(plot_tmp)
    dev.off()
  }
}

####
# Plot relative improvement of methods
####
# Plot improvement relative to the combined 'All' multi-source model, which provides an interpretable reference

meta_res_comp_ptclump_top1<-meta_res_comp[meta_res_comp$Method2 == 'all' & meta_res_comp$Source2 == 'Multi',]
meta_res_comp_ptclump_top1$reference_point<-F
meta_res_comp_ptclump_top1$reference_point[meta_res_comp_ptclump_top1$Method1 == 'all' & meta_res_comp_ptclump_top1$Source1 == 'Multi']<-T
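# The reference model compared with itself has an NA difference; set it to 0 so
# the reference point is plotted at the baseline.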
meta_res_comp_ptclump_top1$R_diff[is.na(meta_res_comp_ptclump_top1$R_diff)]<-0
meta_res_comp_ptclump_top1$Discovery1 <- factor(meta_res_comp_ptclump_top1$Discovery1, levels=rev(levels(meta_res_comp_ptclump_top1$Discovery1)))

res_comp_all_ptclump_top1<-res_comp_all[res_comp_all$Method2 == 'all' & res_comp_all$Source2 == 'Multi',]
res_comp_all_ptclump_top1$Discovery1 <-  factor(res_comp_all_ptclump_top1$Discovery1, levels=levels(meta_res_comp_ptclump_top1$Discovery1))

# Create data to plot reference points
meta_res_comp_reference <- meta_res_comp_ptclump_top1
meta_res_comp_reference$R_diff[meta_res_comp_ptclump_top1$reference_point == F] <- NA
meta_res_comp_reference$R_diff_SE [meta_res_comp_ptclump_top1$reference_point == F] <- NA
res_comp_all_ptclump_top1$reference_point<-F

meta_tmp <- meta_res_comp_ptclump_top1
meta_tmp <- merge(meta_tmp, pgs_method_labels, by.x = 'Method1', by.y = 'method', all.x = T)
meta_tmp$label[is.na(meta_tmp$label)] <- 'All'
meta_tmp$label[grepl('Multi', meta_tmp$Model1) & !(meta_tmp$Method1 %in% pgs_group_methods) & meta_tmp$label != 'All'] <- paste0(meta_tmp$label[grepl('Multi', meta_tmp$Model1) & !(meta_tmp$Method1 %in% pgs_group_methods) & meta_tmp$label != 'All'], '-multi')
meta_tmp$label <- factor(meta_tmp$label, levels = model_order)
meta_tmp$Discovery_clean <- as.character(meta_tmp$Discovery1)
meta_tmp$Discovery_clean[meta_tmp$Discovery1 == 'EUR'] <- 'EUR GWAS'
meta_tmp$Discovery_clean[meta_tmp$Discovery1 != 'EUR' & meta_tmp$Source1 == 'Single'] <- 'Target-matched GWAS'
meta_tmp$Discovery_clean[meta_tmp$Discovery1 != 'EUR' & meta_tmp$Source1 == 'Multi'] <- 'Both'
meta_tmp$Discovery_clean <- factor(meta_tmp$Discovery_clean, 
                              levels = c('Target-matched GWAS',
                                         'EUR GWAS',
                                         'Both'))
meta_tmp$Target <- paste0(meta_tmp$Target, ' Target')
meta_tmp$Model1 <- factor(meta_tmp$Model1, levels = c('IndivTune','SumStatTune','Multi-IndivTune','Multi-SumStatTune'))

meta_tmp_ref <- meta_res_comp_reference
meta_tmp_ref <- merge(meta_tmp_ref, pgs_method_labels, by.x = 'Method1', by.y = 'method', all.x = T)
meta_tmp_ref$label[is.na(meta_tmp_ref$label)] <- 'All'
meta_tmp_ref$label[grepl('Multi', meta_tmp_ref$Model1) & !(meta_tmp_ref$Method1 %in% pgs_group_methods) & meta_tmp_ref$label != 'All'] <- paste0(meta_tmp_ref$label[grepl('Multi', meta_tmp_ref$Model1) & !(meta_tmp_ref$Method1 %in% pgs_group_methods) & meta_tmp_ref$label != 'All'], '-multi')
meta_tmp_ref$label <- factor(meta_tmp_ref$label, levels = model_order)
meta_tmp_ref$Discovery_clean <- as.character(meta_tmp_ref$Discovery1)
meta_tmp_ref$Discovery_clean[meta_tmp_ref$Discovery1 == 'EUR'] <- 'EUR GWAS'
meta_tmp_ref$Discovery_clean[meta_tmp_ref$Discovery1 != 'EUR' & meta_tmp_ref$Source1 == 'Single'] <- 'Target-matched GWAS'
meta_tmp_ref$Discovery_clean[meta_tmp_ref$Discovery1 != 'EUR' & meta_tmp_ref$Source1 == 'Multi'] <- 'Both'
meta_tmp_ref$Discovery_clean <- factor(meta_tmp_ref$Discovery_clean, 
                              levels = c('Target-matched GWAS',
                                         'EUR GWAS',
                                         'Both'))
meta_tmp_ref$Target <- paste0(meta_tmp_ref$Target, ' Target')
meta_tmp_ref$Model1 <- factor(meta_tmp_ref$Model1, levels = c('IndivTune','SumStatTune','Multi-IndivTune','Multi-SumStatTune'))

tmp <- res_comp_all_ptclump_top1
tmp <- merge(tmp, pgs_method_labels, by.x = 'Method1', by.y = 'method', all.x = T)
tmp$label[is.na(tmp$label)] <- 'All'
tmp$label[grepl('Multi', tmp$Model1) & !(tmp$Method1 %in% pgs_group_methods) & tmp$label != 'All'] <- paste0(tmp$label[grepl('Multi', tmp$Model1) & !(tmp$Method1 %in% pgs_group_methods) & tmp$label != 'All'], '-multi')
tmp$label <- factor(tmp$label, levels = model_order)
tmp$Discovery_clean <- as.character(tmp$Discovery1)
tmp$Discovery_clean[tmp$Discovery1 == 'EUR'] <- 'EUR GWAS'
tmp$Discovery_clean[tmp$Discovery1 != 'EUR' & tmp$Source1 == 'Single'] <- 'Target-matched GWAS'
tmp$Discovery_clean[tmp$Discovery1 != 'EUR' & tmp$Source1 == 'Multi'] <- 'Both'
tmp$Discovery_clean <- factor(tmp$Discovery_clean, 
                              levels = c('Target-matched GWAS',
                                         'EUR GWAS',
                                         'Both'))
tmp$Target <- paste0(tmp$Target, ' Target')
tmp$Model1 <- factor(tmp$Model1, levels = c('IndivTune','SumStatTune','Multi-IndivTune','Multi-SumStatTune'))

ggplot(meta_tmp, aes(x=label, y=R_diff , fill = Model1)) +
    geom_point(
        data = tmp,
        mapping = aes(x=label, y=R_diff, colour=Model1),
        position = position_jitterdodge(jitter.width = 0.2, dodge.width = 0.7),
        alpha = 0.3
      ) +
    geom_hline(yintercept = 0) +
      geom_errorbar(
        aes(
          ymin = R_diff - R_diff_SE,
          ymax = R_diff + R_diff_SE
        ),
        width = 0,
        position = position_dodge(width = 0.7)
      ) +    
    geom_point(
      stat = "identity",
      position = position_dodge(0.7),
      size = 3,
      shape = 23
    ) +
    geom_point(
        data = meta_tmp_ref,
        aes(x = label, y = R_diff, fill = Model1),
        stat = "identity",
        position = position_dodge(0.7), # Ensure same dodge as other points
        size = 3,    # Increase size for emphasis
        shape = 22,
        stroke = 1.5,
        show.legend=F
    ) +
    geom_vline(xintercept = seq(1.5, length(unique(meta_tmp$label))), linetype="dotted") +
    labs(y = "R_diff (SE)") +
    facet_grid(Target ~ Discovery_clean, scales='free', space = 'free_x') +
    theme_half_open() +
    background_grid() + 
    panel_border() + 
    theme(axis.text.x = element_text(angle = 45, vjust = 1, hjust=1))


# Plot as % change
meta_tmp$R_diff_perc <- meta_tmp$R_diff / meta_tmp$Model_2_R
meta_tmp_ref$R_diff_perc <- meta_tmp_ref$R_diff / meta_tmp_ref$Model_2_R
tmp$R_diff_perc <- tmp$R_diff / tmp$Model_2_R

meta_tmp$R_diff_perc_SE <- meta_tmp$R_diff_SE / meta_tmp$Model_2_R

library(scales)
ggplot(meta_tmp, aes(x=label, y=R_diff_perc , fill = Model1)) +
    geom_point(
        data = tmp,
        mapping = aes(x=label, y=R_diff_perc, colour=Model1),
        position = position_jitterdodge(jitter.width = 0.2, dodge.width = 0.7),
        alpha = 0.3
      ) +
    geom_hline(yintercept = 0) +
      geom_errorbar(
        aes(
          ymin = R_diff_perc - R_diff_perc_SE,
          ymax = R_diff_perc + R_diff_perc_SE
        ),
        width = 0,
        position = position_dodge(width = 0.7)
      ) +    
    geom_point(
      stat = "identity",
      position = position_dodge(0.7),
      size = 3,
      shape = 23
    ) +
    geom_point(
        data = meta_tmp_ref,
        aes(x = label, y = R_diff_perc, fill = Model1),
        stat = "identity",
        position = position_dodge(0.7), # Ensure same dodge as other points
        size = 3,    # Increase size for emphasis
        shape = 22,
        stroke = 1.5,
        show.legend=F
    ) +
    scale_y_continuous(labels = percent_format()) +
    geom_vline(xintercept = seq(1.5, length(unique(meta_tmp$label))), linetype="dotted") +
    labs(y = "R diff. (SE)") +
    facet_grid(Target ~ Discovery_clean, scales='free', space = 'free_x') +
    theme_half_open() +
    background_grid() + 
    panel_border() + 
    theme(axis.text.x = element_text(angle = 45, vjust = 1, hjust=1))

# Simplify results to show only the IndivTune and SumStatTune models (i.e. with or without individual-level tuning data)
meta_tmp_simple <- meta_tmp
meta_tmp_simple$Model1[meta_tmp_simple$Model1 != 'SumStatTune'] <- 'IndivTune'
meta_tmp_simple$Model1[meta_tmp_simple$Model1 == 'SumStatTune'] <- 'SumStatTune'
meta_tmp_simple$Model2[meta_tmp_simple$Model2 != 'SumStatTune'] <- 'IndivTune'
meta_tmp_simple$Model2[meta_tmp_simple$Model2 == 'SumStatTune'] <- 'SumStatTune'
meta_tmp_simple<-meta_tmp_simple[meta_tmp_simple$Model_1 %in% res_eval_simp$Group,]
meta_tmp_simple<-meta_tmp_simple[meta_tmp_simple$Model_2 %in% res_eval_simp$Group,]

meta_tmp_ref_simple <- meta_tmp_ref
meta_tmp_ref_simple$Model1[meta_tmp_ref_simple$Model1 != 'SumStatTune'] <- 'IndivTune'
meta_tmp_ref_simple$Model1[meta_tmp_ref_simple$Model1 == 'SumStatTune'] <- 'SumStatTune'
meta_tmp_ref_simple$Model2[meta_tmp_ref_simple$Model2 != 'SumStatTune'] <- 'IndivTune'
meta_tmp_ref_simple$Model2[meta_tmp_ref_simple$Model2 == 'SumStatTune'] <- 'SumStatTune'
meta_tmp_ref_simple<-meta_tmp_ref_simple[meta_tmp_ref_simple$Model_1 %in% res_eval_simp$Group,]
meta_tmp_ref_simple<-meta_tmp_ref_simple[meta_tmp_ref_simple$Model_2 %in% res_eval_simp$Group,]

tmp_simple <- tmp
tmp_simple$Model1[tmp_simple$Model1 != 'SumStatTune'] <- 'IndivTune'
tmp_simple$Model1[tmp_simple$Model1 == 'SumStatTune'] <- 'SumStatTune'
tmp_simple$Model2[tmp_simple$Model2 != 'SumStatTune'] <- 'IndivTune'
tmp_simple$Model2[tmp_simple$Model2 == 'SumStatTune'] <- 'SumStatTune'
tmp_simple<-tmp_simple[tmp_simple$Model_1 %in% res_eval_simp$Group,]
tmp_simple<-tmp_simple[tmp_simple$Model_2 %in% res_eval_simp$Group,]

# Export plot for manuscript
png('~/oliverpainfel/Analyses/crosspop/plots/average_r.perc_improv.png', width = 3200, height = 2000, res= 300, units = 'px')
ggplot(meta_tmp_simple[meta_tmp_simple$Target != 'EUR Target',], aes(x=label, y=R_diff_perc , fill = Model1)) +
#    geom_boxplot(
#      data = tmp_simple[tmp_simple$Target != 'EUR Target',],
#        mapping = aes(x=label, y=R_diff_perc, colour=Model1),
#        position = position_dodge(0.7),
#        alpha = 0.3
#      ) +
    geom_point(
        data = tmp_simple[tmp_simple$Target != 'EUR Target',],
        mapping = aes(x=label, y=R_diff_perc, colour=Model1),
        position = position_jitterdodge(jitter.width = 0.2, dodge.width = 0.7),
        alpha = 0.3
      ) +
    geom_hline(yintercept = 0) +
      geom_errorbar(
        aes(
          ymin = R_diff_perc - R_diff_perc_SE,
          ymax = R_diff_perc + R_diff_perc_SE
        ),
        width = 0,
        position = position_dodge(width = 0.7)
      ) +    
    geom_point(
      stat = "identity",
      position = position_dodge(0.7),
      size = 3,
      shape = 23
    ) +
    geom_point(
        data = meta_tmp_ref_simple[meta_tmp_ref_simple$Target != 'EUR Target',],
        aes(x = label, y = R_diff_perc, fill = Model1),
        stat = "identity",
        position = position_dodge(0.7), # Ensure same dodge as other points
        size = 4,
        shape = 22,
        show.legend=F
    ) +
    geom_vline(xintercept = seq(1.5, length(unique(meta_tmp_simple$label))), linetype="dotted") +
    scale_y_continuous(labels = percent_format()) +
    labs(y = "Relative Improvement (SE)", fill = NULL, colour = NULL, x = NULL) +
    facet_grid(Target ~ Discovery_clean, scales='free', space = 'free_x') +
    theme_half_open() +
    background_grid(major = 'y', minor = 'y') + 
    panel_border() + 
    theme(
        axis.text.x = element_text(angle = 45, vjust = 1, hjust = 1),  # Increase x-axis labels
        legend.position = "top",
        legend.key.spacing.x = unit(2, "cm"),
        legend.justification = "center"
    )
dev.off()

# Plot for EUR
meta_tmp_simple$Discovery_clean <- paste0(meta_tmp_simple$Discovery1,' GWAS')
meta_tmp_ref_simple$Discovery_clean <- paste0(meta_tmp_ref_simple$Discovery1,' GWAS')
tmp_simple$Discovery_clean <- paste0(tmp_simple$Discovery1,' GWAS')

meta_tmp_simple<-meta_tmp_simple[!duplicated(meta_tmp_simple[, c('label', 'Discovery_clean', 'Model1'), with=F]),]
meta_tmp_ref_simple<-meta_tmp_ref_simple[!duplicated(meta_tmp_ref_simple[, c('label', 'Discovery_clean', 'Model1'), with=F]),]
tmp_simple<-tmp_simple[!duplicated(tmp_simple[, c('label', 'Discovery_clean', 'Model1','pheno'), with=F]),]

png('~/oliverpainfel/Analyses/crosspop/plots/average_r_eur.perc_improv.png', width = 4000, height = 1500, res= 300, units = 'px')
ggplot(meta_tmp_simple[meta_tmp_simple$Target == 'EUR Target',], aes(x=label, y=R_diff_perc , fill = Model1)) +
    geom_point(
        data = tmp_simple[tmp_simple$Target == 'EUR Target',],
        mapping = aes(x=label, y=R_diff_perc, colour=Model1),
        position = position_jitterdodge(jitter.width = 0.2, dodge.width = 0.7),
        alpha = 0.3
      ) +
    geom_hline(yintercept = 0) +
      geom_errorbar(
        aes(
          ymin = R_diff_perc - R_diff_perc_SE,
          ymax = R_diff_perc + R_diff_perc_SE
        ),
        width = 0,
        position = position_dodge(width = 0.7)
      ) +    
    geom_point(
      stat = "identity",
      position = position_dodge(0.7),
      size = 3,
      shape = 23
    ) +
    geom_point(
        data = meta_tmp_ref_simple[meta_tmp_ref_simple$Target == 'EUR Target',],
        aes(x = label, y = R_diff_perc, fill = Model1),
        stat = "identity",
        position = position_dodge(0.7), # Ensure same dodge as other points
        size = 4,
        shape = 22,
        show.legend=F
    ) +
    geom_vline(xintercept = seq(1.5, length(unique(meta_tmp_simple$label))), linetype="dotted") +
    scale_y_continuous(labels = percent_format()) +
    labs(y = "Relative Improvement (SE)", fill = NULL, colour = NULL, x = NULL) +
    facet_grid(Target ~ Discovery_clean, scales='free', space = 'free_x') +
    theme_half_open() +
    background_grid(major = 'y', minor = 'y') + 
    panel_border() + 
    theme(
        axis.text.x = element_text(angle = 45, vjust = 1, hjust = 1),  # Increase x-axis labels
        legend.position = "top",
        legend.key.spacing.x = unit(2, "cm"),
        legend.justification = "center"
    )
dev.off()

########
# Plot relative improvement of LEOPARD over IndivTune of SumStatTune scores
########

# meta res
meta_res_comp_ref <- meta_res_comp[meta_res_comp$Model2 == 'Multi-SumStatTune',]
meta_res_comp_ref <- meta_res_comp_ref[meta_res_comp_ref$Method1 != 'all' & meta_res_comp_ref$Method2 != 'all',]
meta_res_comp_ref <- meta_res_comp_ref[meta_res_comp_ref$Model1 == 'SumStatTune' & meta_res_comp_ref$Source1 == 'Multi',]
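# Keep only comparisons where both models are based on the same underlying PGS method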
meta_res_comp_ref <- meta_res_comp_ref[gsub('_multi','', meta_res_comp_ref$Method1) == gsub('_multi','', meta_res_comp_ref$Method2),]

meta_res_comp_ref$R_diff_perc <- meta_res_comp_ref$R_diff / meta_res_comp_ref$Model_2_R
meta_res_comp_ref$R_diff_perc_SE <- meta_res_comp_ref$R_diff_SE / meta_res_comp_ref$Model_2_R

meta_res_comp_ref$Discovery_clean <- paste0(meta_res_comp_ref$Discovery1,' GWAS')
meta_res_comp_ref$Discovery_clean[meta_res_comp_ref$Target != 'EUR'] <- 'EUR + Target-matched GWAS'

meta_res_comp_ref <- merge(meta_res_comp_ref, pgs_method_labels, by.x = 'Method1', by.y = 'method', all.x = T)
meta_res_comp_ref$label[grepl('Multi', meta_res_comp_ref$Model1) & !(meta_res_comp_ref$Method1 %in% pgs_group_methods)] <- paste0(meta_res_comp_ref$label[grepl('Multi', meta_res_comp_ref$Model1) & !(meta_res_comp_ref$Method1 %in% pgs_group_methods)], '-multi')
meta_res_comp_ref$label <- factor(meta_res_comp_ref$label, levels = model_order)

meta_res_comp_ref$Target_clean <- paste0(meta_res_comp_ref$Target,' Target')

# trait-specific res
res_comp_all_ref <- res_comp_all[res_comp_all$Model2 == 'Multi-SumStatTune',]
res_comp_all_ref <- res_comp_all_ref[res_comp_all_ref$Method1 != 'all' & res_comp_all_ref$Method2 != 'all',]
res_comp_all_ref <- res_comp_all_ref[res_comp_all_ref$Model1 == 'SumStatTune' & res_comp_all_ref$Source1 == 'Multi',]
res_comp_all_ref <- res_comp_all_ref[gsub('_multi','', res_comp_all_ref$Method1) == gsub('_multi','', res_comp_all_ref$Method2),]

res_comp_all_ref$R_diff_perc <- res_comp_all_ref$R_diff / res_comp_all_ref$Model_2_R
res_comp_all_ref$R_diff_perc_SE <- res_comp_all_ref$R_diff_SE / res_comp_all_ref$Model_2_R

res_comp_all_ref$Discovery_clean <- paste0(res_comp_all_ref$Discovery1,' GWAS')
res_comp_all_ref$Discovery_clean[res_comp_all_ref$Target != 'EUR'] <- 'EUR + Target-matched GWAS'

res_comp_all_ref <- merge(res_comp_all_ref, pgs_method_labels, by.x = 'Method1', by.y = 'method', all.x = T)
res_comp_all_ref$label[grepl('Multi', res_comp_all_ref$Model1) & !(res_comp_all_ref$Method1 %in% pgs_group_methods)] <- paste0(res_comp_all_ref$label[grepl('Multi', res_comp_all_ref$Model1) & !(res_comp_all_ref$Method1 %in% pgs_group_methods)], '-multi')
res_comp_all_ref$label <- factor(res_comp_all_ref$label, levels = model_order)

res_comp_all_ref$Target_clean <- paste0(res_comp_all_ref$Target,' Target')

tmp_meta<-meta_res_comp_ref
tmp_all<-res_comp_all_ref

tmp_meta<-tmp_meta[!(tmp_meta$Method1 %in% c('prscsx','xwing')),]
tmp_meta<-tmp_meta[tmp_meta$Target != 'EUR',]

tmp_all<-tmp_all[!(tmp_all$Method1 %in% c('prscsx','xwing')),]
tmp_all<-tmp_all[tmp_all$Target != 'EUR',]

library(ggrepel)

# plot
png('~/oliverpainfel/Analyses/crosspop/plots/leopard_perc_improv.png', width = 1800, height = 1100, res= 300, units = 'px')

ggplot(tmp_meta, aes(x=label, y=R_diff_perc , fill = Model1)) +
    geom_hline(yintercept = 0) +
      geom_errorbar(
        aes(
          ymin = R_diff_perc - R_diff_perc_SE,
          ymax = R_diff_perc + R_diff_perc_SE
        ),
        width = 0,
        position = position_dodge(width = 0.7)
      ) +    
    geom_point(
      stat = "identity",
      position = position_dodge(0.7),
      size = 3,
      shape = 23
    ) +
    geom_vline(xintercept = seq(1.5, length(unique(tmp_meta$label))), linetype="dotted") +
    scale_y_continuous(labels = percent_format()) +
    labs(y = "Relative Difference (SE)", fill = NULL, colour = NULL, x = NULL) +
    facet_grid(. ~ Target_clean) +
    theme_half_open() +
    background_grid(major = 'y', minor = 'y') + 
    panel_border() + 
    theme(
        axis.text.x = element_text(angle = 45, vjust = 1, hjust = 1),  # Increase x-axis labels
        legend.position = 'none'
    )
dev.off()

# Now compare QuickPRS-Multi and PRS-CSx only, showing trait-specific results
tmp_meta<-meta_res_comp_ref
tmp_all<-res_comp_all_ref

tmp_meta<- tmp_meta[tmp_meta$Target != 'EUR' & tmp_meta$Method1 %in% c('quickprs_multi','prscsx'),]
tmp_all<- tmp_all[tmp_all$Target != 'EUR' & tmp_all$Method1 %in% c('quickprs_multi','prscsx'),]

library(ggrepel)

png('~/oliverpainfel/Analyses/crosspop/plots/leopard_perc_improv_restricted.png', width = 1500, height = 1500, res= 300, units = 'px')
ggplot(tmp_meta, aes(x=label, y=R_diff_perc , fill = Model1)) +
    geom_point(
        data = tmp_all,
        mapping = aes(x=label, y=R_diff_perc, colour=Model1),
        position = position_jitterdodge(jitter.width = 0.2, dodge.width = 0.7),
        alpha = 0.3
      ) +
    geom_hline(yintercept = 0) +
      geom_errorbar(
        aes(
          ymin = R_diff_perc - R_diff_perc_SE,
          ymax = R_diff_perc + R_diff_perc_SE
        ),
        width = 0,
        position = position_dodge(width = 0.7)
      ) +    
    geom_point(
      stat = "identity",
      position = position_dodge(0.7),
      size = 3,
      shape = 23
    ) +
    scale_y_continuous(labels = percent_format()) +
    labs(y = "Relative Improvement (SE)", fill = NULL, colour = NULL, x = NULL) +
    facet_grid(. ~ Target_clean) +
    theme_half_open() +
    background_grid(major = 'y', minor = 'y') + 
    panel_border() + 
    theme(
        axis.text.x = element_text(angle = 45, vjust = 1, hjust = 1),  # Increase x-axis labels
        legend.position = 'none'
    ) +
    geom_text_repel(
      data = tmp_all[
        tmp_all$R_diff_perc < -0.25,
      ],
      aes(label = pheno),  # label outlying points with the phenotype name
      position = position_dodge(width = 0.7),
      size = 3,
      min.segment.length = 0,
      segment.color = NA,
      show.legend = FALSE
    )
dev.off()

# This shows that the PRS-CSx --meta flag actually does very well, except when the AFR GWAS is very small (~2,700).
Show average improvement in AFR + EAS

Show average improvement in EUR

Show LEOPARD comparison


LEOPARD+QuickPRS

Here we will compare the LEOPARD-estimated weights for population-specific PGS to the weights estimated using observed data in the UKB target sample.
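
As a toy illustration (hypothetical coefficients, not pipeline output), the observed weight of each discovery GWAS is its final-model coefficient normalised to sum to 1 across the GWAS contributing to that target, after setting any negative coefficient to zero:

# Toy example with two discovery GWAS contributing to one target PGS
beta <- c(EUR = 0.3, AFR = 0.1)   # hypothetical final-model coefficients
beta[beta < 0] <- 0               # negative coefficients are zeroed
weights <- beta / sum(beta)       # EUR = 0.75, AFR = 0.25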

Show code
setwd('/users/k1806347/oliverpainfel/Software/MyGit/GenoPred/pipeline/')

library(data.table)
library(ggplot2)
library(cowplot)

source('../functions/misc.R')
source_all('../functions')

# Read in list of outcomes 
selected_traits<-fread('/users/k1806347/oliverpainfel/Analyses/crosspop/trait_subset.txt', header=F)$V1

# Get some key variables from config
config<-'/users/k1806347/oliverpainfel/Data/ukb/GenoPred/configs/crosspop/config.yaml'
outdir <- read_param(config = config, param = 'outdir', return_obj = F)

# Get a list of score files
scores <- list_score_files(config)

###
# Read in weights estimated by LEOPARD (QuickPRS)
###

leopard_weights<-NULL
scores_quickprs <- scores$name[scores$method == 'quickprs_multi']
for(i in selected_traits){
  scores_i <- scores_quickprs[grepl(paste0('^', i,'_'), scores_quickprs)]
  for(j in scores_i){
      weights_file <- readRDS(paste0(outdir, '/reference/pgs_score_files/leopard/', j, '/ref-', j, '.weights.rds'))
      weights_file <- data.frame(weights_file)
      
      weights <-
        data.table(
          Target = do.call(c, lapply(names(weights_file), function(x) rep(x, 2))),
          Discovery = names(weights_file),
          Weight = do.call(c, lapply(weights_file, function(x) x)),
          Trait = i,
          Method = 'LEOPARD'
        )
      
      leopard_weights <- rbind(leopard_weights, weights)
  }
}

#####
# Read in the PGS weights estimated using UKB data
#####
# Read in the final model coefficients for multi-source methods

obs_weights<-NULL
for(method_i in unique(scores$method)[!(unique(scores$method) %in% pgs_group_methods)]){
  scores_method<-scores$name[scores$method == method_i]
  method_i <- gsub('_multi','', method_i)

  for(i in selected_traits){
    for(j in c('EAS','AFR','EUR')){
      if(j == 'EUR'){
        pops <- c('EAS','AFR')
      } else {
        pops <- j
      }
      
      for(k in pops){
        model <- fread(paste0('~/oliverpainfel/Analyses/crosspop/targ_', j, '.disc_EUR_', k, '/', i, '/final_models/', method_i, '.pseudo.multi.final_model.txt'))
        model<-model[-1,]
        
        # If any coefficient is negative, set it to zero and give the remaining (positive) source full weight, mirroring LEOPARD's non-negativity constraint
        if(any(model$V2 < 0)){
          model$V2[model$V2 < 0] <- 0
          model$V2[model$V2 > 0] <- 1
        }
        
        names(model) <- c('x', 'BETA')
        model$Discovery[grepl('UKB', model$x)]<-'EUR'
        model$Discovery[grepl('BBJ', model$x)]<-'EAS'
        model$Discovery[grepl('UGR', model$x)]<-'AFR'
        model$Target <- j
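        # Normalise the coefficients so the per-GWAS weights sum to 1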
        model$Weight <- model$BETA/sum(model$BETA)
        model$Trait <- i
        model$Method <- method_i
        model<-model[,c('Target','Discovery','Weight','Method','Trait'), with=F]
        obs_weights<-rbind(obs_weights, model)
      }
    }
  }
}

####
## Estimate weights if using the inverse variance weighting (realised this doesn't make sense as PGS are standardised whereas SNP effects in PRS-CSx are not)
####
#
## Read in GWAS descriptives
#gwas_desc<-fread('/users/k1806347/oliverpainfel/Analyses/crosspop/gwas_descriptives.csv')
#gwas_desc <- gwas_desc[, c('Trait Label','Ancestry','GWAS N'), with=F]
#names(gwas_desc)<-c('trait','ancestry','n')
#gwas_desc<-gwas_desc[gwas_desc$trait %in% selected_traits,]
#
#library(dplyr)
#library(tidyr)
#
## Reshape GWAS table to wide format
#wide_gwas <- gwas_desc %>%
#  pivot_wider(names_from = ancestry, values_from = n, values_fill = 0)
#
## Function to create rows for each pair
#make_weights_long <- wide_gwas %>%
#  rowwise() %>%
#  do({
#    trait <- .$trait
#    eur <- .$EUR
#    afr <- .$AFR
#    eas <- .$EAS
#    
#    tibble(
#      Trait = trait,
#      Method = "inverse_var",
#      Target = c("AFR", "AFR", "EUR", "EUR", "EUR", "EAS", "EAS"),
#      Discovery = c("EUR", "AFR", "EUR", "AFR", "EAS", "EUR", "EAS"),
#      Weight = c(
#        eur / (eur + afr), afr / (eur + afr),  # AFR target
#        eur / (eur + afr), afr / (eur + afr),  # EUR target (vs AFR)
#        eas / (eur + eas),                     # EUR target (vs EAS)
#        eur / (eur + eas), eas / (eur + eas)   # EAS target (vs EUR)
#      )
#    )
#  }) %>%
#  bind_rows()

###
# Combine and compare
###

#both <- do.call(rbind, list(obs_weights, leopard_weights, make_weights_long))
both <- do.call(rbind, list(obs_weights, leopard_weights))

# Remove ptclump as it doesn't have a sumstattune method
both <- both[both$Method != 'ptclump',]

both<-merge(both, pgs_method_labels, by.x = 'Method', by.y = 'method', all.x=T, sort = F)
both$label[is.na(both$label)] <- both$Method[is.na(both$label)]
both$label <- factor(both$label, levels=unique(both$label))

# Plot non-EUR target first
tmp <- both[both$Target != 'EUR',]
tmp <- tmp[tmp$Discovery != 'EUR',]

# Set LEOPARD to black fill
default_colors <- hue_pal()(10)
names(default_colors) <- levels(tmp$label)
default_colors["LEOPARD"] <- "black"

# Plot the estimated and observed weights
png('~/oliverpainfel/Analyses/crosspop/plots/leopard_weights.png', units = 'px', res = 300, width = 2500, height = 1500)
ggplot(tmp, aes(x = Trait, y = Weight, fill = label)) +
  geom_bar(width= 0.7, position=position_dodge(0.7), stat="identity", colour = 'black', size = 0.1) +
  scale_fill_manual(values = default_colors) +
  facet_grid(Target ~ .) +
  theme_half_open() +
  labs(title = 'Weight of target ancestry-matched PGS', fill = NULL) +
  background_grid(major = 'y', minor = 'y') + 
  panel_border() +
  theme(axis.text.x = element_text(angle = 45, hjust = 1)) +
  ylim(c(0,1))
dev.off()

# Plot EUR target
tmp <- both[both$Target == 'EUR',]
tmp <- tmp[tmp$Discovery != 'EUR',]

# Set LEOPARD to black fill
default_colors <- hue_pal()(10)
names(default_colors) <- levels(tmp$label)
default_colors["LEOPARD"] <- "black"

# Plot the estimated and observed weights
png('~/oliverpainfel/Analyses/crosspop/plots/leopard_weights_eur.png', units = 'px', res = 300, width = 2500, height = 1500)
ggplot(tmp, aes(x = Trait, y = Weight, fill = label)) +
  geom_bar(width= 0.7, position=position_dodge(0.7), stat="identity", colour = 'black', size = 0.1) +
  scale_fill_manual(values = default_colors) +
  facet_grid(Discovery ~ .) +
  theme_half_open() +
  labs(title = 'Weight of non-EUR PGS for EUR Target', fill = NULL) +
  background_grid(major = 'y', minor = 'y') + 
  panel_border() +
  theme(axis.text.x = element_text(angle = 45, hjust = 1)) +
  ylim(c(0,1))
dev.off()

###
# Check calibration of LEOPARD compared to QuickPRS observed weights
###

tmp <- both[both$Target != 'EUR',]
tmp$Target<-NULL
tmp_wide <- reshape(tmp, 
                     idvar = c("Trait", "Discovery"), 
                     timevar = "label", 
                     direction = "wide")

names(tmp_wide) <- gsub('Weight.', '', names(tmp_wide))
tmp_wide<-tmp_wide[, !(grepl('Method', names(tmp_wide))), with = F]

tmp_wide_eas <- tmp_wide[tmp_wide$Discovery == 'EAS',]
tmp_wide_afr <- tmp_wide[tmp_wide$Discovery == 'AFR',]

# Calculate metrics
rmse_afr <- sqrt(mean((tmp_wide_afr$QuickPRS - tmp_wide_afr$LEOPARD)^2))
me_afr <- mean(tmp_wide_afr$QuickPRS - tmp_wide_afr$LEOPARD)

rmse_eas <- sqrt(mean((tmp_wide_eas$QuickPRS - tmp_wide_eas$LEOPARD)^2))
me_eas <- mean(tmp_wide_eas$QuickPRS - tmp_wide_eas$LEOPARD)

# Create annotation data.frame
metrics_df <- data.frame(
  Discovery = c("AFR", "EAS"),
  x = c(0.5, 0.5),         # Adjust positions as needed
  y = c(-0.05, -0.05),
  label = c(
    paste0("RMSE = ", round(rmse_afr, 2), "\nME = ", round(me_afr, 2)),
    paste0("RMSE = ", round(rmse_eas, 2), "\nME = ", round(me_eas, 2))
  )
)

png('~/oliverpainfel/Analyses/crosspop/plots/leopard_weights_calibration.png', units = 'px', width = 2000, height = 2000, res = 300)
ggplot(tmp_wide[tmp_wide$Discovery != 'EUR',], aes(x = LEOPARD, y = QuickPRS)) +
  geom_abline(slope = 1, intercept = 0, linetype = "dashed", colour = "grey40") +  # Perfect calibration
  geom_smooth(method = "lm", se = TRUE, colour = "blue") +  # Regression line
  geom_point(alpha = 0.7) +
  geom_text(data = metrics_df, aes(x = x, y = y, label = label), inherit.aes = FALSE, hjust = 0, size = 3.5) +
  labs(
    x = "LEOPARD weight",
    y = "Observed weight",
  ) +
  theme_half_open() +
  panel_border() + 
  theme(
    axis.text.x = element_text(angle = 90, vjust = 0.5, hjust=1),
  ) +
  facet_grid(. ~ Discovery) +
  coord_fixed()    
dev.off()

####
## Check calibration of inverse_var compared to QuickPRS observed weights (again realised this doesn't make sense)
####
#
#tmp <- both[both$Target != 'EUR',]
#tmp$Target<-NULL
#tmp_wide <- reshape(tmp, 
#                     idvar = c("Trait", "Discovery"), 
#                     timevar = "label", 
#                     direction = "wide")
#
#names(tmp_wide) <- gsub('Weight.', '', names(tmp_wide))
#tmp_wide<-tmp_wide[, !(grepl('Method', names(tmp_wide))), with = F]
#
#tmp_wide_eas <- tmp_wide[tmp_wide$Discovery == 'EAS',]
#tmp_wide_afr <- tmp_wide[tmp_wide$Discovery == 'AFR',]
#
## Calculate metrics
#rmse_afr <- sqrt(mean((tmp_wide_afr$QuickPRS - tmp_wide_afr$inverse_var)^2))
#me_afr <- mean(tmp_wide_afr$QuickPRS - tmp_wide_afr$inverse_var)
#
#rmse_eas <- sqrt(mean((tmp_wide_eas$QuickPRS - tmp_wide_eas$inverse_var)^2))
#me_eas <- mean(tmp_wide_eas$QuickPRS - tmp_wide_eas$inverse_var)
#
## Create annotation data.frame
#metrics_df <- data.frame(
#  Discovery = c("AFR", "EAS"),
#  x = c(0.5, 0.5),         # Adjust positions as needed
#  y = c(-0.05, -0.05),
#  label = c(
#    paste0("RMSE = ", round(rmse_afr, 2), "\nME = ", round(me_afr, 2)),
#    paste0("RMSE = ", round(rmse_eas, 2), "\nME = ", round(me_eas, 2))
#  )
#)
#
#png('~/oliverpainfel/Analyses/crosspop/plots/inverse_var_weights_calibration.png', units = 'px', width = 2000, height = 2000, res = 300)
#ggplot(tmp_wide[tmp_wide$Discovery != 'EUR',], aes(x = inverse_var, y = QuickPRS)) +
#  geom_abline(slope = 1, intercept = 0, linetype = "dashed", colour = "grey40") +  # Perfect calibration
#  geom_smooth(method = "lm", se = TRUE, colour = "blue") +  # Regression line
#  geom_point(alpha = 0.7) +
#  geom_text(data = metrics_df, aes(x = x, y = y, label = label), inherit.aes = FALSE, hjust = 1.5, size = 3.5) +
#  labs(
#    x = "inverse_var weight",
#    y = "Observed weight",
#  ) +
#  theme_half_open() +
#  panel_border() + 
#  theme(
#    axis.text.x = element_text(angle = 90, vjust = 0.5, hjust=1),
#  ) +
#  facet_grid(. ~ Discovery) +
#  coord_fixed()    
#dev.off()

###
# Check calibration of observed weights across all methods
###

tmp <- both[both$Target != 'EUR',]
tmp <- tmp[!(tmp$label %in% c('LEOPARD','inverse_var')),]
tmp$Target<-NULL
tmp_wide <- reshape(tmp, 
                     idvar = c("Trait", "Discovery"), 
                     timevar = "label", 
                     direction = "wide")

names(tmp_wide) <- gsub('Weight.', '', names(tmp_wide))
tmp_wide<-tmp_wide[, !(grepl('Method', names(tmp_wide))), with = F]

tmp_wide <- tmp_wide[tmp_wide$Discovery %in% c('EAS','AFR'),]

metrics <- NULL
for(i in c('EAS','AFR')){
  for(j in unique(tmp$label)){
    for(k in unique(tmp$label)){
      tmp_wide_i <- tmp_wide[tmp_wide$Discovery == i,]
      rmse <- sqrt(mean((tmp_wide_i[[j]] - tmp_wide_i[[k]])^2))
      me <- mean(tmp_wide_i[[j]] - tmp_wide_i[[k]])
      
      metrics <- rbind(
        metrics,
        data.frame(
          Population = i,
          Method1 = j,
          Method2 = k,
          rmse = rmse,
          me = me
        )
      )
    }
  }
}

png('~/oliverpainfel/Analyses/crosspop/plots/observed_weights_calibration.png', units = 'px', width = 3000, height = 1650, res = 300)
ggplot(metrics, aes(x = Method1, y = Method2, fill = rmse)) +
  geom_tile(color = "white") +  # Tile plot with white borders
  geom_text(aes(label = round(rmse, 2)), color = "black") +  # Add correlation values
  scale_fill_gradient2(mid = "white", high = "red", midpoint = 0) +  # Color scale
  theme_half_open() +
  panel_border() + 
  theme(
    axis.text.x = element_text(angle = 45, hjust = 1),
    axis.title = element_blank()
  ) +
  facet_grid(. ~ Population) +
  labs(fill = "RMSE")
dev.off()

# Calculate average RMSE for each method against all other methods
metrics_unique <- metrics[metrics$Method1 != metrics$Method2, ]
metrics_unique$Comparison <- NA
for (i in 1:nrow(metrics_unique)) {
  metrics_unique$Comparison[i] <-
    paste0(sort(c(
      metrics_unique$Method1[i], metrics_unique$Method2[i]
    )), collapse = ' vs. ')
}
metrics_unique <- metrics_unique[!duplicated(paste0(metrics_unique$Population, metrics_unique$Comparison)),]

mean_rmse <- NULL
for(i in unique(tmp$label)){
  for(j in c('AFR','EAS')){
    metrics_unique_tmp <- metrics_unique[metrics_unique$Method1 == i | metrics_unique$Method2 == i,]
    metrics_unique_tmp <- metrics_unique_tmp[metrics_unique_tmp$Population == j,]
    mean_rmse <- rbind(
      mean_rmse, 
      data.frame(
        Method = i,
        Population = j,
        avg_rmse = mean(metrics_unique_tmp$rmse)
      )
    )
  }
}

png('~/oliverpainfel/Analyses/crosspop/plots/avg_observed_weight_rmse.png', units = 'px', width = 1500, height = 1500, res = 300)
ggplot(mean_rmse, aes(x = Method, y = avg_rmse, fill = Method)) +
  geom_bar(width= 0.7, position=position_dodge(0.7), stat="identity", size = 0.1) +
  geom_text(aes(label = round(avg_rmse, 3)),  # add the RMSE value as a text label
          vjust = 1.5,                        # place the text just inside the top of the bar
          size = 3) +                         # text size
  scale_fill_manual(values = default_colors) +
  facet_grid(Population ~ .) +
  theme_half_open() +
  background_grid(major = 'y', minor = 'y') + 
  panel_border() +
  labs(y = 'Average RMSE') +
  theme(axis.text.x = element_text(angle = 45, hjust = 1),
        legend.position="none")
dev.off()

####
## Check calibration of estimated (LEOPARD and inverse_var) weights compared to observed QuickPRS weights
####
#
#tmp <- both[both$Target != 'EUR',]
#tmp <- tmp[(tmp$label %in% c('LEOPARD','inverse_var','QuickPRS')),]
#tmp$Target<-NULL
#tmp_wide <- reshape(tmp, 
#                     idvar = c("Trait", "Discovery"), 
#                     timevar = "label", 
#                     direction = "wide")
#
#names(tmp_wide) <- gsub('Weight.', '', names(tmp_wide))
#tmp_wide<-tmp_wide[, !(grepl('Method', names(tmp_wide))), with = F]
#
#tmp_wide <- tmp_wide[tmp_wide$Discovery %in% c('EAS','AFR'),]
#
#metrics <- NULL
#for(i in c('EAS','AFR')){
#  for(j in unique(tmp$label)){
#    for(k in unique(tmp$label)){
#      tmp_wide_i <- tmp_wide[tmp_wide$Discovery == i,]
#      rmse <- sqrt(mean((tmp_wide_i[[j]] - tmp_wide_i[[k]])^2))
#      me <- mean(tmp_wide_i[[j]] - tmp_wide_i[[k]])
#      
#      metrics <- rbind(
#        metrics,
#        data.frame(
#          Population = i,
#          Method1 = j,
#          Method2 = k,
#          rmse = rmse,
#          me = me
#        )
#      )
#    }
#  }
#}
#
## Plot the rmse for LEOPARD and inverse_var predicting observed QuickPRS weight
#metrics <- metrics[metrics$Method1 == 'QuickPRS',]
#metrics <- metrics[metrics$Method2 != 'QuickPRS',]
#
#png('~/oliverpainfel/Analyses/crosspop/plots/inverse_var_comp_rmse.png', units = 'px', width = 800, height = 1500, res = 300)
#ggplot(metrics, aes(x = Method2, y = rmse, fill = Method2)) +
#  geom_bar(width= 0.7, position=position_dodge(0.7), stat="identity", size = 0.1) +
#  geom_text(aes(label = round(rmse, 3)),  # <-- Add this
#          vjust = 1.5,                    # <-- Move the text slightly above the bar
#          size = 3) +                      # <-- Adjust text size
#  facet_grid(Population ~ .) +
#  theme_half_open() +
#  background_grid(major = 'y', minor = 'y') + 
#  panel_border() +
#  labs(y = 'RMSE relative to QuickPRS', x = 'Method') +
#  theme(axis.text.x = element_text(angle = 45, hjust = 1),
#        legend.position="none")
#dev.off()
Show observed and LEOPARD PGS weights


Computational resources

Here we will read in the benchmark data for PGS methods and create a table for the manuscript.

Show code
library(data.table)
library(ggplot2)
library(cowplot)

setwd('~/oliverpainfel/Software/MyGit/GenoPred/pipeline/')
source('../functions/misc.R')
source_all('../functions')

# Get some key variables from config
config<-'/users/k1806347/oliverpainfel/Data/ukb/GenoPred/configs/crosspop/config.yaml'
pgs_methods <- read_param(config = config, param = 'pgs_methods', return_obj = F)
outdir <- read_param(config = config, param = 'outdir', return_obj = F)

# Read in configuration specific benchmark files
bm_files_i <- list.files(paste0(outdir, '/reference/benchmarks/'), full.names = T)

# Subset benchmarks for pgs_methods
bm_files_i <- bm_files_i[grepl('prep_pgs_|leopard_quickprs_', bm_files_i)]

# Subset to benchmarks for gwas/gwas_groups in config
scores <- list_score_files(config)
bm_files_i <- bm_files_i[grepl(paste0('-', unique(scores$name),'.txt', collapse = '|'), bm_files_i)]

# Read in benchmark files
bm_dat_all <- do.call(rbind, lapply(bm_files_i, function(file) {
  tmp <- fread(file)
  tmp$file <- basename(file)
  return(tmp)
}))

# Create rule column
bm_dat_all$rule <- gsub('-.*','',bm_dat_all$file)

# Create method column
bm_dat_all$method <-
  gsub('_i', '', gsub('prep_pgs_', '', bm_dat_all$rule))

bm_dat_all <- merge(bm_dat_all, pgs_method_labels, by = 'method', all.x=T)

bm_dat_all$label[bm_dat_all$method == 'leopard_quickprs']<-"LEOPARD (QuickPRS)"

#############
# Time
#############

# Calculate average time taken for each method
method_avg <- NULL
for(i in unique(bm_dat_all$label)){
  method_avg <- rbind(
    method_avg,
    data.frame(
      method = bm_dat_all$method[bm_dat_all$label == i][1],
      Method = i,
      Time = mean(bm_dat_all$s[bm_dat_all$label == i])
    )
  )
}

# Multiply X-Wing time by two, since it used 20 cores whereas other methods used 10
method_avg$Time[method_avg$method == 'xwing'] <- method_avg$Time[method_avg$method == 'xwing'] * 2

# Divide the time for the multi-source methods (PRS-CSx, X-Wing, and LEOPARD) by 2 so it reflects the time per GWAS
method_avg$Time[method_avg$method %in% c('prscsx','xwing','leopard_quickprs')] <- method_avg$Time[ method_avg$method %in% c('prscsx','xwing','leopard_quickprs')] / 2

# Approximate times for either tuning or grid only
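# (The scaling factors below are rough approximations: the auto-only model is
# assumed to take ~1/5 of a full PRS-CS/PRS-CSx run, and the grid-only model
# ~2/10 of a full X-Wing run.)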
method_avg$Model <- 'Full'

tmp <- method_avg[method_avg$method == 'prscs' & method_avg$Model == 'Full',] 
tmp$Model <- 'auto'
tmp$Time <- tmp$Time * (1/5)
method_avg<-rbind(method_avg, tmp)

tmp <- method_avg[method_avg$method == 'prscsx' & method_avg$Model == 'Full',] 
tmp$Model <- 'auto'
tmp$Time <- tmp$Time * (1/5)
method_avg<-rbind(method_avg, tmp)

tmp <- method_avg[method_avg$method == 'xwing' & method_avg$Model == 'Full',] 
tmp$Model <- 'grid'
tmp$Time <- tmp$Time * (2/10)
method_avg<-rbind(method_avg, tmp)

# Format the time taken nicely
method_avg$Time_clean[method_avg$Time < 60] <-
  paste0(round(method_avg$Time[method_avg$Time < 60], 1), ' sec')
method_avg$Time_clean[method_avg$Time > 60] <-
  paste0(round(method_avg$Time[method_avg$Time > 60] / 60, 1), ' min')
method_avg$Time_clean[method_avg$Time > 3600] <-
  paste0(round(method_avg$Time[method_avg$Time > 3600] / 60 / 60, 1), ' hr')

# Convert time in seconds to hours
method_avg$Time_hour <- method_avg$Time / 60/60

# Separate methods into single-source and multi-source
method_avg$Type[!(method_avg$method %in% pgs_group_methods)]<-'Single-source'
method_avg$Type[method_avg$method %in% pgs_group_methods]<-'Multi-source'
method_avg$Type[method_avg$method == 'leopard_quickprs']<-'Tuning'

method_avg$Type<-factor(method_avg$Type, levels = c('Single-source','Multi-source','Tuning'))
method_avg$Method <- factor(method_avg$Method, levels = c("DBSLMM", "lassosum", "LDpred2", "MegaPRS", "PRS-CS", "pT+clump", "QuickPRS", "SBayesRC", "QuickPRS-Multi", "PRS-CSx", "X-Wing","LEOPARD (QuickPRS)"))

ggplot(method_avg, aes(x = Method, y = Time_hour, fill = Model)) +
  geom_bar(stat = "identity", position="dodge") +
  geom_text(aes(label = Time_clean), vjust = 0.5, angle = 90, hjust = -0.2, position = position_dodge(width = 0.9)) +
  labs(x = NULL, y = "Time (hours)") +
  ylim(0, max(method_avg$Time_hour) + (max(method_avg$Time_hour)/5)) +
  facet_grid(~ Type, scales='free', space = 'free_x') +
  theme_half_open() +
  background_grid(major = 'y', minor = 'y') + 
  panel_border() + 
  theme(axis.text.x = element_text(angle = 45, hjust = 1))

method_avg <- method_avg[method_avg$Model == 'Full',]
method_avg <- method_avg[, c('Method','Time_hour')]
method_avg$Time_hour <- round(method_avg$Time_hour, 2)
names(method_avg)<-c('Method',"Time (hrs)")

#############
# Memory
#############

# Calculate average max_rss for each method
method_avg_mem <- NULL
for(i in unique(bm_dat_all$label)){
  method_avg_mem <- rbind(
    method_avg_mem,
    data.frame(
      method = bm_dat_all$method[bm_dat_all$label == i][1],
      Method = i,
      Memory = mean(bm_dat_all$max_rss[bm_dat_all$label == i])
    )
  )
}

# Divide X-Wing memory by two, since it used 20 cores, but other methods used 10
method_avg_mem$Memory[method_avg_mem$method == 'xwing'] <- method_avg_mem$Memory[method_avg_mem$method == 'xwing'] / 2

# Format the Memory nicely
method_avg_mem$Memory_clean <-
  paste0(round(method_avg_mem$Memory/1000, 2), ' Gb')

ggplot(method_avg_mem, aes(x = Method, y = Memory, fill = Method)) +
  geom_bar(stat = "identity", position="dodge") +
  geom_text(aes(label = Memory_clean), vjust = -0.5, position = position_dodge(width = 0.9)) +
  labs(x = "PGS Method", y = "Memory (Mb)") +
  theme_half_open() +
  background_grid() +
  theme(axis.text.x = element_text(angle = 45, hjust = 1), legend.position="none")

method_avg_mem$Memory_gb <- method_avg_mem$Memory/1000
method_avg_mem <- method_avg_mem[, c('Method','Memory_gb')]
method_avg_mem$Memory_gb <- round(method_avg_mem$Memory_gb, 2)
names(method_avg_mem)<-c('Method',"Memory (Gb)")

method_avg<-merge(method_avg, method_avg_mem, by = 'Method')

write.csv(method_avg, '~/oliverpainfel/Analyses/crosspop/time_memory.csv', row.names=F)
Show computational resources table
Method               Time (hrs)   Memory (Gb)
DBSLMM                     0.15          1.15
lassosum                   0.08          7.30
LDpred2                    0.38         20.54
LEOPARD (QuickPRS)         0.23          8.03
MegaPRS                    0.72         11.92
PRS-CS                     4.40         10.57
PRS-CSx                    6.84         15.18
pT+clump                   0.02          0.83
QuickPRS                   0.06          4.41
SBayesRC                   0.38          4.40
X-Wing                    34.12         48.70

TL-PRS

Run TL-PRS using only the AFR and EAS subsets of UKB to reduce runtime. These target populations are the main interest when applying TL-PRS anyway.


Subset AFR and EAS individuals in UKB data

To make this quicker, focus on evaluating the PGS methods in the AFR and EAS target individuals in UKB. This will avoid reprocessing the full UKB data.

Show code
library(data.table)

keep <- NULL
for (i in c('AFR', 'EAS')) {
  keep <- rbind(keep, fread(
    paste0(
      '~/oliverpainfel/Data/ukb/GenoPred/output/ukb/ancestry/keep_files/model_based/',
      i,
      '.keep'
    )
  ))
}

write.table(
  keep,
  '~/oliverpainfel/Data/ukb/afr_eas.keep',
  row.names = F,
  col.names = F,
  quote = F
)
mkdir ~/oliverpainfel/Data/ukb/afr_eas_subset

for chr in $(seq 1 22); do
  ~/oliverpainfel/Software/plink2 \
    --pfile ~/oliverpainfel/Data/ukb/GenoPred/output/ukb/geno/ukb.ref.chr${chr} \
    --keep ~/oliverpainfel/Data/ukb/afr_eas.keep \
    --make-pgen \
    --out ~/oliverpainfel/Data/ukb/afr_eas_subset/ukb.chr${chr}
done

PGS calculation

Run TL-PRS using the PGS methods listed under the tlprs_methods setting in the configuration below (QuickPRS, DBSLMM, LDpred2, SBayesRC).

Show code

Prepare configuration

library(data.table)

dir.create('/users/k1806347/oliverpainfel/Data/ukb/GenoPred/configs/tlprs')

######
# target_list
######
target_list <- data.frame(
  name='ukb',
  path='/users/k1806347/oliverpainfel/Data/ukb/afr_eas_subset/ukb',
  type='plink2',
  indiv_report=F,
  unrel='/users/k1806347/oliverpainfel/Data/ukb/phenotypes/unrelated.row_number.txt'
)

write.table(target_list, '/users/k1806347/oliverpainfel/Data/ukb/GenoPred/configs/eas_afr_only/target_list.txt', col.names=T, row.names=F, quote=F)

######
# config
######

config<-c(
  "outdir: /users/k1806347/oliverpainfel/Data/ukb/GenoPred/output_tlprs",
  "config_file: /users/k1806347/oliverpainfel/Data/ukb/GenoPred/configs/tlprs/config.yaml",
  "gwas_list: /users/k1806347/oliverpainfel/Data/ukb/GenoPred/configs/crosspop/gwas_list.txt",
  "target_list: /users/k1806347/oliverpainfel/Data/ukb/GenoPred/configs/eas_afr_only/target_list.txt",
  "gwas_groups: /users/k1806347/oliverpainfel/Data/ukb/GenoPred/configs/crosspop/gwas_groups.txt",
  "pgs_methods: ['quickprs','dbslmm','ldpred2','sbayesrc']",
  "tlprs_methods: ['quickprs','dbslmm','ldpred2','sbayesrc']",
  "cores_prep_pgs: 10",
  "cores_target_pgs: 50",
  "prscs_phi: ['auto']",
  "ldpred2_model: ['auto']",
  "ldpred2_inference: F",
  "dbslmm_h2f: ['1']",
  "ldpred2_ldref: /users/k1806347/oliverpainfel/Data/hgdp_1kg/ldpred2/hm3",
  "quickprs_ldref: /users/k1806347/oliverpainfel/Data/hgdp_1kg/quickprs/hm3",
  "sbayesrc_ldref: /users/k1806347/oliverpainfel/Data/hgdp_1kg/sbayesrc/hm3"
)

write.table(config, '/users/k1806347/oliverpainfel/Data/ukb/GenoPred/configs/tlprs/config.yaml', col.names = F, row.names = F, quote = F)
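
As an optional sanity check (not part of the original workflow), the parameters written above can be read back with the pipeline's read_param helper, which is used elsewhere in this document. This is a minimal sketch and assumes the GenoPred functions are sourced as in the later evaluation steps.

setwd('~/oliverpainfel/Software/MyGit/GenoPred/pipeline/')
source('../functions/misc.R')
source_all('../functions')

config <- '/users/k1806347/oliverpainfel/Data/ukb/GenoPred/configs/tlprs/config.yaml'

# Check the requested PGS methods and output directory are read back correctly
read_param(config = config, param = 'pgs_methods', return_obj = F)
read_param(config = config, param = 'outdir', return_obj = F)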

Run pipeline

snakemake \
  --profile slurm \
  --use-conda \
  --configfile=/users/k1806347/oliverpainfel/Data/ukb/GenoPred/configs/tlprs/config.yaml \
  target_pgs -n
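# Note: the -n flag performs a dry run; remove it to submit the jobs via the slurm profile.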

PGS evaluation

Show code

Create predictor list

setwd('~/oliverpainfel/Software/MyGit/GenoPred/pipeline/')
source('../functions/misc.R')
source_all('../functions')
library(data.table)

# Get some key variables from config
config<-'/users/k1806347/oliverpainfel/Data/ukb/GenoPred/configs/tlprs/config.yaml'
outdir <- read_param(config = config, param = 'outdir', return_obj = F)

# Read in list of outcomes 
selected_traits<-fread('/users/k1806347/oliverpainfel/Analyses/crosspop/trait_subset.txt', header=F)$V1

# Get a list of score files
scores <- list_score_files(config)

# Create files for EAS and AFR targets
targ_pop <- c('EAS','AFR')
for(trait_i in selected_traits){
  scores_i <- scores[grepl(trait_i, scores$name),]

  for(targ_pop_i in targ_pop){
    # Subset GWAS based on EUR and/or targ_pop_i
    if(targ_pop_i == 'EAS'){
      disc_pop <- 'BBJ'
    }
    if(targ_pop_i == 'AFR'){
      disc_pop <- 'UGR'
    }
    if(targ_pop_i == 'EUR'){
      disc_pop <- c('BBJ','UGR')
    }
    
    for(disc_pop_j in disc_pop){
      if(disc_pop_j == 'BBJ'){
        disc_pop_j_2 <- 'EAS'
      }
      if(disc_pop_j == 'UGR'){
        disc_pop_j_2 <- 'AFR'
      }

      dir.create(
        paste0(
          '/users/k1806347/oliverpainfel/Analyses/crosspop/tlprs/targ_',
          targ_pop_i,
          '.disc_EUR_',
          disc_pop_j_2,
          '/',
          trait_i
        ),
        recursive = T
      )
      
      scores_i_j <- scores_i[
        (grepl('UKB$', scores_i$name, ignore.case = F) | 
         grepl(paste0(disc_pop_j, '$'), scores_i$name, ignore.case = T)),]

      scores_i_j$predictor <- paste0(
        outdir,
        '/ukb/pgs/TRANS/',
        scores_i_j$method,
        '/',
        scores_i_j$name,
        '/ukb-',
        scores_i_j$name,
        '-TRANS.profiles'
      )
      
      #####
      # List single-source PGS
      #####
      # These are the pseudo-validation scores (as per the config)
      scores_i_j_single <- scores_i_j[!grepl('tlprs', scores_i_j$method),]
      
      scores_i_j_single$top1[grepl('UKB', scores_i_j_single$name, ignore.case = F)] <- 'EUR'
      scores_i_j_single$top1[grepl(disc_pop_j, scores_i_j_single$name, ignore.case = F)] <- disc_pop_j_2
      scores_i_j_single$multi <- paste0(scores_i_j_single$method,'.pseudo')
      
      #####
      # List tlprs scores (split by target population)
      #####
      scores_i_j_tlprs <- scores_i_j[grepl('tlprs', scores_i_j$method),]
      scores_i_j_tlprs$multi <- scores_i_j_tlprs$method
      
      scores_i_j_tlprs_pop<-NULL
      for(i in 1:nrow(scores_i_j_tlprs)){
        score_header<-fread(scores_i_j_tlprs$predictor[i], nrow = 1)
        
        for(pop in c('EUR', disc_pop_j_2)){
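          # For each population, select FID, IID and that population's score
          # columns, and use cut to write them to a reduced .targ_<pop>.profiles file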
          score_cols <- which(grepl(paste0('^FID$|^IID$|_targ_', pop, '_'), names(score_header)))

          system(
            paste0(
              "cut -d' ' -f ", 
              paste0(score_cols, collapse=','),
              " ", 
              scores_i_j_tlprs$predictor[i], 
              " > ", 
              gsub('.profiles',
                   paste0('.targ_', pop, '.profiles'),
                   scores_i_j_tlprs$predictor[i])
            )
          )
          
          tmp <- scores_i_j_tlprs[i,]
          tmp$multi <- paste0(tmp$multi, '.pop')
          tmp$top1 <- pop
          tmp$predictor <-
              gsub('.profiles',
                   paste0('.targ_', pop, '.profiles'),
                   scores_i_j_tlprs$predictor[i])
          
          scores_i_j_tlprs_pop <- rbind(scores_i_j_tlprs_pop, tmp)
        }
      }

      predictors_i<- do.call(rbind, list(
        scores_i_j_single, scores_i_j_tlprs_pop
      ))
      
      predictors_i <- predictors_i[, c('predictor', 'top1','multi'), with=F]
      
      write.table(
        predictors_i,
        paste0(
          '/users/k1806347/oliverpainfel/Analyses/crosspop/tlprs/targ_',
          targ_pop_i,
          '.disc_EUR_',
          disc_pop_j_2,
          '/',
          trait_i,
          '/predictor_list.tlprs.txt'
        ),
        col.names = T,
        row.names = F,
        quote = F
      )
    }
  }
}

Run model_builder

cd /users/k1806347/oliverpainfel/Software/MyGit/GenoPred/pipeline
conda activate model_builder

#rm /users/k1806347/oliverpainfel/Analyses/crosspop/tlprs/targ_*.disc_EUR_*/*/res*

for targ_pop in $(echo EAS AFR); do
  if [ "$targ_pop" == "EUR" ]; then
      targ_pop2="EUR_test"
  else
      targ_pop2=$targ_pop
  fi
  
  if [ "$targ_pop" == "EUR" ]; then
    disc_pop=$(echo AFR EAS)
  fi
  
  if [ "$targ_pop" == "EAS" ]; then
    disc_pop="EAS"
  fi
  
  if [ "$targ_pop" == "AFR" ]; then
    disc_pop="AFR"
  fi
  
  for disc_pop_i in ${disc_pop}; do
    for pheno in $(cat /users/k1806347/oliverpainfel/Analyses/crosspop/trait_subset.txt); do
      if [ ! -f "/users/k1806347/oliverpainfel/Analyses/crosspop/tlprs/targ_${targ_pop}.disc_EUR_${disc_pop_i}/${pheno}/res.tlprs.pred_comp.txt" ]; then
        sbatch --mem 10G -n 5 -p neurohack_cpu,interruptible_cpu -t 1:00:00 --wrap="Rscript ../Scripts/model_builder/model_builder_top1.R \
          --outcome /users/k1806347/oliverpainfel/Data/ukb/phenotypes/prscsx/${pheno}.unrel.${targ_pop2}.row_number.txt \
          --predictors /users/k1806347/oliverpainfel/Analyses/crosspop/tlprs/targ_${targ_pop}.disc_EUR_${disc_pop_i}/${pheno}/predictor_list.tlprs.txt \
          --out /users/k1806347/oliverpainfel/Analyses/crosspop/tlprs/targ_${targ_pop}.disc_EUR_${disc_pop_i}/${pheno}/res.tlprs \
          --n_core 5"
      fi
    done
  done
done

Plot results

setwd('/users/k1806347/oliverpainfel/Software/MyGit/GenoPred/pipeline/')

library(data.table)
library(ggplot2)
library(cowplot)

source('../functions/misc.R')
source_all('../functions')

# Read in list of outcomes 
selected_traits<-fread('/users/k1806347/oliverpainfel/Analyses/crosspop/trait_subset.txt', header=F)$V1
info_all <- fread('~/oliverpainfel/Analyses/crosspop/gwas_descriptives.csv')

# Calculate correlation between all phenotypes in each target population
cors <- list()
for(pop_i in c('EUR','EAS','AFR','CSA','AMR')){
  if(pop_i == 'EUR'){
    pop_i_2 <- 'EUR_test'
  } else {
    pop_i_2 <- pop_i
  }
  pheno_pop_i <- list()
  for(pheno_i in selected_traits){
    pheno_pop_i[[pheno_i]] <- fread(paste0('/users/k1806347/oliverpainfel/Data/ukb/phenotypes/prscsx/', pheno_i, '.unrel.', pop_i_2, '.row_number.txt'))
    names(pheno_pop_i[[pheno_i]])[3] <- pheno_i
  }
  
  pheno_pop_i_merged <- Reduce(function(x, y) merge(x, y, all = TRUE, by = c('FID','IID')), pheno_pop_i)

  cors_i <- abs(cor(as.matrix(pheno_pop_i_merged[,-1:-2, with=F]), use='p'))
  cors[[pop_i]] <- cors_i
}

# Read in results
targ_pop = c('EAS','AFR')
res_eval <- list()
for(pheno_i in selected_traits){
  res_eval_i<-NULL
  for(targ_pop_i in targ_pop){
    if(targ_pop_i == 'EAS'){
      disc_pop <- 'EAS'
    }
    if(targ_pop_i == 'AFR'){
      disc_pop <- 'AFR'
    }
    if(targ_pop_i == 'EUR'){
      disc_pop <- c('EAS','AFR')
    }
    for(disc_pop_i in disc_pop){
      eval_i <-
        fread(
          paste0(
            '/users/k1806347/oliverpainfel/Analyses/crosspop/tlprs/',
            'targ_',
            targ_pop_i,
            '.disc_EUR_',
            disc_pop_i,
            '/',
            pheno_i,
            '/res.tlprs.pred_eval.txt'
          )
        )
      eval_i$Target<-targ_pop_i
      eval_i$gwas_group<-paste0('EUR+', disc_pop_i)
      res_eval_i<-rbind(res_eval_i, eval_i)
    }
  }
  
  res_eval_i$Method<-sub('\\..*','',res_eval_i$Group)
  res_eval_i$Method<-gsub('-.*','', res_eval_i$Method)
  
  res_eval_i$Model[grepl('top1$', res_eval_i$Group) &
                   !grepl('pseudo', res_eval_i$Group)]<-'IndivTune'
  res_eval_i$Model[grepl('top1$', res_eval_i$Group) &
                   grepl('pseudo', res_eval_i$Group)]<-'SumStatTune'
  res_eval_i$Model[grepl('multi$', res_eval_i$Group) &
                   !grepl('pseudo', res_eval_i$Group)]<-'Multi-IndivTune'
  res_eval_i$Model[grepl('multi$', res_eval_i$Group) &
                   grepl('pseudo', res_eval_i$Group)]<-'Multi-SumStatTune'
  
  res_eval_i$Model[grepl('_multi', res_eval_i$Group)]<-'SumStatTune'
  res_eval_i$Model[res_eval_i$Group == 'prscsx.pseudo.multi']<-'SumStatTune'
  res_eval_i$Model[res_eval_i$Group == 'xwing.pseudo.multi']<-'SumStatTune'
  
  res_eval_i$Source<-ifelse(
    res_eval_i$Method %in% pgs_group_methods | grepl('_multi$', res_eval_i$Method) | 
    !grepl('EUR|EAS|AFR', res_eval_i$Group), 'Multi', 'Single')
  
  res_eval_i$Discovery[grepl('EUR', res_eval_i$Group)] <- 'EUR'
  res_eval_i$Discovery[grepl('EAS', res_eval_i$Group)] <- 'EAS'
  res_eval_i$Discovery[grepl('AFR', res_eval_i$Group)] <- 'AFR'
  res_eval_i$Discovery[res_eval_i$Source == 'Multi'] <- res_eval_i$gwas_group[res_eval_i$Source == 'Multi']
  
  res_eval_i$Method<-factor(res_eval_i$Method, levels=unique(res_eval_i$Method))
  res_eval_i$Model<-factor(res_eval_i$Model, levels=c('IndivTune','SumStatTune','Multi-IndivTune','Multi-SumStatTune'))
  res_eval_i$Discovery<-factor(res_eval_i$Discovery, levels=c('AFR','EAS','EUR','EUR+AFR','EUR+EAS'))

  # Remove IndivTune and Multi-IndivTune model for groups that contain one score (aka QuickPRS and SBayesRC)
  res_eval_i <- res_eval_i[
    !(res_eval_i$Method %in%  c('quickprs','sbayesrc') & 
      res_eval_i$Model %in% c('IndivTune','Multi-IndivTune')),]
  
  # Remove pseudo model for methods that don't really have one 
  res_eval_i <- res_eval_i[
    !(res_eval_i$Method %in%  c('ptclump','ptclump_multi') & 
      res_eval_i$Model %in% c('SumStatTune','Multi-SumStatTune')),]

  # Remove top1 models for *-Multi, PRS-CSx, X-wing, TL-*
  res_eval_i <- res_eval_i[
    !((res_eval_i$Method %in%  c('prscsx', 'xwing') | grepl('_multi$', res_eval_i$Method)) & 
      grepl('top1', res_eval_i$Group)),]
  
  # Remove any duplicate models
  res_eval_i <- res_eval_i[!duplicated(res_eval_i[, c(
    "Target", "Method", "Model", "Source", "Discovery","gwas_group"
  )]),]
  
  res_eval[[pheno_i]]<-res_eval_i
  
}

# Create vector defining the order of methods in plots
model_order <- c("DBSLMM", "lassosum", "LDpred2", "MegaPRS", "PRS-CS", "pT+clump", "QuickPRS", "SBayesRC", "DBSLMM-multi", "lassosum-multi", "LDpred2-multi", "MegaPRS-multi", "PRS-CS-multi", "pT+clump-multi", "QuickPRS-multi", "SBayesRC-multi","TL-DBSLMM","TL-LDpred2","TL-QuickPRS","TL-SBayesRC", "PRS-CSx", "X-Wing", "All") 

####
# Average results across phenotypes
####

library(MAd)

# Average R across phenotypes
meta_res_eval <- NULL
for(targ_pop_i in targ_pop){
  if(targ_pop_i == 'EAS'){
    disc_pop <- 'EAS'
  }
  if(targ_pop_i == 'AFR'){
    disc_pop <- 'AFR'
  }
  if(targ_pop_i == 'EUR'){
    disc_pop <- c('EAS','AFR')
  }
  for(disc_pop_i in disc_pop){
  
    # Subset res_eval for each scenario
    res_eval_i <- do.call(rbind, lapply(seq_along(res_eval), function(i) {
      x <- res_eval[[i]]
      x$pheno <- names(res_eval)[i]
      x <- x[x$Target == targ_pop_i]
      x <- x[x$gwas_group == paste0('EUR+', disc_pop_i)]
    }))
    
    # Average results for each test across phenotypes
    # Use MAd to account for correlation between them
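    # agg() from the MAd package combines the per-phenotype R estimates into a
    # single estimate, using the phenotype correlation matrix (cors_i) to account
    # for dependence between effect sizes ("BHHR" refers to the Borenstein,
    # Hedges, Higgins & Rothstein approach for combining correlated effects)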
    res_eval_i$Sample<-'A'
  
    for(group_i in unique(res_eval_i$Group)){
      res_eval_group_i <- res_eval_i[res_eval_i$Group == group_i,]
      missing_pheno <-
        colnames(cors[[targ_pop_i]])[!(colnames(cors[[targ_pop_i]]) %in% unique(res_eval_group_i$pheno))]
      
      if (!all(colnames(cors[[targ_pop_i]]) %in% unique(res_eval_group_i$pheno))) {
        print(paste0(
          'Results missing for ',
          targ_pop_i,
          ' ',
          group_i,
          ' ',
          paste0(missing_pheno, collapse = ' ')
        ))
      }
      
      cors_i <- cors[[targ_pop_i]][unique(res_eval_group_i$pheno), unique(res_eval_group_i$pheno)]
      
      meta_res_eval_i <-
        agg(
          id = Sample,
          es = R,
          var = SE ^ 2,
          cor = cors_i,
          method = "BHHR",
          mod = NULL,
          data = res_eval_group_i
        )
      
      tmp <- data.table(Group = group_i,
                        Method = res_eval_group_i$Method[1],
                        Model = res_eval_group_i$Model[1],
                        Source = res_eval_group_i$Source[1],
                        Discovery = res_eval_group_i$Discovery[1],
                        gwas_group = res_eval_group_i$gwas_group[1],
                        Target = targ_pop_i,
                        R = meta_res_eval_i$es,
                        SE = sqrt(meta_res_eval_i$var))
      
      meta_res_eval <- rbind(meta_res_eval, tmp)
    }
  }
}

meta_res_eval$Model<-factor(meta_res_eval$Model, levels=c('IndivTune','SumStatTune','Multi-IndivTune','Multi-SumStatTune'))
meta_res_eval$Discovery<-factor(meta_res_eval$Discovery, levels=c('AFR','EAS','EUR','EUR+AFR','EUR+EAS'))

# Plot average performance across phenotypes for AFR and EAS targets
tmp <- meta_res_eval
tmp <- tmp[tmp$Target != 'EUR',]
tmp <- merge(tmp, pgs_method_labels, by.x = 'Method', by.y = 'method', all.x = T)
tmp$label[is.na(tmp$label)] <- 'All'
tmp$label[grepl('Multi', tmp$Model) & !(tmp$Method %in% pgs_group_methods) & tmp$label != 'All' & !grepl('^tlprs_', tmp$Method)] <- paste0(tmp$label[grepl('Multi', tmp$Model) & !(tmp$Method %in% pgs_group_methods) & tmp$label != 'All' & !grepl('^tlprs_', tmp$Method)], '-multi')
tmp$label <- factor(tmp$label, levels = model_order)
tmp$Discovery_clean <- as.character(tmp$Discovery)
tmp$Discovery_clean[tmp$Discovery == 'EUR'] <- 'EUR GWAS'
tmp$Discovery_clean[tmp$Discovery != 'EUR' & tmp$Source == 'Single'] <- 'Target-matched GWAS'
tmp$Discovery_clean[tmp$Discovery != 'EUR' & tmp$Source == 'Multi'] <- 'Both'
tmp$Discovery_clean <- factor(tmp$Discovery_clean, 
                              levels = c('Target-matched GWAS',
                                         'EUR GWAS',
                                         'Both'))
tmp$Target <- paste0(tmp$Target, ' Target')
tmp$Model[tmp$Model != 'SumStatTune'] <- 'IndivTune'
tmp$Model[tmp$Model == 'SumStatTune'] <- 'SumStatTune'
tmp <- tmp[!duplicated(tmp[, c('label','Target','Discovery_clean','Model'), with=F]),]

dir.create('~/oliverpainfel/Analyses/crosspop/tlprs/plots')

# Plot unidirectional TL-PRS (as it was intended), comparing the unadjusted EUR PGS to the EUR PGS that has been adjusted according to the target-matched GWAS
tmp_tlprs_uni <- tmp[grepl('tlprs', tmp$Method) & !grepl('pop-EUR.top1', tmp$Group) & tmp$Source == 'Single', ]
tmp_tlprs_uni$Type <- 'TL-PRS'
tmp_unadj <- tmp[!grepl('tlprs', tmp$Method) & tmp$Discovery == 'EUR', ]
tmp_unadj$Type <- 'Original'
tmp_both <- rbind(tmp_unadj, tmp_tlprs_uni)
tmp_both$label<-gsub('TL-','',tmp_both$label)
tmp_both$Type<-factor(tmp_both$Type, levels = c('Original','TL-PRS'))

png(paste0('~/oliverpainfel/Analyses/crosspop/tlprs/plots/unidirectional_r.png'), res=300, width = 2000, height = 1600, units = 'px')
ggplot(tmp_both, aes(x=label, y=R , fill = Type)) +
    geom_errorbar(aes(ymin = R - SE, ymax = R + SE),
                  width = 0,
                  position = position_dodge(width = 1)) +
    geom_point(stat="identity", position=position_dodge(1), size=3, shape=23) +
    geom_vline(xintercept = seq(1.5, length(unique(tmp$label))), linetype="dotted") +
    labs(y = "R (SE)", x='Method') +
    facet_grid(Target ~ ., scales='free', space = 'free_x') +
    theme_half_open() +
    background_grid(major = 'y', minor = 'y') + 
    panel_border() + 
    theme(axis.text.x = element_text(angle = 45, vjust = 1, hjust=1),
          legend.position = "top",
          legend.key.spacing.x = unit(1, "cm"),
          legend.justification = "center")
dev.off()

tmp_tlprs_uni <- tmp[grepl('tlprs', tmp$Method) & grepl('pop-EUR.top1', tmp$Group) & tmp$Source == 'Single', ]
tmp_tlprs_uni$Type <- 'TL-PRS'
tmp_unadj <- tmp[!grepl('tlprs', tmp$Method) & tmp$Discovery == 'EUR', ]
tmp_unadj$Type <- 'Original'
tmp_both <- rbind(tmp_unadj, tmp_tlprs_uni)
tmp_both$label<-gsub('TL-','',tmp_both$label)
tmp_both$Type<-factor(tmp_both$Type, levels = c('Original','TL-PRS'))

png(paste0('~/oliverpainfel/Analyses/crosspop/tlprs/plots/unidirectional_r.targ_EUR.png'), res=300, width = 2000, height = 1600, units = 'px')
ggplot(tmp_both, aes(x=label, y=R , fill = Type)) +
    geom_errorbar(aes(ymin = R - SE, ymax = R + SE),
                  width = 0,
                  position = position_dodge(width = 1)) +
    geom_point(stat="identity", position=position_dodge(1), size=3, shape=23) +
    geom_vline(xintercept = seq(1.5, length(unique(tmp$label))), linetype="dotted") +
    labs(y = "R (SE)", x='Method') +
    facet_grid(Target ~ ., scales='free', space = 'free_x') +
    theme_half_open() +
    background_grid(major = 'y', minor = 'y') + 
    panel_border() + 
    theme(axis.text.x = element_text(angle = 45, vjust = 1, hjust=1),
          legend.position = "top",
          legend.key.spacing.x = unit(1, "cm"),
          legend.justification = "center")
dev.off()

# Have one column per method, but fill according to Original EUR PGS, Original matched-PGS, TL-PRS EUR Target, TL-PRS non-EUR Target, TL-PRS Multi, and Original-Multi
tmp <- meta_res_eval
tmp <- tmp[tmp$Target != 'EUR',]
tmp <- merge(tmp, pgs_method_labels, by.x = 'Method', by.y = 'method', all.x = T)
tmp$Type <- NA
tmp$Type[grepl('tlprs', tmp$Method) & grepl('pop-EUR.top1', tmp$Group)]<-"TL-PRS (EUR PGS tuned to target)"
tmp$Type[grepl('tlprs', tmp$Method) & !grepl('pop-EUR.top1', tmp$Group) & tmp$Source == 'Single']<-"TL-PRS (Target-matched PGS tuned to EUR)"
tmp$Type[!grepl('tlprs', tmp$Method) & tmp$Discovery == 'EUR']<-"Original (EUR PGS)"
tmp$Type[!grepl('tlprs', tmp$Method) & tmp$Discovery != 'EUR' & tmp$Source == 'Single']<-"Original (Target-matched PGS)"
tmp$Type[grepl('tlprs', tmp$Method) & grepl('multi', tmp$Group)]<-"TL-PRS-multi"
tmp$Type[!grepl('tlprs', tmp$Method) & grepl('multi', tmp$Group)]<-"Original-multi"
tmp <- tmp[!is.na(tmp$Type),]
tmp$Type<-factor(tmp$Type, levels=c("Original (EUR PGS)", "Original (Target-matched PGS)", "TL-PRS (EUR PGS tuned to target)", "TL-PRS (Target-matched PGS tuned to EUR)", "Original-multi", "TL-PRS-multi"))
tmp$label<-gsub('TL-','',tmp$label)

png(paste0('~/oliverpainfel/Analyses/crosspop/tlprs/plots/average_r.png'), res=300, width = 4000, height = 2200, units = 'px')
ggplot(tmp, aes(x=label, y=R , fill = Type)) +
    geom_errorbar(aes(ymin = R - SE, ymax = R + SE),
                  width = 0,
                  position = position_dodge(width = 1)) +
    geom_point(stat="identity", position=position_dodge(1), size=3, shape=23) +
    geom_vline(xintercept = seq(1.5, length(unique(tmp$label))), linetype="dotted") +
    labs(y = "R (SE)", x='Method') +
    facet_grid(Target ~ ., scales='free', space = 'free_x') +
    theme_half_open() +
    background_grid(major = 'y', minor = 'y') + 
    panel_border() + 
    theme(axis.text.x = element_text(angle = 45, vjust = 1, hjust=1))
dev.off()

#########################
# Check significance of differences between TL-PRS and unadjusted approaches
########################

####
# Create heatmap showing difference between all methods and models
####

# Create a function to mirror pred_comp results
mirror_comp<-function(x){
  x_sym <- x
  x_sym$Model_1 <- x$Model_2
  x_sym$Model_2 <- x$Model_1
  x_sym$Model_1_R <- x$Model_2_R
  x_sym$Model_2_R <- x$Model_1_R
  x_sym$R_diff <- -x_sym$R_diff
  x_mirrored <- rbind(x, x_sym)
  x_diag<-data.frame(
      Model_1=unique(x_mirrored$Model_1),
      Model_2=unique(x_mirrored$Model_1),
      Model_1_R=x_mirrored$Model_1_R,
      Model_2_R=x_mirrored$Model_1_R,
      R_diff=NA,
      R_diff_pval=NA
    )
  x_comp<-rbind(x_mirrored, x_diag)
  return(x_comp)
}
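# mirror_comp() makes the pairwise comparison results symmetric: each row is
# duplicated with Model_1 and Model_2 (and their R values) swapped and R_diff
# negated, and diagonal self-comparisons with NA differences are appended so
# the comparisons can be displayed as a full heatmap.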
  
# Read in results
targ_pop=c('EAS','AFR')
res_comp <- list()
for(pheno_i in selected_traits){
  res_comp_i<-NULL
  for(targ_pop_i in targ_pop){
    if(targ_pop_i == 'EAS'){
      disc_pop <- 'EAS'
    }
    if(targ_pop_i == 'AFR'){
      disc_pop <- 'AFR'
    }
    if(targ_pop_i == 'EUR'){
      disc_pop <- c('EAS','AFR')
    }
    for(disc_pop_i in disc_pop){
      comp_i <-
        fread(
          paste0(
            '/users/k1806347/oliverpainfel/Analyses/crosspop/tlprs/',
            'targ_',
            targ_pop_i,
            '.disc_EUR_',
            disc_pop_i,
            '/',
            pheno_i,
            '/res.tlprs.pred_comp.txt'
          )
        )
      comp_i<-mirror_comp(comp_i)
      comp_i$Target<-targ_pop_i
      comp_i$gwas_group<-paste0('EUR+', disc_pop_i)
      res_comp_i<-rbind(res_comp_i, comp_i)
    }
  }
  
  res_comp[[pheno_i]]<-res_comp_i
}

res_comp_all <- do.call(rbind, lapply(names(res_comp), function(name) {
  x <- res_comp[[name]]
  x$pheno <- name  # Add a new column with the name of the element
  x  # Return the updated dataframe
}))

# Annotate tests to get order correct
res_comp_all$Method1<-sub('\\..*','',res_comp_all$Model_1)
res_comp_all$Method1<-gsub('-.*','', res_comp_all$Method1)
res_comp_all$Method2<-sub('\\..*','',res_comp_all$Model_2)
res_comp_all$Method2<-gsub('-.*','', res_comp_all$Method2)

find_model<-function(x){
  mod <- x
  mod[grepl('top1$', x) & !grepl('pseudo', x)] <- 'IndivTune'
  mod[grepl('top1$', x) & grepl('pseudo', x)] <- 'SumStatTune'
  mod[grepl('multi$', x) & !grepl('pseudo', x)] <- 'Multi-IndivTune'
  mod[grepl('multi$', x) & grepl('pseudo', x)] <- 'Multi-SumStatTune'
  mod[grepl('_multi', x)] <- 'SumStatTune'
  mod[x == 'prscsx.pseudo.multi'] <- 'SumStatTune'
  mod[x == 'xwing.pseudo.multi'] <- 'SumStatTune'
  
  return(mod)
}

res_comp_all$Model1<-find_model(res_comp_all$Model_1)
res_comp_all$Model2<-find_model(res_comp_all$Model_2)

res_comp_all$Source1<-ifelse(res_comp_all$Method1 %in% pgs_group_methods | grepl('_multi$', res_comp_all$Method1) | !grepl('AFR|EAS|EUR', res_comp_all$Model_1), 'Multi', 'Single')
res_comp_all$Source2<-ifelse(res_comp_all$Method2 %in% pgs_group_methods | grepl('_multi$', res_comp_all$Method2) | !grepl('AFR|EAS|EUR', res_comp_all$Model_2), 'Multi', 'Single')
  
for(i in c('EUR','EAS','AFR')){
  res_comp_all$Discovery1[grepl(i, res_comp_all$Model_1)] <- i
  res_comp_all$Discovery2[grepl(i, res_comp_all$Model_2)] <- i
}
res_comp_all$Discovery1[res_comp_all$Source1 == 'Multi'] <- res_comp_all$gwas_group[res_comp_all$Source1 == 'Multi']
res_comp_all$Discovery2[res_comp_all$Source2 == 'Multi'] <- res_comp_all$gwas_group[res_comp_all$Source2 == 'Multi']

res_comp_all$Method1<-factor(res_comp_all$Method1, levels=unique(res_comp_all$Method1))
res_comp_all$Method2<-factor(res_comp_all$Method2, levels=unique(res_comp_all$Method2))
res_comp_all$Model1<-factor(res_comp_all$Model1, levels=c('IndivTune','SumStatTune','Multi-IndivTune','Multi-SumStatTune'))
res_comp_all$Model2<-factor(res_comp_all$Model2, levels=c('IndivTune','SumStatTune','Multi-IndivTune','Multi-SumStatTune'))
res_comp_all$Discovery1<-factor(res_comp_all$Discovery1, levels=rev(c('AFR','EAS','EUR','EUR+AFR','EUR+EAS')))
res_comp_all$Discovery2<-factor(res_comp_all$Discovery2, levels=c('AFR','EAS','EUR','EUR+AFR','EUR+EAS'))

# Remove IndivTune and Multi-IndivTune model for groups that contain one score (aka QuickPRS and SBayesRC)
res_comp_all <- res_comp_all[
!(res_comp_all$Method1 %in%  c('quickprs','sbayesrc') & 
  res_comp_all$Model1 %in% c('IndivTune','Multi-IndivTune')),]
res_comp_all <- res_comp_all[
!(res_comp_all$Method2 %in%  c('quickprs','sbayesrc') & 
  res_comp_all$Model2 %in% c('IndivTune','Multi-IndivTune')),]

# Remove pseudo model for methods that don't really have one 
res_comp_all <- res_comp_all[
!(res_comp_all$Method1 %in%  c('ptclump','ptclump_multi') & 
  res_comp_all$Model1 %in% c('SumStatTune','Multi-SumStatTune')),]
res_comp_all <- res_comp_all[
!(res_comp_all$Method2 %in%  c('ptclump','ptclump_multi') & 
  res_comp_all$Model2 %in% c('SumStatTune','Multi-SumStatTune')),]

# Remove top1 models for PRS-CSx
res_comp_all <- res_comp_all[
!(grepl('prscsx|xwing|_multi', res_comp_all$Method1) & 
  grepl('top1', res_comp_all$Model_1)),]
res_comp_all <- res_comp_all[
!(grepl('prscsx|xwing|_multi', res_comp_all$Method2) & 
  grepl('top1', res_comp_all$Model_2)),]

# Remove any duplicate comparisons
res_comp_all <- res_comp_all[!duplicated(res_comp_all[, c("Target", "Method1", "Model1", "Source1", "Discovery1", "Method2", "Model2", "Source2", "Discovery2",'pheno')]),]

###########

library(MAd)

# Average R across phenotypes
meta_res_comp <- NULL
for(targ_pop_i in targ_pop){
  if(targ_pop_i == 'EAS'){
    disc_pop <- 'EAS'
  }
  if(targ_pop_i == 'AFR'){
    disc_pop <- 'AFR'
  }
  if(targ_pop_i == 'EUR'){
    disc_pop <- c('EAS','AFR')
  }
  for(disc_pop_i in disc_pop){
  
    # Subset res_comp for each scenario
    res_comp_i <- res_comp_all[res_comp_all$Target == targ_pop_i & res_comp_all$gwas_group == paste0('EUR+', disc_pop_i)]
  
    # Calculate diff SE based on p-value
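    # The SE is recovered from the two-sided p-value: z = qnorm(p/2) gives the
    # (negative) z-score, so SE = |R_diff / z|. For example, R_diff = 0.02 with
    # p = 0.05 gives z of about -1.96 and SE of about 0.0102.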
    res_comp_i$R_diff_pval[res_comp_i$R_diff == 0] <- 1-0.001
    res_comp_i$R_diff_pval[res_comp_i$R_diff_pval == 1]<-1-0.001
    res_comp_i$R_diff_z<-qnorm(res_comp_i$R_diff_pval/2)
    res_comp_i$R_diff_SE<-abs(res_comp_i$R_diff/res_comp_i$R_diff_z)
        
    # Average results for each test across phenotypes
    # Use MAd to account for correlation between them
    res_comp_i$Sample<-'A'
    res_comp_i$Group <- paste0(res_comp_i$Model_1, '_vs_', res_comp_i$Model_2)
  
    for(group_i in unique(res_comp_i$Group)){
      res_comp_group_i <- res_comp_i[res_comp_i$Group == group_i,]
      cors_i <- cors[[targ_pop_i]][unique(res_comp_group_i$pheno), unique(res_comp_group_i$pheno)]
      
      if(res_comp_group_i$Model_1[1] != res_comp_group_i$Model_2[1]){
        
        meta_res_comp_i <-
          agg(
            id = Sample,
            es = R_diff,
            var = R_diff_SE ^ 2,
            cor = cors_i,
            method = "BHHR",
            mod = NULL,
            data = res_comp_group_i
          )
        
        tmp <- res_comp_group_i[1,]
        tmp$pheno <- NULL
        tmp$Model_1_R <-
          meta_res_eval$R[meta_res_eval$Group == tmp$Model_1 &
                            meta_res_eval$Target == targ_pop_i &
                            meta_res_eval$gwas_group == paste0('EUR+', disc_pop_i)]
        tmp$Model_2_R <-
          meta_res_eval$R[meta_res_eval$Group == tmp$Model_2 &
                            meta_res_eval$Target == targ_pop_i &
                            meta_res_eval$gwas_group == paste0('EUR+', disc_pop_i)]
        tmp$R_diff <- meta_res_comp_i$es
        tmp$R_diff_SE <- sqrt(meta_res_comp_i$var)
        tmp$R_diff_z <- tmp$R_diff / tmp$R_diff_SE
        tmp$R_diff_p <- 2*pnorm(-abs(tmp$R_diff_z))
      } else {
        tmp <- res_comp_group_i[1,]
        tmp$pheno <- NULL
        tmp$R_diff <- NA
        tmp$R_diff_SE <- NA
        tmp$R_diff_z <- NA
        tmp$R_diff_p <- NA
      }
      meta_res_comp <- rbind(meta_res_comp, tmp)
    }
  }
}

meta_res_comp$R_diff_perc <- meta_res_comp$R_diff / meta_res_comp$Model_2_R
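# R_diff_perc scales each R difference by the comparison model's R (Model_2_R),
# so differences can be interpreted as a proportion of the comparator's performance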
  
# Compare IndivTune SBayesRC-multi to TL-SBayesRC-multi
tmp_sbayesrc <- meta_res_comp[meta_res_comp$Model_2 == 'sbayesrc.pseudo.multi' & 
                                meta_res_comp$Model_1 == 'tlprs_sbayesrc.pop.multi' &
                    meta_res_comp$Target == 'AFR',]
round(min(tmp_sbayesrc$R_diff_perc)*100, 1)
tmp_sbayesrc$R_diff_p

tmp_sbayesrc <- meta_res_comp[meta_res_comp$Model_2 == 'sbayesrc.pseudo.multi' & 
                                meta_res_comp$Model_1 == 'tlprs_sbayesrc.pop.multi' &
                    meta_res_comp$Target == 'EAS',]
round(min(tmp_sbayesrc$R_diff_perc)*100, 1)
tmp_sbayesrc$R_diff_p
Show TL-PRS results


Computational resources

Show code
library(data.table)
library(ggplot2)
library(cowplot)

setwd('~/oliverpainfel/Software/MyGit/GenoPred/pipeline/')
source('../functions/misc.R')
source_all('../functions')

# Get some key variables from config
config<-'/users/k1806347/oliverpainfel/Data/ukb/GenoPred/configs/tlprs/config.yaml'
pgs_methods <- read_param(config = config, param = 'pgs_methods', return_obj = F)
outdir <- read_param(config = config, param = 'outdir', return_obj = F)

# Read in configuration specific benchmark files
bm_files_i <- list.files(paste0(outdir, '/reference/benchmarks/'), full.names = T)

# Subset benchmarks for pgs_methods
bm_files_i <- bm_files_i[grepl('prep_pgs_tlprs', bm_files_i)]

# Read in benchmark files
bm_dat_all <- do.call(rbind, lapply(bm_files_i, function(file) {
  tmp <- fread(file)
  tmp$file <- basename(file)
  return(tmp)
}))

# Create rule column
bm_dat_all$rule <- gsub('-.*','',bm_dat_all$file)

# Create method column
bm_dat_all$method <-
  gsub('_i', '', gsub('prep_pgs_', '', bm_dat_all$rule))

#############
# Time
#############

# Calculate average time taken for each method
method_avg <- NULL
for(i in unique(bm_dat_all$method)){
  method_avg <- rbind(
    method_avg,
    data.frame(
      Method = i,
      Time = mean(bm_dat_all$s[bm_dat_all$method == i])
    )
  )
}

# Convert time in seconds to hours
method_avg$Time_hour <- method_avg$Time / 60/60
method_avg$Time_hour <- round(method_avg$Time_hour, 2)

# This is for bidirectional TL-PRS

#############
# Memory
#############

# Calculate average max_rss for each method
method_avg_mem <- NULL
for(i in unique(bm_dat_all$method)){
  method_avg_mem <- rbind(
    method_avg_mem,
    data.frame(
      Method = i,
      Memory = mean(bm_dat_all$max_rss[bm_dat_all$method == i])
    )
  )
}

# Format the Memory nicely
method_avg_mem$Memory_clean <-
  paste0(round(method_avg_mem$Memory/1000, 2), ' Gb')

ggplot(method_avg_mem, aes(x = Method, y = Memory, fill = Method)) +
  geom_bar(stat = "identity", position="dodge") +
  geom_text(aes(label = Memory_clean), vjust = -0.5, position = position_dodge(width = 0.9)) +
  labs(x = "PGS Method", y = "Memory (Mb)") +
  theme_half_open() +
  background_grid() +
  theme(axis.text.x = element_text(angle = 45, hjust = 1), legend.position="none")

method_avg_mem$Memory_gb <- method_avg_mem$Memory/1000
method_avg_mem <- method_avg_mem[, c('Method','Memory_gb')]
method_avg_mem$Memory_gb <- round(method_avg_mem$Memory_gb, 2)
names(method_avg_mem)<-c('Method',"Memory (Gb)")

method_avg<-merge(method_avg, method_avg_mem, by = 'Method')

write.csv(method_avg, '~/oliverpainfel/Analyses/crosspop/time_memory_tlprs.csv', row.names=F)
Show computational resources table
Method   Time (s)   Time (hrs)   Memory (Gb)
tlprs    2614.237         0.73         31.38

Sensitivity analyses


Using 1KG reference

PRS-CS, PRS-CSx and X-Wing all use the 1KG reference sample, whereas the other methods use the 1KG+HGDP reference sample. We should check whether this difference impacts our conclusions.

To make this quicker, focus on evaluating the PGS methods in the AFR and EAS target individuals in UKB. This will avoid reprocessing the full UKB data.


Create 1KG only GenoPred reference data

Subset the 1KG+HGDP reference data to include only 1KG individuals.

Show code
mkdir -p ~/oliverpainfel/Data/1kg/genopred/
cp -r ~/oliverpainfel/Data/hgdp_1kg/genopred/ref ~/oliverpainfel/Data/1kg/genopred/
rm ~/oliverpainfel/Data/1kg/genopred/ref/ref.chr*.p*
library(data.table)

ref<- fread('/users/k1806347/oliverpainfel/Data/hgdp_1kg/genopred/ref/ref.chr1.psam')
ref<-ref[ref$Project == 'gnomAD_1kG',]

write.table(ref[,1, drop = F], '~/oliverpainfel/Data/1kg/1kg.keep', col.names=F, row.names=F, quote=F)
for chr in $(seq 1 22); do
  ~/oliverpainfel/Software/plink2 \
    --pfile ~/oliverpainfel/Data/hgdp_1kg/genopred/ref/ref.chr${chr} \
    --keep ~/oliverpainfel/Data/1kg/1kg.keep \
    --make-pgen \
    --out ~/oliverpainfel/Data/1kg/genopred/ref/ref.chr${chr}
done

PGS calculation

To save time, run only the PGS methods that do not require pre-processed LD matrix data (ptclump, dbslmm, megaprs, lassosum). If the results differ from the 1KG+HGDP results, expand to the other methods (LDpred2, SBayesRC, QuickPRS).

Show code

Prepare configuration

library(data.table)

dir.create('/users/k1806347/oliverpainfel/Data/ukb/GenoPred/configs/eas_afr_only')

######
# config
######

config<-c(
  "outdir: /users/k1806347/oliverpainfel/Data/ukb/GenoPred/output_1kgref",
  "refdir: /users/k1806347/oliverpainfel/Data/1kg/genopred/ref",
  "resdir: /users/k1806347/oliverpainfel/Data/ukb/GenoPred/resdir_1kgref",
  "config_file: /users/k1806347/oliverpainfel/Data/ukb/GenoPred/configs/eas_afr_only/config.yaml",
  "gwas_list: /users/k1806347/oliverpainfel/Data/ukb/GenoPred/configs/crosspop/gwas_list.txt",
  "target_list: /users/k1806347/oliverpainfel/Data/ukb/GenoPred/configs/eas_afr_only/target_list.txt",
  "gwas_groups: /users/k1806347/oliverpainfel/Data/ukb/GenoPred/configs/crosspop/gwas_groups.txt",
  "pgs_methods: ['ptclump','dbslmm','lassosum','megaprs']",
#  "leopard_methods: ['ptclump','dbslmm','lassosum','megaprs']",
  "cores_prep_pgs: 10",
  "cores_target_pgs: 10"
)

write.table(config, '/users/k1806347/oliverpainfel/Data/ukb/GenoPred/configs/eas_afr_only/config.yaml', col.names = F, row.names = F, quote = F)

Run pipeline

snakemake \
  --profile slurm \
  --use-conda \
  --configfile=/users/k1806347/oliverpainfel/Data/ukb/GenoPred/configs/eas_afr_only/config.yaml \
  target_pgs -n

PGS evaluation

Show code

Create predictor list

setwd('~/oliverpainfel/Software/MyGit/GenoPred/pipeline/')
source('../functions/misc.R')
source_all('../functions')
library(data.table)

# Get some key variables from config
config<-'/users/k1806347/oliverpainfel/Data/ukb/GenoPred/configs/eas_afr_only/config.yaml'
pgs_methods <- read_param(config = config, param = 'pgs_methods', return_obj = F)
outdir <- read_param(config = config, param = 'outdir', return_obj = F)

# Read in list of outcomes 
selected_traits<-fread('/users/k1806347/oliverpainfel/Analyses/crosspop/trait_subset.txt', header=F)$V1

# Get a list of score files
scores <- list_score_files(config)

# Create files for EAS and AFR targets
targ_pop <- c('EAS','AFR')
for(trait_i in selected_traits){
  scores_i <- scores[grepl(trait_i, scores$name),]
  scores_i$multi <- scores_i$method
  
  for(targ_pop_i in targ_pop){
    if(targ_pop_i == 'EAS'){
      disc_pop <- 'BBJ'
    }
    if(targ_pop_i == 'AFR'){
      disc_pop <- 'UGR'
    }
    if(targ_pop_i == 'EUR'){
      disc_pop <- c('BBJ','UGR')
    }
    
    for(disc_pop_j in disc_pop){
      if(disc_pop_j == 'BBJ'){
        disc_pop_j_2 <- 'EAS'
      }
      if(disc_pop_j == 'UGR'){
        disc_pop_j_2 <- 'AFR'
      }

      dir.create(
        paste0(
          '/users/k1806347/oliverpainfel/Analyses/crosspop/sensitivity_1kgref/targ_',
          targ_pop_i,
          '.disc_EUR_',
          disc_pop_j_2,
          '/',
          trait_i
        ),
        recursive = T
      )
      
      scores_i_j <- scores_i[
        (grepl('UKB$', scores_i$name, ignore.case = F) | 
         grepl(paste0(disc_pop_j, '$'), scores_i$name, ignore.case = T)),]

      # Insert path to score file
      scores_i_j$predictor <- paste0(
        outdir,
        '/ukb/pgs/TRANS/',
        scores_i_j$method,
        '/',
        scores_i_j$name,
        '/ukb-',
        scores_i_j$name,
        '-TRANS.profiles'
      )
      
      ####
      # Make groups single source methods
      ####
      
      scores_i_j_single_top1 <-
        scores_i_j[!(scores_i_j$method %in% pgs_group_methods) &
                     !grepl('_multi$', scores_i_j$method), ]

      # Create top1 column indicating which predictors top1 models should be derived
      scores_i_j_single_top1$top1[grepl('UKB', scores_i_j_single_top1$name, ignore.case = F)] <- 'EUR'
      scores_i_j_single_top1$top1[grepl(disc_pop_j, scores_i_j_single_top1$name, ignore.case = F)] <- disc_pop_j_2
      
      ####
      # Make groups containing pseudo scores for single source methods
      ####

      # Extract the pseudo score for each method and specify as a separate group
      for(i in 1:nrow(scores_i_j_single_top1)) {
        param <- find_pseudo(
          config = config,
          gwas = scores_i_j_single_top1$name[i],
          pgs_method = scores_i_j_single_top1$method[i],
          target_pop = targ_pop_i
        )
        
        score_header <-
          fread(scores_i_j_single_top1$predictor[i], nrows = 1)
        score_cols <-
          which(names(score_header) %in% c('FID', 'IID', paste0(scores_i_j_single_top1$name[i], '_', param)))
        
        system(
          paste0(
            "cut -d' ' -f ", 
            paste0(score_cols, collapse=','),
            " ", 
            scores_i_j_single_top1$predictor[i], 
            " > ", 
            gsub('.profiles',
                 paste0('.', targ_pop_i, '_pseudo.profiles'),
                 scores_i_j_single_top1$predictor[i])
          )
        )
      }
      
      scores_i_j_single_pseudo <- scores_i_j_single_top1
      scores_i_j_single_pseudo$multi <- paste0(scores_i_j_single_pseudo$multi, '.pseudo')

      scores_i_j_single_pseudo$predictor <- gsub('.profiles', 
                                    paste0('.', targ_pop_i, '_pseudo.profiles'),
                                    scores_i_j_single_pseudo$predictor)

#      ####
#      # Make groups for multi-single-source pseudo scores
#      ####
#      
#      scores_i_j_multi_single_pseudo <- scores_i_j[grepl('_multi$', scores_i_j$method),]
#
#      # Extract the pseudo score for each method and specify as a separate group
#      for(i in 1:nrow(scores_i_j_multi_single_pseudo)) {
#        param <- find_pseudo(
#          config = config,
#          gwas = scores_i_j_multi_single_pseudo$name[i],
#          pgs_method = scores_i_j_multi_single_pseudo$method[i],
#          target_pop = targ_pop_i
#        )
#        
#        score_header <-
#          fread(scores_i_j_multi_single_pseudo$predictor[i], nrows = 1)
#        score_cols <-
#          which(names(score_header) %in% c('FID', 'IID', paste0(scores_i_j_multi_single_pseudo$name[i], '_', param)))
#        
#        system(
#          paste0(
#            "cut -d' ' -f ", 
#            paste0(score_cols, collapse=','),
#            " ", 
#            scores_i_j_multi_single_pseudo$predictor[i], 
#            " > ", 
#            gsub('.profiles',
#                 paste0('.', targ_pop_i, '_pseudo.profiles'),
#                 scores_i_j_multi_single_pseudo$predictor[i])
#          )
#        )
#      }
#      
#      scores_i_j_multi_single_pseudo$multi <- paste0(scores_i_j_multi_single_pseudo$multi, '.pseudo')
#
#      scores_i_j_multi_single_pseudo$predictor <- gsub('.profiles', 
#                                    paste0('.', targ_pop_i, '_pseudo.profiles'),
#                                    scores_i_j_multi_single_pseudo$predictor)
#      
#      scores_i_j_multi_single_pseudo$top1<-paste0('EUR_', disc_pop_j_2)
#
#      ####
#      # Make groups for the Multi-Source methods
#      ####
#      
#      scores_i_j_multi <- scores_i_j[(scores_i_j$method %in% pgs_group_methods),]
#
#      # Split top1 scores by target population
#      # This doesn't apply to xwing because it only has pop-specific pseudo scores
#      scores_i_j_multi_top1<-NULL
#      for(i in 1:which(scores_i_j_multi$method %in% c('prscsx'))){
#        score_header<-fread(scores_i_j_multi$predictor[i], nrow = 1)
#        
#        for(pop in c('EUR', disc_pop_j_2)){
#          
#          if(scores_i_j_multi$method[i] == 'prscsx'){
#            score_cols <-
#              which(grepl(paste0('^FID$|^IID$|_', pop, '_phi'), names(score_header)))
#          }
#          if(scores_i_j_multi$method[i] == 'xwing'){
#            score_cols <-
#              which(grepl(paste0('^FID$|^IID$|_targ_', pop, '_pst'), names(score_header)))
#          }
#          
#          system(
#            paste0(
#              "cut -d' ' -f ", 
#              paste0(score_cols, collapse=','),
#              " ", 
#              scores_i_j_multi$predictor[i], 
#              " > ", 
#              gsub('.profiles',
#                   paste0('.', pop, '_grid.profiles'),
#                   scores_i_j_multi$predictor[i])
#            )
#          )
#          
#          tmp <- scores_i_j_multi[i,]
#          tmp$multi <- paste0(tmp$multi, '.grid')
#          tmp$top1 <- pop
#          tmp$predictor <-
#              gsub('.profiles',
#                   paste0('.', pop, '_grid.profiles'),
#                   scores_i_j_multi$predictor[i])
#          
#          scores_i_j_multi_top1 <- rbind(scores_i_j_multi_top1, tmp)
#        }
#      }
#
#      # Split pop-specific pseudo scores by target population
#      scores_i_j_multi_pop_pseudo<-NULL
#      for(i in 1:nrow(scores_i_j_multi)){
#        score_header<-fread(scores_i_j_multi$predictor[i], nrow = 1)
#        
#        for(pop in c('EUR', disc_pop_j_2)){
#          if(scores_i_j_multi$method[i] == 'prscsx'){
#            score_cols <-
#              which(grepl(paste0('^FID$|^IID$|_', pop, '_phi_auto'), names(score_header)))
#          }
#          if(scores_i_j_multi$method[i] == 'xwing'){
#            score_cols <-
#              which(grepl(paste0('^FID$|^IID$|_targ_', pop, '_pst_', pop), names(score_header)))
#          }
#          
#          system(
#            paste0(
#              "cut -d' ' -f ", 
#              paste0(score_cols, collapse=','),
#              " ", 
#              scores_i_j_multi$predictor[i], 
#              " > ", 
#              gsub('.profiles',
#                   paste0('.', pop, '_pseudo.profiles'),
#                   scores_i_j_multi$predictor[i])
#            )
#          )
#          
#          tmp <- scores_i_j_multi[i,]
#          tmp$multi <- paste0(tmp$multi, '.pop_pseudo')
#          tmp$top1 <- pop
#          tmp$predictor <-
#              gsub('.profiles',
#                   paste0('.', pop, '_pseudo.profiles'),
#                   scores_i_j_multi$predictor[i])
#          
#          scores_i_j_multi_pop_pseudo <- rbind(scores_i_j_multi_pop_pseudo, tmp)
#        }
#      }
#      
#      # Create pseudo score for multi-source methods
#      scores_i_j_multi_pseudo<-NULL
#      for(i in 1:nrow(scores_i_j_multi)) {
#        param <- find_pseudo(
#          config = config,
#          gwas = scores_i_j_multi$name[i],
#          pgs_method = scores_i_j_multi$method[i],
#          target_pop = targ_pop_i
#        )
#        
#        score_header <-
#          fread(scores_i_j_multi$predictor[i], nrows = 1)
#        score_cols <-
#          which(names(score_header) %in% c('FID', 'IID', paste0(scores_i_j_multi$name[i], '_', param)))
#
#        system(
#          paste0(
#            "cut -d' ' -f ", 
#            paste0(score_cols, collapse=','),
#            " ", 
#            scores_i_j_multi$predictor[i], 
#            " > ", 
#            gsub('.profiles',
#                 paste0('.pseudo.targ_', targ_pop_i,'.profiles'),
#                 scores_i_j_multi$predictor[i])
#          )
#        )
#        
#        tmp <- scores_i_j_multi[i,]
#        tmp$multi <- paste0(tmp$multi, '.pseudo')
#        tmp$top1 <- paste0('EUR_', disc_pop_j_2)
#        tmp$predictor <-
#            gsub('.profiles',
#                 paste0('.pseudo.targ_', targ_pop_i,'.profiles'),
#                 scores_i_j_multi$predictor[i])
#        
#        scores_i_j_multi_pseudo <- rbind(scores_i_j_multi_pseudo, tmp)
#      }
      
      ####
      # Combine the different predictor groups
      ####
      predictors_i<- do.call(rbind, list(
        scores_i_j_single_top1, 
        scores_i_j_single_pseudo#, 
#        scores_i_j_multi_single_pseudo,
#        scores_i_j_multi_top1,
#        scores_i_j_multi_pop_pseudo,
#        scores_i_j_multi_pseudo
      ))
      
      predictors_i <- predictors_i[, c('predictor', 'multi','top1'), with=F]
      
      write.table(
        predictors_i,
        paste0(
          '/users/k1806347/oliverpainfel/Analyses/crosspop/sensitivity_1kgref/targ_',
          targ_pop_i,
          '.disc_EUR_',
          disc_pop_j_2,
          '/',
          trait_i,
          '/predictor_list.txt'
        ),
        col.names = T,
        row.names = F,
        quote = F
      )
    }
  }
}

Run model_builder

cd /users/k1806347/oliverpainfel/Software/MyGit/GenoPred/pipeline
conda activate model_builder

#rm /users/k1806347/oliverpainfel/Analyses/crosspop/sensitivity_1kgref/targ_*.disc_EUR_*/*/res*

for targ_pop in $(echo EAS AFR); do
  if [ "$targ_pop" == "EUR" ]; then
      targ_pop2="EUR_test"
  else
      targ_pop2=$targ_pop
  fi
  
  if [ "$targ_pop" == "EUR" ]; then
    disc_pop=$(echo EAS AFR)
  fi
  
  if [ "$targ_pop" == "EAS" ]; then
    disc_pop="EAS"
  fi
  
  if [ "$targ_pop" == "AFR" ]; then
    disc_pop="AFR"
  fi
  
  for disc_pop_i in ${disc_pop}; do
    for pheno in $(cat /users/k1806347/oliverpainfel/Analyses/crosspop/trait_subset.txt); do
      if [ ! -f "/users/k1806347/oliverpainfel/Analyses/crosspop/sensitivity_1kgref/targ_${targ_pop}.disc_EUR_${disc_pop_i}/${pheno}/res.pred_comp.txt" ]; then
        sbatch --mem 10G -n 5 -p neurohack_cpu,interruptible_cpu -t 1:00:00 --wrap="Rscript ../Scripts/model_builder/model_builder_top1.R \
          --outcome /users/k1806347/oliverpainfel/Data/ukb/phenotypes/prscsx/${pheno}.unrel.${targ_pop2}.row_number.txt \
          --predictors /users/k1806347/oliverpainfel/Analyses/crosspop/sensitivity_1kgref/targ_${targ_pop}.disc_EUR_${disc_pop_i}/${pheno}/predictor_list.txt \
          --out /users/k1806347/oliverpainfel/Analyses/crosspop/sensitivity_1kgref/targ_${targ_pop}.disc_EUR_${disc_pop_i}/${pheno}/res \
          --n_core 5"
      fi
    done
  done
done

Plot results

setwd('/users/k1806347/oliverpainfel/Software/MyGit/GenoPred/pipeline/')

library(data.table)
library(ggplot2)
library(cowplot)

source('../functions/misc.R')
source_all('../functions')

# Read in list of outcomes 
selected_traits<-fread('/users/k1806347/oliverpainfel/Analyses/crosspop/trait_subset.txt', header=F)$V1
info_all <- fread('~/oliverpainfel/Analyses/crosspop/gwas_descriptives.csv')

# Calculate correlation between all phenotypes in each target population
cors <- list()
for(pop_i in c('EUR','EAS','AFR','CSA','AMR')){
  if(pop_i == 'EUR'){
    pop_i_2 <- 'EUR_test'
  } else {
    pop_i_2 <- pop_i
  }
  pheno_pop_i <- list()
  for(pheno_i in selected_traits){
    pheno_pop_i[[pheno_i]] <- fread(paste0('/users/k1806347/oliverpainfel/Data/ukb/phenotypes/prscsx/', pheno_i, '.unrel.', pop_i_2, '.row_number.txt'))
    names(pheno_pop_i[[pheno_i]])[3] <- pheno_i
  }
  
  pheno_pop_i_merged <- Reduce(function(x, y) merge(x, y, all = TRUE, by = c('FID','IID')), pheno_pop_i)

  cors_i <- abs(cor(as.matrix(pheno_pop_i_merged[,-1:-2, with=F]), use='p'))
  cors[[pop_i]] <- cors_i
}

# Read in results
targ_pop = c('EAS','AFR')
res_eval <- list()
for(pheno_i in selected_traits){
  res_eval_i<-NULL
  for(targ_pop_i in targ_pop){
    if(targ_pop_i == 'EAS'){
      disc_pop <- 'EAS'
    }
    if(targ_pop_i == 'AFR'){
      disc_pop <- 'AFR'
    }
    if(targ_pop_i == 'EUR'){
      disc_pop <- c('EAS','AFR')
    }
    for(disc_pop_i in disc_pop){
      eval_i <-
        fread(
          paste0(
            '/users/k1806347/oliverpainfel/Analyses/crosspop/sensitivity_1kgref/',
            'targ_',
            targ_pop_i,
            '.disc_EUR_',
            disc_pop_i,
            '/',
            pheno_i,
            '/res.pred_eval.txt'
          )
        )
      eval_i$Target<-targ_pop_i
      eval_i$gwas_group<-paste0('EUR+', disc_pop_i)
      res_eval_i<-rbind(res_eval_i, eval_i)
    }
  }
  
  res_eval_i$Method<-sub('\\..*','',res_eval_i$Group)
  res_eval_i$Method<-gsub('-.*','', res_eval_i$Method)
  
  res_eval_i$Model[grepl('top1$', res_eval_i$Group) &
                   !grepl('pseudo', res_eval_i$Group)]<-'IndivTune'
  res_eval_i$Model[grepl('top1$', res_eval_i$Group) &
                   grepl('pseudo', res_eval_i$Group)]<-'SumStatTune'
  res_eval_i$Model[grepl('multi$', res_eval_i$Group) &
                   !grepl('pseudo', res_eval_i$Group)]<-'Multi-IndivTune'
  res_eval_i$Model[grepl('multi$', res_eval_i$Group) &
                   grepl('pseudo', res_eval_i$Group)]<-'Multi-SumStatTune'
  
  res_eval_i$Model[grepl('_multi', res_eval_i$Group)]<-'SumStatTune'
  res_eval_i$Model[res_eval_i$Group == 'prscsx.pseudo.multi']<-'SumStatTune'
  res_eval_i$Model[res_eval_i$Group == 'xwing.pseudo.multi']<-'SumStatTune'
  
  res_eval_i$Source<-ifelse(
    res_eval_i$Method %in% pgs_group_methods | grepl('_multi$', res_eval_i$Method) | 
    !grepl('EUR|EAS|AFR', res_eval_i$Group), 'Multi', 'Single')
  
  res_eval_i$Discovery[grepl('EUR', res_eval_i$Group)] <- 'EUR'
  res_eval_i$Discovery[grepl('EAS', res_eval_i$Group)] <- 'EAS'
  res_eval_i$Discovery[grepl('AFR', res_eval_i$Group)] <- 'AFR'
  res_eval_i$Discovery[res_eval_i$Source == 'Multi'] <- res_eval_i$gwas_group[res_eval_i$Source == 'Multi']
  
  res_eval_i$Method<-factor(res_eval_i$Method, levels=unique(res_eval_i$Method))
  res_eval_i$Model<-factor(res_eval_i$Model, levels=c('IndivTune','SumStatTune','Multi-IndivTune','Multi-SumStatTune'))
  res_eval_i$Discovery<-factor(res_eval_i$Discovery, levels=c('AFR','EAS','EUR','EUR+AFR','EUR+EAS'))

  # Remove IndivTune and Multi-IndivTune model for groups that contain one score (aka QuickPRS and SBayesRC)
  res_eval_i <- res_eval_i[
    !(res_eval_i$Method %in%  c('quickprs','sbayesrc') & 
      res_eval_i$Model %in% c('IndivTune','Multi-IndivTune')),]
  
  # Remove pseudo model for methods that don't really have one 
  res_eval_i <- res_eval_i[
    !(res_eval_i$Method %in%  c('ptclump','ptclump_multi') & 
      res_eval_i$Model %in% c('SumStatTune','Multi-SumStatTune')),]

  # Remove top1 models for *-Multi, PRS-CSx, X-wing
  res_eval_i <- res_eval_i[
    !((res_eval_i$Method %in%  c('prscsx', 'xwing') | grepl('_multi$', res_eval_i$Method)) & 
      grepl('top1', res_eval_i$Group)),]
  
  # Remove any duplicate models
  res_eval_i <- res_eval_i[!duplicated(res_eval_i[, c(
    "Target", "Method", "Model", "Source", "Discovery","gwas_group"
  )]),]
  
  res_eval[[pheno_i]]<-res_eval_i
  
}

# Create vector defining the order of methods in plots
model_order <- c("DBSLMM", "lassosum", "LDpred2", "MegaPRS", "PRS-CS", "pT+clump", "QuickPRS", "SBayesRC", "DBSLMM-multi", "lassosum-multi", "LDpred2-multi", "MegaPRS-multi", "PRS-CS-multi", "pT+clump-multi", "QuickPRS-multi", "SBayesRC-multi", "PRS-CSx", "X-Wing", "All") 

res_eval_simp <- NULL
for(pheno_i in selected_traits){
  tmp <- res_eval[[pheno_i]]
  tmp$Trait <- pheno_i
  
  # Insert nice PGS method names
  tmp <- merge(tmp, pgs_method_labels, by.x = 'Method', by.y = 'method', all.x = T)
  tmp$label[is.na(tmp$label)] <- 'All'
  tmp$label[grepl('Multi', tmp$Model) & !(tmp$Method %in% pgs_group_methods) & tmp$label != 'All'] <- paste0(tmp$label[grepl('Multi', tmp$Model) & !(tmp$Method %in% pgs_group_methods) & tmp$label != 'All'], '-multi')
  tmp$label <- factor(tmp$label, levels = model_order)
  
  # Simplify result to either SumStatTune or IndivTune
  tmp$Model[tmp$Model != 'SumStatTune'] <- 'IndivTune'
  tmp$Model[tmp$Model == 'SumStatTune'] <- 'SumStatTune'
  tmp <- tmp[!duplicated(tmp[, c('label','Target','Discovery','Model'), with=F]),]
  
  res_eval_simp <- rbind(res_eval_simp, tmp)
}

####
# Average results across phenotypes
####

library(MAd)

# Average R across phenotypes
meta_res_eval <- NULL
for(targ_pop_i in targ_pop){
  if(targ_pop_i == 'EAS'){
    disc_pop <- 'EAS'
  }
  if(targ_pop_i == 'AFR'){
    disc_pop <- 'AFR'
  }
  if(targ_pop_i == 'EUR'){
    disc_pop <- c('EAS','AFR')
  }
  for(disc_pop_i in disc_pop){
  
    # Subset res_eval for each scenario
    res_eval_i <- do.call(rbind, lapply(seq_along(res_eval), function(i) {
      x <- res_eval[[i]]
      x$pheno <- names(res_eval)[i]
      x <- x[x$Target == targ_pop_i]
      x <- x[x$gwas_group == paste0('EUR+', disc_pop_i)]
    }))
    
    # Average results for each test across phenotypes
    # Use MAd to account for correlation between them
    res_eval_i$Sample<-'A'
  
    for(group_i in unique(res_eval_i$Group)){
      res_eval_group_i <- res_eval_i[res_eval_i$Group == group_i,]
      missing_pheno <-
        colnames(cors[[targ_pop_i]])[!(colnames(cors[[targ_pop_i]]) %in% unique(res_eval_group_i$pheno))]
      
      if (!all(colnames(cors[[targ_pop_i]]) %in% unique(res_eval_group_i$pheno))) {
        print(paste0(
          'Results missing for ',
          targ_pop_i,
          ' ',
          group_i,
          ' ',
          paste0(missing_pheno, collapse = ' ')
        ))
      }
      
      cors_i <- cors[[targ_pop_i]][unique(res_eval_group_i$pheno), unique(res_eval_group_i$pheno)]
      
      meta_res_eval_i <-
        agg(
          id = Sample,
          es = R,
          var = SE ^ 2,
          cor = cors_i,
          method = "BHHR",
          mod = NULL,
          data = res_eval_group_i
        )
      
      tmp <- data.table(Group = group_i,
                        Method = res_eval_group_i$Method[1],
                        Model = res_eval_group_i$Model[1],
                        Source = res_eval_group_i$Source[1],
                        Discovery = res_eval_group_i$Discovery[1],
                        gwas_group = res_eval_group_i$gwas_group[1],
                        Target = targ_pop_i,
                        R = meta_res_eval_i$es,
                        SE = sqrt(meta_res_eval_i$var))
      
      meta_res_eval <- rbind(meta_res_eval, tmp)
    }
  }
}

meta_res_eval$Model<-factor(meta_res_eval$Model, levels=c('IndivTune','SumStatTune','Multi-IndivTune','Multi-SumStatTune'))
meta_res_eval$Discovery<-factor(meta_res_eval$Discovery, levels=c('AFR','EAS','EUR','EUR+AFR','EUR+EAS'))

# Plot average performance across phenotypes for AFR and EAS targets
tmp <- meta_res_eval
tmp <- tmp[tmp$Target != 'EUR',]
tmp <- merge(tmp, pgs_method_labels, by.x = 'Method', by.y = 'method', all.x = T)
tmp$label[is.na(tmp$label)] <- 'All'
tmp$label[grepl('Multi', tmp$Model) & !(tmp$Method %in% pgs_group_methods) & tmp$label != 'All'] <- paste0(tmp$label[grepl('Multi', tmp$Model) & !(tmp$Method %in% pgs_group_methods) & tmp$label != 'All'], '-multi')
tmp$label <- factor(tmp$label, levels = model_order)
tmp$Discovery_clean <- as.character(tmp$Discovery)
tmp$Discovery_clean[tmp$Discovery == 'EUR'] <- 'EUR GWAS'
tmp$Discovery_clean[tmp$Discovery != 'EUR' & tmp$Source == 'Single'] <- 'Target-matched GWAS'
tmp$Discovery_clean[tmp$Discovery != 'EUR' & tmp$Source == 'Multi'] <- 'Both'
tmp$Discovery_clean <- factor(tmp$Discovery_clean, 
                              levels = c('Target-matched GWAS',
                                         'EUR GWAS',
                                         'Both'))
tmp$Target <- paste0(tmp$Target, ' Target')
tmp$Model[tmp$Model != 'SumStatTune'] <- 'IndivTune'
tmp$Model[tmp$Model == 'SumStatTune'] <- 'SumStatTune'
tmp <- tmp[!duplicated(tmp[, c('label','Target','Discovery_clean','Model'), with=F]),]

png(paste0('~/oliverpainfel/Analyses/crosspop/sensitivity_1kgrefplots/average_r.png'), res=300, width = 3200, height = 2000, units = 'px')
ggplot(tmp, aes(x=label, y=R , fill = Model)) +
    geom_errorbar(aes(ymin = R - SE, ymax = R + SE),
                  width = 0,
                  position = position_dodge(width = 1)) +
    geom_point(stat="identity", position=position_dodge(1), size=3, shape=23) +
    geom_vline(xintercept = seq(1.5, length(unique(tmp$label))), linetype="dotted") +
    labs(y = "R (SE)", x='Method') +
    facet_grid(Target ~ Discovery_clean, scales='free', space = 'free_x') +
    theme_half_open() +
    background_grid(major = 'y', minor = 'y') + 
    panel_border() + 
    theme(axis.text.x = element_text(angle = 45, vjust = 1, hjust=1),
          legend.position = "top",
          legend.key.spacing.x = unit(1, "cm"),
          legend.justification = "center")
dev.off()

# The results look very similar to those obtained using the 1KG+HGDP reference. 

###################
# Plot a comparison between the runs using different references

# Read in results using 1KG+HGDP reference
main_results<-fread('~/oliverpainfel/Analyses/crosspop/r_eval.csv')
sens_results<-meta_res_eval

tmp <- main_results
tmp <- tmp[tmp$Target != 'EUR',]
tmp <- merge(tmp, pgs_method_labels, by.x = 'Method', by.y = 'method', all.x = T)
tmp$label[is.na(tmp$label)] <- 'All'
tmp$label[grepl('Multi', tmp$Model) & !(tmp$Method %in% pgs_group_methods) & tmp$label != 'All'] <- paste0(tmp$label[grepl('Multi', tmp$Model) & !(tmp$Method %in% pgs_group_methods) & tmp$label != 'All'], '-multi')
tmp$label <- factor(tmp$label, levels = model_order)
tmp$Discovery_clean <- as.character(tmp$Discovery)
tmp$Discovery_clean[tmp$Discovery == 'EUR'] <- 'EUR GWAS'
tmp$Discovery_clean[tmp$Discovery != 'EUR' & tmp$Source == 'Single'] <- 'Target-matched GWAS'
tmp$Discovery_clean[tmp$Discovery != 'EUR' & tmp$Source == 'Multi'] <- 'Both'
tmp$Discovery_clean <- factor(tmp$Discovery_clean, 
                              levels = c('Target-matched GWAS',
                                         'EUR GWAS',
                                         'Both'))
tmp$Target <- paste0(tmp$Target, ' Target')
tmp$Model[tmp$Model != 'SumStatTune'] <- 'IndivTune'
tmp$Model[tmp$Model == 'SumStatTune'] <- 'SumStatTune'
main_results <- tmp[!duplicated(tmp[, c('label','Target','Discovery_clean','Model'), with=F]),]

tmp <- sens_results
tmp <- tmp[tmp$Target != 'EUR',]
tmp <- merge(tmp, pgs_method_labels, by.x = 'Method', by.y = 'method', all.x = T)
tmp$label[is.na(tmp$label)] <- 'All'
tmp$label[grepl('Multi', tmp$Model) & !(tmp$Method %in% pgs_group_methods) & tmp$label != 'All'] <- paste0(tmp$label[grepl('Multi', tmp$Model) & !(tmp$Method %in% pgs_group_methods) & tmp$label != 'All'], '-multi')
tmp$label <- factor(tmp$label, levels = model_order)
tmp$Discovery_clean <- as.character(tmp$Discovery)
tmp$Discovery_clean[tmp$Discovery == 'EUR'] <- 'EUR GWAS'
tmp$Discovery_clean[tmp$Discovery != 'EUR' & tmp$Source == 'Single'] <- 'Target-matched GWAS'
tmp$Discovery_clean[tmp$Discovery != 'EUR' & tmp$Source == 'Multi'] <- 'Both'
tmp$Discovery_clean <- factor(tmp$Discovery_clean, 
                              levels = c('Target-matched GWAS',
                                         'EUR GWAS',
                                         'Both'))
tmp$Target <- paste0(tmp$Target, ' Target')
tmp$Model[tmp$Model != 'SumStatTune'] <- 'IndivTune'
tmp$Model[tmp$Model == 'SumStatTune'] <- 'SumStatTune'
sens_results <- tmp[!duplicated(tmp[, c('label','Target','Discovery_clean','Model'), with=F]),]

main_results<-main_results[main_results$Method %in% sens_results$Method,]
main_results<-main_results[main_results$Target %in% sens_results$Target,]

sens_results$Reference <- '1KG'
main_results$Reference <- '1KG+HGDP'

both_results <- rbind(main_results, sens_results)

png('~/oliverpainfel/Analyses/crosspop/sensitivity_1kgrefplots/comparison_to_main_result.png', units = 'px', res = 300, width=4000, height=2500)
ggplot(both_results, aes(x=label, y=R , fill = Model)) +
    geom_errorbar(aes(ymin = R - SE, ymax = R + SE),
                  width = 0,
                  position = position_dodge(width = 1)) +
    geom_point(stat="identity", position=position_dodge(1), size=3, shape=23) +
    geom_vline(xintercept = seq(1.5, length(unique(tmp$label))), linetype="dotted") +
    labs(y = "R (SE)", x='Method') +
    facet_grid(Target ~ Discovery_clean + Reference, scales='free', space = 'free_x') +
    theme_half_open() +
    background_grid(major = 'y', minor = 'y') + 
    panel_border() + 
    theme(axis.text.x = element_text(angle = 45, vjust = 1, hjust=1),
          legend.position = "top",
          legend.key.spacing.x = unit(1, "cm"),
          legend.justification = "center")
dev.off()
Show results


Using three GWAS

Extend the analysis to include gwas_groups combining the AFR, EAS, and EUR GWAS. Only some multi-source methods are applicable here, including LEOPARD, PRS-CSx, and X-Wing. Given that X-Wing with LEOPARD is slow, we limit the X-Wing analysis to the IndivTune model alone.
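As a minimal sketch of the three-population gwas_groups format (assuming 'bmi' is among the selected traits; the full construction over all traits appears in the configuration code below), each group simply lists the UKB, BBJ, and UGR GWAS for one trait:

# Sketch only: one example row of the three-population gwas_groups file
example_group <- data.frame(
  name  = 'bmi_UKB_BBJ_UGR',
  gwas  = 'bmi_UKB,bmi_BBJ,bmi_UGR',
  label = '"bmi (UKB+BBJ+UGR)"'
)
example_group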


PGS calculation

Show code

Prepare configuration

library(data.table)

# Subset original gwas_list to include selected traits
gwas_list<-fread('/users/k1806347/oliverpainfel/Data/ukb/GenoPred/configs/crosspop/gwas_list_all.txt')
pheno<-gsub('_.*','', gwas_list$name)
selected_traits<-fread('/users/k1806347/oliverpainfel/Analyses/crosspop/trait_subset.txt', header=F)$V1
gwas_list<-gwas_list[pheno %in% selected_traits,]
gwas_list$label<-paste0('"', gwas_list$label, '"')

######
# gwas_groups
######

gwas_groups_three_pop<-data.frame(
  name=paste0(selected_traits, '_UKB_BBJ_UGR'),
  gwas=sapply(selected_traits, function(x) paste0(x,'_UKB,',x,'_BBJ,',x,'_UGR')),
  label=paste0('"', selected_traits, " (UKB+BBJ+UGR)", '"')
)

write.table(gwas_groups_three_pop, '/users/k1806347/oliverpainfel/Data/ukb/GenoPred/configs/crosspop/gwas_groups_three_pop.txt', col.names = T, row.names = F, quote = F)

######
# config
######

config<-c(
  "outdir: /users/k1806347/oliverpainfel/Data/ukb/GenoPred/output",
  "config_file: /users/k1806347/oliverpainfel/Data/ukb/GenoPred/configs/crosspop/config_three_pop.yaml",
  "gwas_list: /users/k1806347/oliverpainfel/Data/ukb/GenoPred/configs/crosspop/gwas_list.txt",
  "target_list: /users/k1806347/oliverpainfel/Data/ukb/GenoPred/configs/basic/target_list.txt",
  "gwas_groups: /users/k1806347/oliverpainfel/Data/ukb/GenoPred/configs/crosspop/gwas_groups_three_pop.txt",
  "pgs_methods: ['ptclump','quickprs','dbslmm','lassosum','megaprs','prscs','ldpred2','sbayesrc','prscsx']", # xwing removed for time sake
  "leopard_methods: ['ptclump','quickprs','dbslmm','lassosum','megaprs','prscs','ldpred2','sbayesrc']",
  "cores_prep_pgs: 10", # xwing run with 20 cores
  "cores_target_pgs: 50",
  "ldpred2_inference: F",
  "ldpred2_ldref: /users/k1806347/oliverpainfel/Data/hgdp_1kg/ldpred2/hm3",
  "quickprs_ldref: /users/k1806347/oliverpainfel/Data/hgdp_1kg/quickprs/hm3",
  "quickprs_multi_ldref: /users/k1806347/oliverpainfel/Data/hgdp_1kg/quickprs/hm3_subset",
  "sbayesrc_ldref: /users/k1806347/oliverpainfel/Data/hgdp_1kg/sbayesrc/hm3"
)

write.table(config, '/users/k1806347/oliverpainfel/Data/ukb/GenoPred/configs/crosspop/config_three_pop.yaml', col.names = F, row.names = F, quote = F)

Run pipeline

snakemake \
  --profile slurm \
  --use-conda \
  --configfile=/users/k1806347/oliverpainfel/Data/ukb/GenoPred/configs/crosspop/config_three_pop.yaml \
  target_pgs  -n

PGS evaluation

Show code

Create predictor list

setwd('~/oliverpainfel/Software/MyGit/GenoPred/pipeline/')
source('../functions/misc.R')
source_all('../functions')
library(data.table)

# Get some key variables from config
config_2_pop<-'/users/k1806347/oliverpainfel/Data/ukb/GenoPred/configs/crosspop/config.yaml'
config_3_pop<-'/users/k1806347/oliverpainfel/Data/ukb/GenoPred/configs/crosspop/config_three_pop.yaml'
pgs_methods <- read_param(config = config_3_pop, param = 'pgs_methods', return_obj = F)
outdir <- read_param(config = config_3_pop, param = 'outdir', return_obj = F)

# Read in list of outcomes 
selected_traits<-fread('/users/k1806347/oliverpainfel/Analyses/crosspop/trait_subset.txt', header=F)$V1

# Get a list of score files
scores_2_pop <- list_score_files(config_2_pop)
scores_3_pop <- list_score_files(config_3_pop)
scores <- rbind(scores_2_pop, scores_3_pop)
scores <- scores[!duplicated(scores),]

# Remove xwing 
scores <- scores[scores$method != 'xwing', ]

# Create files for EAS and AFR targets
targ_pop <- c('EAS','AFR')
for(trait_i in selected_traits){
  scores_j <- scores[grepl(trait_i, scores$name),]
  scores_j$multi <- scores_j$method
  
  for(targ_pop_i in targ_pop){
    disc_pop_j <- c('UGR','BBJ','UKB')
    disc_pop_j_2 <- c('AFR','EAS','EUR')

    dir.create(
      paste0(
        '/users/k1806347/oliverpainfel/Analyses/crosspop/targ_',
        targ_pop_i,
        '.disc_', paste(disc_pop_j_2, collapse = '_'),
        '/',
        trait_i
      ),
      recursive = T
    )
    
    # Insert path to score file
    scores_i <- scores_j[!grepl(paste0('UKB_', disc_pop_j[!(disc_pop_j_2 %in% c('EUR', targ_pop_i))], '$'), scores_j$name),]
    
    scores_i$predictor <- paste0(
      outdir,
      '/ukb/pgs/TRANS/',
      scores_i$method,
      '/',
      scores_i$name,
      '/ukb-',
      scores_i$name,
      '-TRANS.profiles'
    )
    
    ####
    # Make groups single source methods
    ####
    
    scores_i_single_top1 <-
      scores_i[!(scores_i$method %in% pgs_group_methods) &
                   !grepl('_multi$', scores_i$method), ]

    # Create top1 column indicating which predictors the top1 models should be derived from
    scores_i_single_top1$top1[grepl('UKB', scores_i_single_top1$name, ignore.case = F)] <- 'EUR'
    scores_i_single_top1$top1[grepl('BBJ', scores_i_single_top1$name, ignore.case = F)] <- 'EAS'
    scores_i_single_top1$top1[grepl('UGR', scores_i_single_top1$name, ignore.case = F)] <- 'AFR'
        
    ####
    # Make groups containing pseudo scores for single source methods
    ####

    # Extract the pseudo score for each method and specify as a separate group
    # This can be skipped as it was done before
    for(i in 1:nrow(scores_i_single_top1)) {
      param <- find_pseudo(
        config = ifelse(scores_i_single_top1$name[i] %in% scores_2_pop$name, config_2_pop, config_3_pop),
        gwas = scores_i_single_top1$name[i],
        pgs_method = scores_i_single_top1$method[i],
        target_pop = targ_pop_i
      )
      
      score_header <-
        fread(scores_i_single_top1$predictor[i], nrows = 1)
      score_cols <-
        which(names(score_header) %in% c('FID', 'IID', paste0(scores_i_single_top1$name[i], '_', param)))
      
      system(
        paste0(
          "cut -d' ' -f ", 
          paste0(score_cols, collapse=','),
          " ", 
          scores_i_single_top1$predictor[i], 
          " > ", 
          gsub('.profiles',
               paste0('.', targ_pop_i, '_pseudo.profiles'),
               scores_i_single_top1$predictor[i])
        )
      )
    }
    
    scores_i_single_pseudo <- scores_i_single_top1
    scores_i_single_pseudo$multi <- paste0(scores_i_single_pseudo$multi, '.pseudo')

    scores_i_single_pseudo$predictor <- gsub('.profiles', 
                                  paste0('.', targ_pop_i, '_pseudo.profiles'),
                                  scores_i_single_pseudo$predictor)

    ####
    # Make groups for multi-single-source pseudo scores
    ####
    
    scores_i_multi_single_pseudo <- scores_i[grepl('_multi$', scores_i$method),]

    # Extract the pseudo score for each method and specify as a separate group
    for(i in 1:nrow(scores_i_multi_single_pseudo)) {
      param <- find_pseudo(
        config = ifelse(scores_i_multi_single_pseudo$name[i] %in% scores_2_pop$name, config_2_pop, config_3_pop),
        gwas = scores_i_multi_single_pseudo$name[i],
        pgs_method = scores_i_multi_single_pseudo$method[i],
        target_pop = targ_pop_i
      )
      
      score_header <-
        fread(scores_i_multi_single_pseudo$predictor[i], nrows = 1)
      score_cols <-
        which(names(score_header) %in% c('FID', 'IID', paste0(scores_i_multi_single_pseudo$name[i], '_', param)))
      
      system(
        paste0(
          "cut -d' ' -f ", 
          paste0(score_cols, collapse=','),
          " ", 
          scores_i_multi_single_pseudo$predictor[i], 
          " > ", 
          gsub('.profiles',
               paste0('.', targ_pop_i, '_pseudo.profiles'),
               scores_i_multi_single_pseudo$predictor[i])
        )
      )
    }
    
    scores_i_multi_single_pseudo$multi <- paste0(scores_i_multi_single_pseudo$multi, '.pseudo')

    scores_i_multi_single_pseudo$predictor <- gsub('.profiles', 
                                  paste0('.', targ_pop_i, '_pseudo.profiles'),
                                  scores_i_multi_single_pseudo$predictor)
    
    scores_i_multi_single_pseudo$top1<-paste(disc_pop_j_2, collapse = '_')

    ####
    # Make groups for the Multi-Source methods
    ####
    
    scores_i_multi <- scores_i[(scores_i$method %in% pgs_group_methods),]

    # Split top1 scores by target population
    # This doesn't apply to xwing because it only has pop-specific pseudo scores
    scores_i_multi_top1<-NULL
    for(i in which(scores_i_multi$method %in% c('prscsx'))){
      score_header<-fread(scores_i_multi$predictor[i], nrow = 1)
      
      pops <- gsub(paste0(trait_i, '_'),'', scores_i_multi$name[i])
      pops <- unlist(strsplit(pops, '_'))
      pops <- disc_pop_j_2[disc_pop_j %in% pops]

      for(pop in pops){
        
        if(scores_i_multi$method[i] == 'prscsx'){
          score_cols <-
            which(grepl(paste0('^FID$|^IID$|_', pop, '_phi'), names(score_header)))
        }
        if(scores_i_multi$method[i] == 'xwing'){
          score_cols <-
            which(grepl(paste0('^FID$|^IID$|_targ_', pop, '_pst'), names(score_header)))
        }
        
        system(
          paste0(
            "cut -d' ' -f ", 
            paste0(score_cols, collapse=','),
            " ", 
            scores_i_multi$predictor[i], 
            " > ", 
            gsub('.profiles',
                 paste0('.', pop, '_grid.profiles'),
                 scores_i_multi$predictor[i])
          )
        )
        
        tmp <- scores_i_multi[i,]
        tmp$multi <- paste0(tmp$multi, '.grid')
        tmp$top1 <- pop
        tmp$predictor <-
            gsub('.profiles',
                 paste0('.', pop, '_grid.profiles'),
                 scores_i_multi$predictor[i])
        
        scores_i_multi_top1 <- rbind(scores_i_multi_top1, tmp)
      }
    }

    # Split pop-specific pseudo scores by target population
    scores_i_multi_pop_pseudo<-NULL
    for(i in 1:nrow(scores_i_multi)){
      score_header<-fread(scores_i_multi$predictor[i], nrow = 1)
      
      pops <- gsub(paste0(trait_i, '_'),'', scores_i_multi$name[i])
      pops <- unlist(strsplit(pops, '_'))
      pops <- disc_pop_j_2[disc_pop_j %in% pops]

      for(pop in pops){
        if(scores_i_multi$method[i] == 'prscsx'){
          score_cols <-
            which(grepl(paste0('^FID$|^IID$|_', pop, '_phi_auto'), names(score_header)))
        }
        if(scores_i_multi$method[i] == 'xwing'){
          score_cols <-
            which(grepl(paste0('^FID$|^IID$|_targ_', pop, '_pst_', pop), names(score_header)))
        }
        
        system(
          paste0(
            "cut -d' ' -f ", 
            paste0(score_cols, collapse=','),
            " ", 
            scores_i_multi$predictor[i], 
            " > ", 
            gsub('.profiles',
                 paste0('.', pop, '_pseudo.profiles'),
                 scores_i_multi$predictor[i])
          )
        )
        
        tmp <- scores_i_multi[i,]
        tmp$multi <- paste0(tmp$multi, '.pop_pseudo')
        tmp$top1 <- pop
        tmp$predictor <-
            gsub('.profiles',
                 paste0('.', pop, '_pseudo.profiles'),
                 scores_i_multi$predictor[i])
        
        scores_i_multi_pop_pseudo <- rbind(scores_i_multi_pop_pseudo, tmp)
      }
    }
    
    # Create pseudo score for multi-source methods
    scores_i_multi_pseudo<-NULL
    for(i in 1:nrow(scores_i_multi)) {
      param <- find_pseudo(
        config = ifelse(scores_i_multi$name[i] %in% scores_2_pop$name, config_2_pop, config_3_pop),
        gwas = scores_i_multi$name[i],
        pgs_method = scores_i_multi$method[i],
        target_pop = targ_pop_i
      )
      
      score_header <-
        fread(scores_i_multi$predictor[i], nrows = 1)
      score_cols <-
        which(names(score_header) %in% c('FID', 'IID', paste0(scores_i_multi$name[i], '_', param)))

      system(
        paste0(
          "cut -d' ' -f ", 
          paste0(score_cols, collapse=','),
          " ", 
          scores_i_multi$predictor[i], 
          " > ", 
          gsub('.profiles',
               paste0('.pseudo.targ_', targ_pop_i,'.profiles'),
               scores_i_multi$predictor[i])
        )
      )
      
      tmp <- scores_i_multi[i,]
      tmp$multi <- paste0(tmp$multi, '.pseudo')
      tmp$top1 <- paste(disc_pop_j_2, collapse = '_')
      tmp$predictor <-
          gsub('.profiles',
               paste0('.pseudo.targ_', targ_pop_i,'.profiles'),
               scores_i_multi$predictor[i])
      
      scores_i_multi_pseudo <- rbind(scores_i_multi_pseudo, tmp)
    }
    
    ####
    # Combine the different predictor groups
    ####
    predictors_i<- do.call(rbind, list(
      scores_i_single_top1, 
      scores_i_single_pseudo, 
      scores_i_multi_single_pseudo,
      scores_i_multi_top1,
      scores_i_multi_pop_pseudo,
      scores_i_multi_pseudo
    ))
    
    ####
    # Make a group that combines all population-specific PGS
    ####
    
    predictors_i_all <- predictors_i[predictors_i$top1 %in% c('EUR','AFR','EAS'),]
    predictors_i_all$multi <- 'all'
    predictors_i<-rbind(predictors_i, predictors_i_all)
    
    ####
    # Split into pairwise groups (2 pop groups)
    ####
    
    afr_eur <- predictors_i[!grepl('BBJ', predictors_i$name),]
    afr_eur$multi <- paste0(afr_eur$multi, '.EUR_AFR')
    afr_eur$top1[afr_eur$top1 == 'AFR_EAS_EUR'] <- 'AFR_EUR'
    
    eas_eur <- predictors_i[!grepl('UGR', predictors_i$name),]
    eas_eur$multi <- paste0(eas_eur$multi, '.EUR_EAS')
    eas_eur$top1[eas_eur$top1 == 'AFR_EAS_EUR'] <- 'EAS_EUR'

    one_or_three <- predictors_i[!grepl('UKB_BBJ$', predictors_i$name) &
                                   !grepl('UKB_UGR$', predictors_i$name),]
    
    predictors_clean <- do.call(rbind, list(
      afr_eur, eas_eur, one_or_three
    ))
    predictors_clean <- predictors_clean[, c('predictor', 'multi','top1'), with=F]

        
    write.table(
      predictors_clean,
      paste0(
        '/users/k1806347/oliverpainfel/Analyses/crosspop/targ_',
        targ_pop_i,
        '.disc_', paste(disc_pop_j_2, collapse = '_'),
        '/',
        trait_i,
        '/predictor_list.txt'
      ),
      col.names = T,
      row.names = F,
      quote = F
    )
  }
}

Run model_builder

cd /users/k1806347/oliverpainfel/Software/MyGit/GenoPred/pipeline
conda activate model_builder

#rm /users/k1806347/oliverpainfel/Analyses/crosspop/targ_*.disc_AFR_EAS_EUR/*/res*

for targ_pop in $(echo EAS AFR); do
  if [ "$targ_pop" == "EUR" ]; then
      targ_pop2="EUR_test"
  else
      targ_pop2=$targ_pop
  fi
  
  for pheno in $(cat /users/k1806347/oliverpainfel/Analyses/crosspop/trait_subset.txt); do
    if [ ! -f "/users/k1806347/oliverpainfel/Analyses/crosspop/targ_${targ_pop}.disc_AFR_EAS_EUR/${pheno}/res.pred_comp.txt" ]; then
      sbatch --mem 10G -n 1 -p neurohack_cpu,interruptible_cpu -t 1:00:00 --wrap="Rscript ../Scripts/model_builder/model_builder_top1.R \
        --outcome /users/k1806347/oliverpainfel/Data/ukb/phenotypes/prscsx/${pheno}.unrel.${targ_pop2}.row_number.txt \
        --predictors /users/k1806347/oliverpainfel/Analyses/crosspop/targ_${targ_pop}.disc_AFR_EAS_EUR/${pheno}/predictor_list.txt \
        --out /users/k1806347/oliverpainfel/Analyses/crosspop/targ_${targ_pop}.disc_AFR_EAS_EUR/${pheno}/res \
        --n_core 1"
    fi
  done
done

Plot results

setwd('/users/k1806347/oliverpainfel/Software/MyGit/GenoPred/pipeline/')

library(data.table)
library(ggplot2)
library(cowplot)

source('../functions/misc.R')
source_all('../functions')

# Read in list of outcomes 
selected_traits<-fread('/users/k1806347/oliverpainfel/Analyses/crosspop/trait_subset.txt', header=F)$V1
info_all <- fread('~/oliverpainfel/Analyses/crosspop/gwas_descriptives.csv')

# Calculate correlation between all phenotypes in each target population
cors <- list()
for(pop_i in c('EUR','EAS','AFR','CSA','AMR')){
  if(pop_i == 'EUR'){
    pop_i_2 <- 'EUR_test'
  } else {
    pop_i_2 <- pop_i
  }
  pheno_pop_i <- list()
  for(pheno_i in selected_traits){
    pheno_pop_i[[pheno_i]] <- fread(paste0('/users/k1806347/oliverpainfel/Data/ukb/phenotypes/prscsx/', pheno_i, '.unrel.', pop_i_2, '.row_number.txt'))
    names(pheno_pop_i[[pheno_i]])[3] <- pheno_i
  }
  
  pheno_pop_i_merged <- Reduce(function(x, y) merge(x, y, all = TRUE, by = c('FID','IID')), pheno_pop_i)

  cors_i <- abs(cor(as.matrix(pheno_pop_i_merged[,-1:-2, with=F]), use='p'))
  cors[[pop_i]] <- cors_i
}

# Read in results
targ_pop = c('EAS','AFR')
res_eval <- list()
for(pheno_i in selected_traits){
  res_eval_i<-NULL
  for(targ_pop_i in targ_pop){
    if(targ_pop_i == 'EAS'){
      disc_pop <- 'EAS'
    }
    if(targ_pop_i == 'AFR'){
      disc_pop <- 'AFR'
    }
    eval_i <-
      fread(
        paste0(
          '/users/k1806347/oliverpainfel/Analyses/crosspop/',
          'targ_',
          targ_pop_i,
          '.disc_AFR_EAS_EUR/',
          pheno_i,
          '/res.pred_eval.txt'
        )
      )
    eval_i$Target<-targ_pop_i
    eval_i$gwas_group <- 'EUR+AFR+EAS'
    res_eval_i<-rbind(res_eval_i, eval_i)
  }
  
  res_eval_i$Method<-sub('\\..*','',res_eval_i$Group)
  res_eval_i$Method<-gsub('-.*','', res_eval_i$Method)
  
  res_eval_i$Model[grepl('top1$', res_eval_i$Group) &
                   !grepl('pseudo', res_eval_i$Group)]<-'IndivTune'
  res_eval_i$Model[grepl('top1$', res_eval_i$Group) &
                   grepl('pseudo', res_eval_i$Group)]<-'SumStatTune'
  res_eval_i$Model[grepl('multi$', res_eval_i$Group) &
                   !grepl('pseudo', res_eval_i$Group)]<-'Multi-IndivTune'
  res_eval_i$Model[grepl('multi$', res_eval_i$Group) &
                   grepl('pseudo', res_eval_i$Group)]<-'Multi-SumStatTune'
  
  res_eval_i$Model[grepl('_multi', res_eval_i$Group)]<-'SumStatTune'
  res_eval_i$Model[grepl('prscsx.pseudo', res_eval_i$Group)]<-'SumStatTune'

  res_eval_i$Source<-ifelse(
    res_eval_i$Method %in% pgs_group_methods | grepl('_multi$', res_eval_i$Method) | 
    !grepl('-EUR|-EAS|-AFR', res_eval_i$Group), 'Multi', 'Single')
  
  res_eval_i$Group <- gsub('\\.multi', '-multi', res_eval_i$Group)
  res_eval_i$Group_short <- gsub('.*\\.', '', gsub('-.*', '', res_eval_i$Group)) 
  res_eval_i$n_gwas <- 3
  res_eval_i$n_gwas[grepl('EUR_', res_eval_i$Group_short)] <- 2
  res_eval_i$n_gwas[res_eval_i$Source == 'Single'] <- 1

  res_eval_i$Discovery[grepl('-EUR', res_eval_i$Group)] <- 'EUR'
  res_eval_i$Discovery[grepl('-EAS', res_eval_i$Group)] <- 'EAS'
  res_eval_i$Discovery[grepl('-AFR', res_eval_i$Group)] <- 'AFR'
  res_eval_i$Discovery[res_eval_i$Source == 'Multi'] <- res_eval_i$gwas_group[res_eval_i$Source == 'Multi']
  res_eval_i$Discovery[res_eval_i$n_gwas == 2] <- gsub('_', '+', res_eval_i$Group_short[res_eval_i$n_gwas == 2])
  
  res_eval_i$Method<-factor(res_eval_i$Method, levels=unique(res_eval_i$Method))
  res_eval_i$Model<-factor(res_eval_i$Model, levels=c('IndivTune','SumStatTune','Multi-IndivTune','Multi-SumStatTune'))
  res_eval_i$Discovery<-factor(res_eval_i$Discovery, levels=c('AFR','EAS','EUR','EUR+AFR','EUR+EAS','EUR+AFR+EAS'))

  # Remove IndivTune and Multi-IndivTune model for groups that contain one score (aka QuickPRS and SBayesRC)
  res_eval_i <- res_eval_i[
    !(res_eval_i$Method %in%  c('quickprs','sbayesrc') & 
      res_eval_i$Model %in% c('IndivTune','Multi-IndivTune')),]
  
  # Remove pseudo model for methods that don't really have one 
  res_eval_i <- res_eval_i[
    !(res_eval_i$Method %in%  c('ptclump','ptclump_multi') & 
      res_eval_i$Model %in% c('SumStatTune','Multi-SumStatTune')),]

  # Remove top1 models for *-Multi, PRS-CSx, X-wing
  res_eval_i <- res_eval_i[
    !((res_eval_i$Method %in%  c('prscsx', 'xwing') | grepl('_multi$', res_eval_i$Method)) & 
      grepl('top1', res_eval_i$Group)),]
  
  # Remove any duplicate models
  res_eval_i <- res_eval_i[!duplicated(res_eval_i[, c(
    "Target", "Method", "Model", "Source", "Discovery","gwas_group"
  )]),]
  
  res_eval[[pheno_i]]<-res_eval_i
  
}

# Create vector defining the order of methods in plots
model_order <- c("DBSLMM", "lassosum", "LDpred2", "MegaPRS", "PRS-CS", "pT+clump", "QuickPRS", "SBayesRC", "DBSLMM-multi", "lassosum-multi", "LDpred2-multi", "MegaPRS-multi", "PRS-CS-multi", "pT+clump-multi", "QuickPRS-multi", "SBayesRC-multi", "PRS-CSx", "X-Wing", "All") 

res_eval_simp <- NULL
for(pheno_i in selected_traits){
  tmp <- res_eval[[pheno_i]]
  tmp$Trait <- pheno_i
  
  # Insert nice PGS method names
  tmp <- merge(tmp, pgs_method_labels, by.x = 'Method', by.y = 'method', all.x = T)
  tmp$label[is.na(tmp$label)] <- 'All'
  tmp$label[grepl('Multi', tmp$Model) & !(tmp$Method %in% pgs_group_methods) & tmp$label != 'All'] <- paste0(tmp$label[grepl('Multi', tmp$Model) & !(tmp$Method %in% pgs_group_methods) & tmp$label != 'All'], '-multi')
  tmp$label <- factor(tmp$label, levels = model_order)
  
  # Simplify result to either SumStatTune or IndivTune
  tmp$Model[tmp$Model != 'SumStatTune'] <- 'IndivTune'
  tmp$Model[tmp$Model == 'SumStatTune'] <- 'SumStatTune'
  tmp <- tmp[!duplicated(tmp[, c('label','Target','Discovery','Model'), with=F]),]
  
  res_eval_simp <- rbind(res_eval_simp, tmp)
}

# Plot results for each phenotype separately
dir.create('~/oliverpainfel/Analyses/crosspop/plots_three_pop')

for(pheno_i in selected_traits){
  tmp <- res_eval_simp[res_eval_simp$Trait == pheno_i,]

  # Remove single GWAS results
  tmp <- tmp[tmp$n_gwas != 1,]
  
  # Restrict to target matched + EUR and All GWAS
  tmp <- tmp[!(tmp$Target == 'AFR' & tmp$Discovery == 'EUR+EAS'),]
  tmp <- tmp[!(tmp$Target == 'EAS' & tmp$Discovery == 'EUR+AFR'),]
  tmp$Discovery_clean <- as.character(tmp$Discovery)
  tmp$Discovery_clean[(tmp$Target == 'EAS' & tmp$Discovery == 'EUR+EAS')]<-'Target-matched + EUR GWAS'
  tmp$Discovery_clean[(tmp$Target == 'AFR' & tmp$Discovery == 'EUR+AFR')]<-'Target-matched + EUR GWAS'
  tmp$Discovery_clean[tmp$Discovery == 'EUR+AFR+EAS']<-'AFR + EAS + EUR GWAS'
  tmp$Discovery_clean <- factor(tmp$Discovery_clean, levels = c(
    'Target-matched + EUR GWAS', 'AFR + EAS + EUR GWAS'
  ))
  
  tmp$Target <- paste0(tmp$Target, ' Target')

  png(paste0('~/oliverpainfel/Analyses/crosspop/plots_three_pop/', pheno_i,'.png'), res=300, width = 3400, height = 2000, units = 'px')
  plot_tmp<-ggplot(tmp, aes(x=label, y=R , fill = Model)) +
    geom_errorbar(aes(ymin = R - SE, ymax = R + SE),
                  width = 0,
                  position = position_dodge(width = 1)) +
    geom_point(stat="identity", position=position_dodge(1), size=3, shape=23) +
    geom_vline(xintercept = seq(1.5, length(unique(tmp$label))), linetype="dotted") +
    labs(y = "R (SE)", x=NULL, fill = NULL, title = info_all$`Trait Description`[info_all$`Trait Label` == pheno_i]) +
    facet_grid(Target ~ Discovery_clean, scales='free', space = 'free_x') +
    theme_half_open() +
    background_grid(major = 'y', minor = 'y') + 
    panel_border() + 
    theme(axis.text.x = element_text(angle = 45, vjust = 1, hjust=1),
          legend.position = "top",
          legend.key.spacing.x = unit(1, "cm"),
          legend.justification = "center")
  print(plot_tmp)
  dev.off()
}

####
# Average results across phenotypes
####

library(MAd)

# Average R across phenotypes
meta_res_eval <- NULL
for(targ_pop_i in targ_pop){
  # Subset res_eval for each scenario
  res_eval_i <- do.call(rbind, lapply(seq_along(res_eval), function(i) {
    x <- res_eval[[i]]
    x$pheno <- names(res_eval)[i]
    x <- x[x$Target == targ_pop_i]
  }))
  
  # Average results for each test across phenotypes
  # Use MAd to account for correlation between them
  res_eval_i$Sample<-'A'

  for(group_i in unique(res_eval_i$Group)){
    res_eval_group_i <- res_eval_i[res_eval_i$Group == group_i,]
    missing_pheno <-
      colnames(cors[[targ_pop_i]])[!(colnames(cors[[targ_pop_i]]) %in% unique(res_eval_group_i$pheno))]
    
    if (!all(colnames(cors[[targ_pop_i]]) %in% unique(res_eval_group_i$pheno))) {
      print(paste0(
      'Results missing for ',
        targ_pop_i,
        ' ',
        group_i,
        ' ',
        paste0(missing_pheno, collapse = ' ')
      ))
    }
    
    cors_i <- cors[[targ_pop_i]][unique(res_eval_group_i$pheno), unique(res_eval_group_i$pheno)]
    
    meta_res_eval_i <-
      agg(
        id = Sample,
        es = R,
        var = SE ^ 2,
        cor = cors_i,
        method = "BHHR",
        mod = NULL,
        data = res_eval_group_i
      )
    
    tmp <- data.table(Group = group_i,
                      Method = res_eval_group_i$Method[1],
                      Model = res_eval_group_i$Model[1],
                      Source = res_eval_group_i$Source[1],
                      Discovery = res_eval_group_i$Discovery[1],
                      gwas_group = res_eval_group_i$gwas_group[1],
                      n_gwas = res_eval_group_i$n_gwas[1],
                      Target = targ_pop_i,
                      R = meta_res_eval_i$es,
                      SE = sqrt(meta_res_eval_i$var))
    
    meta_res_eval <- rbind(meta_res_eval, tmp)
  }
}

meta_res_eval$Model<-factor(meta_res_eval$Model, levels=c('IndivTune','SumStatTune','Multi-IndivTune','Multi-SumStatTune'))
meta_res_eval$Discovery<-factor(meta_res_eval$Discovery, levels=c('AFR','EAS','EUR','EUR+AFR','EUR+EAS','EUR+AFR+EAS'))

write.csv(meta_res_eval, '~/oliverpainfel/Analyses/crosspop/r_eval_three_pop.csv', row.names = F)

# Plot average performance across phenotypes for AFR and EAS targets
tmp <- meta_res_eval
tmp <- tmp[tmp$Target != 'EUR',]
tmp <- merge(tmp, pgs_method_labels, by.x = 'Method', by.y = 'method', all.x = T)
tmp$label[is.na(tmp$label)] <- 'All'
tmp$label[grepl('Multi', tmp$Model) & !(tmp$Method %in% pgs_group_methods) & tmp$label != 'All'] <- paste0(tmp$label[grepl('Multi', tmp$Model) & !(tmp$Method %in% pgs_group_methods) & tmp$label != 'All'], '-multi')
tmp$label <- factor(tmp$label, levels = model_order)
tmp$Model[tmp$Model != 'SumStatTune'] <- 'IndivTune'
tmp$Model[tmp$Model == 'SumStatTune'] <- 'SumStatTune'
tmp <- tmp[!duplicated(tmp[, c('label','Target','Discovery','Model'), with=F]),]

# Remove single GWAS results
tmp <- tmp[tmp$n_gwas != 1,]

# Restrict to target matched + EUR and All GWAS
tmp <- tmp[!(tmp$Target == 'AFR' & tmp$Discovery == 'EUR+EAS'),]
tmp <- tmp[!(tmp$Target == 'EAS' & tmp$Discovery == 'EUR+AFR'),]
tmp$Discovery_clean <- as.character(tmp$Discovery)
tmp$Discovery_clean[(tmp$Target == 'EAS' & tmp$Discovery == 'EUR+EAS')]<-'Target-matched + EUR GWAS'
tmp$Discovery_clean[(tmp$Target == 'AFR' & tmp$Discovery == 'EUR+AFR')]<-'Target-matched + EUR GWAS'
tmp$Discovery_clean[tmp$Discovery == 'EUR+AFR+EAS']<-'AFR + EAS + EUR GWAS'
tmp$Discovery_clean <- factor(tmp$Discovery_clean, levels = c(
  'Target-matched + EUR GWAS', 'AFR + EAS + EUR GWAS'
))

tmp$Target <- paste0(tmp$Target, ' Target')

png(paste0('~/oliverpainfel/Analyses/crosspop/plots_three_pop/average_r.png'), res=300, width = 3200, height = 2000, units = 'px')
ggplot(tmp, aes(x=label, y=R , fill = Model)) +
    geom_errorbar(aes(ymin = R - SE, ymax = R + SE),
                  width = 0,
                  position = position_dodge(width = 1)) +
    geom_point(stat="identity", position=position_dodge(1), size=3, shape=23) +
    geom_vline(xintercept = seq(1.5, length(unique(tmp$label))), linetype="dotted") +
    labs(y = "R (SE)", x='Method') +
    facet_grid(Target ~ Discovery_clean, scales='free', space = 'free_x') +
    theme_half_open() +
    background_grid(major = 'y', minor = 'y') + 
    panel_border() + 
    theme(axis.text.x = element_text(angle = 45, vjust = 1, hjust=1),
          legend.position = "top",
          legend.key.spacing.x = unit(1, "cm"),
          legend.justification = "center")
dev.off()
Show results


Using external GWAS sumstats

Here we will use the GWAS summary statistics that were used in the original GenoPred paper. These GWAS come from a range of sources, often large meta-analyses, which can lead to greater misspecification in the summary statistics and thereby affect the performance of some PGS methods. The aim is to provide more confidence in the performance of SBayesRC and QuickPRS relative to other methods.

Note: I am using this opportunity to evaluate lassosum2, which is not included in other analyses in this project.


PGS calculation

We will do this using GenoPred.

Show code

Prepare configuration

We can reuse the gwas_list from the GenoPred pipeline paper, so we only need to create a new configuration file.

######
# gwas_list
######

library(data.table)
gwas_list <- fread('/scratch/prj/ukbiobank/recovered/Edinburgh_Data/usr/ollie_pain/GenoPredPipe/usr/k1806347/configs/benchmark/gwas_list.txt')

gwas_list$path <- gsub('/scratch/prj/ukbiobank/usr/ollie_pain/GenoPredPipe/usr/k1806347/gwas_sumstats/',
                       '/users/k1806347/oliverpainfel/Data/GWAS_sumstats/genopred_pipeline_paper/',
                       gwas_list$path)

gwas_list$label=paste0('"', gwas_list$label, '"')

write.table(gwas_list, '/users/k1806347/oliverpainfel/Data/ukb/GenoPred/configs/crosspop/gwas_list_meta.txt', col.names = T, row.names = F, quote = F)

######
# config
######

config<-c(
  "outdir: /users/k1806347/oliverpainfel/Data/ukb/GenoPred/output",
  "config_file: /users/k1806347/oliverpainfel/Data/ukb/GenoPred/configs/crosspop/config_meta.yaml",
  "gwas_list: /users/k1806347/oliverpainfel/Data/ukb/GenoPred/configs/crosspop/gwas_list_meta.txt",
  "target_list: /users/k1806347/oliverpainfel/Data/ukb/GenoPred/configs/basic/target_list.txt",
  "pgs_methods: ['quickprs','sbayesrc','ldpred2','ptclump','dbslmm','lassosum2']",
#  "pgs_methods: ['ptclump','quickprs','dbslmm','lassosum','megaprs','prscs','ldpred2','sbayesrc']",
  "cores_prep_pgs: 10",
  "cores_target_pgs: 50",
  "ldpred2_inference: F",
  "ldpred2_ldref: /users/k1806347/oliverpainfel/Data/hgdp_1kg/ldpred2/hm3",
  "quickprs_ldref: /users/k1806347/oliverpainfel/Data/hgdp_1kg/quickprs/hm3",
  "quickprs_multi_ldref: /users/k1806347/oliverpainfel/Data/hgdp_1kg/quickprs/hm3_subset",
  "sbayesrc_ldref: /users/k1806347/oliverpainfel/Data/hgdp_1kg/sbayesrc/hm3"
)

write.table(config, '/users/k1806347/oliverpainfel/Data/ukb/GenoPred/configs/crosspop/config_meta.yaml', col.names = F, row.names = F, quote = F)

Run pipeline

snakemake \
  --profile slurm \
  --use-conda \
  --configfile=/users/k1806347/oliverpainfel/Data/ukb/GenoPred/configs/crosspop/config_meta.yaml \
  target_pgs -n

PGS evaluation

Show code

Create predictor list

setwd('~/oliverpainfel/Software/MyGit/GenoPred/pipeline/')
source('../functions/misc.R')
source_all('../functions')
library(data.table)

# Get some key variables from config
config<-'/users/k1806347/oliverpainfel/Data/ukb/GenoPred/configs/crosspop/config_meta.yaml'
outdir <- read_param(config = config, param = 'outdir', return_obj = F)

# Read in gwas_list
gwas_list<-read_param(config = config, param = 'gwas_list')

# Create column containing the phenotypes corresponding to each GWAS
gwas_list$pheno<-tolower(gwas_list$label)
gwas_list$pheno[gwas_list$pheno == 'breast cancer']<-'bc'
gwas_list$pheno[gwas_list$pheno == 'prostate cancer']<-'pc'
gwas_list$pheno[gwas_list$pheno == 'egfr']<-'egfr,ckd'
gwas_list$pheno[gwas_list$pheno == 'urate levels']<-'urate,gout'
gwas_list$pheno[gwas_list$pheno == 'rheumatoid arthritis']<-'ra'

bin_phenos <- c('bc', 'ckd', 'gout', 'ibd', 'pc', 'ra', 'stroke', 't1d', 't2d')
con_phenos <- c('height','bmi','egfr','hba1c','urate','hdl')
phenos<-c(bin_phenos, con_phenos)

# Get a list of score files
scores <- list_score_files(config)

# Create files for EUR target
for(trait_i in phenos){
  scores_i <- scores[scores$name == gwas_list$name[grepl(paste0('^', trait_i, '$','|', ',', trait_i, '$','|', '^', trait_i, ','), gwas_list$pheno)],]

  dir.create(
    paste0(
      '/users/k1806347/oliverpainfel/Analyses/crosspop/meta/targ_EUR.disc_EUR/', trait_i
    ),
    recursive = T
  )
  
  scores_i$predictor <- paste0(
    outdir,
    '/ukb/pgs/TRANS/',
    scores_i$method,
    '/',
    scores_i$name,
    '/ukb-',
    scores_i$name,
    '-TRANS.profiles'
  )
  
  scores_i$multi<-scores_i$method
  scores_i$top1 <- 'EUR'

  # Extract the pseudo score for each method and specify as a separate group
  for(i in 1:nrow(scores_i)) {
    param <- find_pseudo(
      config = config,
      gwas = scores_i$name[i],
      pgs_method = scores_i$method[i],
      target_pop = 'EUR'
    )
    
    score_header <-
      fread(scores_i$predictor[i], nrows = 1)
    score_cols <-
      which(names(score_header) %in% c('FID', 'IID', paste0(scores_i$name[i], '_', param)))
    
    system(
      paste0(
        "cut -d' ' -f ", 
        paste0(score_cols, collapse=','),
        " ", 
        scores_i$predictor[i], 
        " > ", 
        gsub('.profiles',
             paste0('.EUR_pseudo.profiles'),
             scores_i$predictor[i])
      )
    )
  }
  
  scores_i_pseudo <- scores_i
  scores_i_pseudo$multi <- paste0(scores_i_pseudo$multi, '.pseudo')

  scores_i_pseudo$predictor <- gsub('.profiles', 
                                paste0('.EUR_pseudo.profiles'),
                                scores_i_pseudo$predictor)


  predictors_i<- do.call(rbind, list(
    scores_i, scores_i_pseudo
  ))
  
  predictors_i <- predictors_i[, c('predictor', 'top1','multi'), with=F]
  
  write.table(
    predictors_i,
    paste0(
      '/users/k1806347/oliverpainfel/Analyses/crosspop/meta/targ_EUR.disc_EUR/', trait_i, '/predictor_list.txt'
    ),
    col.names = T,
    row.names = F,
    quote = F
  )
}

########
# Prepare phenotype data
########

# Read in list of EUR in UKB
eur_keep <- fread('/users/k1806347/oliverpainfel/Data/ukb/GenoPred/output/ukb/pcs/within_sample/ukb.outlier_detection.EUR.keep')
names(eur_keep)<-c('FID','IID')

# Update row number IDs to project specific IDs
psam<-fread('/scratch/prj/ukbiobank/recovered/ukb82087/imputed/ukb82087_imp_chr1_MAF1_INFO4_v1.psam')
psam$rn<-1:nrow(psam)
psam<-psam[,c('IID','rn'), with = F]

bin_phenos <- c('bc', 'ckd', 'gout', 'ibd', 'pc', 'ra', 'stroke', 't1d', 't2d')
con_phenos <- c('height','bmi','egfr','hba1c','urate','hdl')
phenos<-c(bin_phenos, con_phenos)

dir.create('~/oliverpainfel/Data/ukb/phenotypes/benchmark')

for(i in phenos){
  # Read in pheno data
  pheno_i <- fread(paste0(
      '/scratch/prj/ukbiobank/recovered/Edinburgh_Data/usr/ollie_pain/phenotypes/',
      i,
      '.unrel.txt'
    ))
  
  names(pheno_i)<-c('IID','PHENO')
  
  # Update to row number based IDs
  pheno_i<-merge(psam[,c('IID','rn'), with=F], pheno_i, by='IID')
  pheno_i$IID<-pheno_i$rn
  pheno_i$rn<-NULL
  pheno_i$FID<-pheno_i$IID
  pheno_i<-pheno_i[, c('FID','IID','PHENO'), with=F]
  
  # Restrict to EUR
  pheno_i <- pheno_i[pheno_i$FID %in% eur_keep$FID,]
  
  # Write file
  write.table(pheno_i, paste0('~/oliverpainfel/Data/ukb/phenotypes/benchmark/', i, '.unrel.eur.txt'), row.names = F, quote = F)
}

Run model_builder

cd /users/k1806347/oliverpainfel/Software/MyGit/GenoPred/pipeline
conda activate model_builder

#rm /users/k1806347/oliverpainfel/Analyses/crosspop/meta/targ_EUR.disc_EUR/*/res*

for pheno in $(echo bc ckd gout ibd pc ra stroke t1d t2d height bmi egfr hba1c urate hdl); do
  if [ ! -f "/users/k1806347/oliverpainfel/Analyses/crosspop/meta/targ_EUR.disc_EUR/${pheno}/res.pred_comp.txt" ]; then
    sbatch --mem 10G -n 5 -p neurohack_cpu,interruptible_cpu -t 1:00:00 --wrap="Rscript ../Scripts/model_builder/model_builder_top1.R \
      --outcome /users/k1806347/oliverpainfel/Data/ukb/phenotypes/benchmark/${pheno}.unrel.eur.txt \
      --predictors /users/k1806347/oliverpainfel/Analyses/crosspop/meta/targ_EUR.disc_EUR/${pheno}/predictor_list.txt \
      --out /users/k1806347/oliverpainfel/Analyses/crosspop/meta/targ_EUR.disc_EUR/${pheno}/res \
      --n_core 5"
  fi
done

Plot results

setwd('/users/k1806347/oliverpainfel/Software/MyGit/GenoPred/pipeline/')

library(data.table)
library(ggplot2)
library(cowplot)

source('../functions/misc.R')
source_all('../functions')

bin_phenos <- c('bc', 'ckd', 'gout', 'ibd', 'pc', 'ra', 'stroke', 't1d', 't2d')
con_phenos <- c('height','bmi','egfr','hba1c','urate','hdl')
phenos<-c(bin_phenos, con_phenos)

# Calculate correlation between all phenotypes in the EUR target sample
pheno_pop_i <- list()
for(pheno_i in phenos){
  pheno_pop_i[[pheno_i]] <- fread(paste0('/users/k1806347/oliverpainfel/Data/ukb/phenotypes/benchmark/', pheno_i, '.unrel.eur.txt'))
  names(pheno_pop_i[[pheno_i]])[3] <- pheno_i
}

pheno_pop_i_merged <- Reduce(function(x, y) merge(x, y, all = TRUE, by = c('FID','IID')), pheno_pop_i)

cors <- abs(cor(as.matrix(pheno_pop_i_merged[,-1:-2, with=F]), use='p'))
cors[is.na(cors)]<-0

# Read in results
res_eval <- list()
for(pheno_i in phenos){
  eval_i <-
    fread(
      paste0(
        '/users/k1806347/oliverpainfel/Analyses/crosspop/meta/targ_EUR.disc_EUR/', pheno_i,'/res.pred_eval.txt'
      )
    )

  eval_i$Ncase <- NULL
  eval_i$Ncont <- NULL
  eval_i$R2l <- NULL
  eval_i$R2o <- NULL
  
  eval_i <- eval_i[!grepl('\\.multi', eval_i$Group),]
  
  eval_i$Method<-sub('\\..*','',eval_i$Group)
  eval_i$Method<-gsub('-.*','', eval_i$Method)
  
  eval_i$Model[grepl('top1$', eval_i$Group) &
                   !grepl('pseudo', eval_i$Group)]<-'IndivTune'
  eval_i$Model[grepl('top1$', eval_i$Group) &
                   grepl('pseudo', eval_i$Group)]<-'SumStatTune'

  eval_i$Method<-factor(eval_i$Method, levels=unique(eval_i$Method))
  eval_i$Model<-factor(eval_i$Model, levels=c('IndivTune','SumStatTune'))

  # Remove IndivTune and Multi-IndivTune model for groups that contain one score (aka QuickPRS and SBayesRC)
  eval_i <- eval_i[
    !(eval_i$Method %in%  c('quickprs','sbayesrc') & 
      eval_i$Model %in% c('IndivTune')),]
  
  # Remove pseudo model for methods that don't really have one 
  eval_i <- eval_i[
    !(eval_i$Method %in%  c('ptclump','lassosum2') & 
      eval_i$Model %in% c('SumStatTune')),]
  
  res_eval[[pheno_i]]<-eval_i
  
}

# Create vector defining the order of methods in plots
model_order <- c("DBSLMM", "lassosum",'lassosum2', "LDpred2", "MegaPRS", "PRS-CS", "pT+clump", "QuickPRS", "SBayesRC") 

res_eval_simp <- NULL
for(pheno_i in phenos){
  tmp <- res_eval[[pheno_i]]
  tmp$Trait <- pheno_i
  
  # Insert nice PGS method names
  tmp <- merge(tmp, pgs_method_labels, by.x = 'Method', by.y = 'method', all.x = T)
  tmp$label <- factor(tmp$label, levels = model_order)

  res_eval_simp <- rbind(res_eval_simp, tmp)
}

# Plot results for each phenotype separately
dir.create('~/oliverpainfel/Analyses/crosspop/plots_meta')

png(paste0('~/oliverpainfel/Analyses/crosspop/plots_meta/trait_r.png'), res=100, width = 1200, height = 900, units = 'px')
ggplot(res_eval_simp, aes(x=label, y=R , fill = Model)) +
    geom_errorbar(aes(ymin = R - SE, ymax = R + SE),
                  width = 0,
                  position = position_dodge(width = 1)) +
    geom_point(stat="identity", position=position_dodge(1), size=3, shape=23) +
    geom_vline(xintercept = seq(1.5, length(unique(res_eval_simp$label))), linetype="dotted") +
    labs(y = "R (SE)", x=NULL, fill = NULL) +
    facet_wrap(Trait ~ ., scales = 'free_y') +
    theme_half_open() +
    background_grid(major = 'y', minor = 'y') + 
    panel_border() + 
    theme(axis.text.x = element_text(angle = 45, vjust = 1, hjust=1),
          legend.position = "top",
          legend.key.spacing.x = unit(1, "cm"),
          legend.justification = "center")
dev.off()

####
# Average results across phenotypes
####

library(MAd)

# Average R across phenotypes
meta_res_eval <- NULL

# Average results for each test across phenotypes
# Use MAd to account for correlation between them
res_eval_simp$Sample<-'A'

for(group_i in unique(res_eval_simp$Group)){
  res_eval_group_i <- res_eval_simp[res_eval_simp$Group == group_i,]
  missing_pheno <- colnames(cors)[!(colnames(cors) %in% unique(res_eval_group_i$Trait))]
  
  if (!all(colnames(cors) %in% unique(res_eval_group_i$Trait))) {
    print(paste0(
      'Results missing for ',
      group_i,
      ' ',
      paste0(missing_pheno, collapse = ' ')
    ))
  }
  
  cors_i <- cors[unique(res_eval_group_i$Trait), unique(res_eval_group_i$Trait)]
  
  meta_res_eval_simp <-
    agg(
      id = Sample,
      es = R,
      var = SE ^ 2,
      cor = cors_i,
      method = "BHHR",
      mod = NULL,
      data = res_eval_group_i
    )
  
  tmp <- data.table(Group = group_i,
                    Method = res_eval_group_i$Method[1],
                    Model = res_eval_group_i$Model[1],
                    R = meta_res_eval_simp$es,
                    SE = sqrt(meta_res_eval_simp$var))
  
  meta_res_eval <- rbind(meta_res_eval, tmp)
}

meta_res_eval$Model<-factor(meta_res_eval$Model, levels=c('IndivTune','SumStatTune'))

write.csv(meta_res_eval, '~/oliverpainfel/Analyses/crosspop/meta/r_eval.csv', row.names = F)

# Plot average performance across phenotypes
tmp <- meta_res_eval
tmp <- merge(tmp, pgs_method_labels, by.x = 'Method', by.y = 'method', all.x = T)
tmp$label <- factor(tmp$label, levels = model_order)

png(paste0('~/oliverpainfel/Analyses/crosspop/plots_meta/average_r.png'), res=100, width = 500, height = 300, units = 'px')
ggplot(tmp, aes(x=label, y=R , fill = Model)) +
    geom_errorbar(aes(ymin = R - SE, ymax = R + SE),
                  width = 0,
                  position = position_dodge(width = 1)) +
    geom_point(stat="identity", position=position_dodge(1), size=3, shape=23) +
    geom_vline(xintercept = seq(1.5, length(unique(tmp$label))), linetype="dotted") +
    labs(y = "R (SE)", x='Method') +
    theme_half_open() +
    background_grid(major = 'y', minor = 'y') + 
    panel_border() + 
    theme(axis.text.x = element_text(angle = 45, vjust = 1, hjust=1),
          legend.position = "top",
          legend.key.spacing.x = unit(1, "cm"),
          legend.justification = "center")
dev.off()

# Results and conclusions remain consistent.
Show average results

Show trait-specific results


Using downsampled GWAS

The performance of methods appears to vary between the EAS and AFR datasets, which could be due to differences in GWAS sample size. To explore this, let's run the methods on EUR GWAS generated using the UKB training subset, downsampled to a range of sample sizes.
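As a quick sketch of the downsampling scheme used below: sizes are specified in thousands of individuals, and a 'K' suffix is later appended to the corresponding GWAS names in the subsampled gwas_list.

# Sketch only: subsample sizes (in thousands) and the number of individuals
# drawn for each downsampled GWAS
subsample_n <- c(5, 15, 45, 135)
data.frame(n_label = paste0(subsample_n, 'K'),
           n_individuals = subsample_n * 1000)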


Downsample GWAS

Show code
library(data.table)

# Read in phenotype file
subsample_n<-c(5, 15, 45, 135)
selected_traits<-fread('/users/k1806347/oliverpainfel/Analyses/crosspop/trait_subset.txt', header=F)$V1
set.seed(1)

dir.create('/users/k1806347/oliverpainfel/Data/ukb/phenotypes/subsample')
for(i in selected_traits){
  pheno_i_dat <- fread(
    paste0(
      '/users/k1806347/oliverpainfel/Data/ukb/phenotypes/prscsx/',
      i,
      '.unrel.EUR_train.norm_resid_scale.row_number.txt'
    )
  )
  
  for(n in subsample_n){
    tmp <- pheno_i_dat[sample(1:nrow(pheno_i_dat), size = n*1000),]
    fwrite(tmp, paste0('/users/k1806347/oliverpainfel/Data/ukb/phenotypes/subsample/', i, '.',n,'.txt'),
           sep=' ',
           na='NA',
           quote=F)
  }
} 
for pheno in $(cat /users/k1806347/oliverpainfel/Analyses/crosspop/trait_subset.txt); do
  for n in $(echo 5 15 45 135); do
    mkdir -p /users/k1806347/oliverpainfel/Data/ukb/gwas/${pheno}_subsampled
    for chr in $(seq 1 22); do
      if [ ! -s "/users/k1806347/oliverpainfel/Data/ukb/gwas/${pheno}_subsampled/ukb.eur_train.${pheno}.${n}.chr${chr}.outcome.glm.linear" ]; then
      if [ ! -f "/users/k1806347/oliverpainfel/Data/ukb/gwas/${pheno}_subsampled/ukb.eur_train.${pheno}.${n}.GW.txt.gz" ]; then
        sbatch -p interruptible_cpu,cpu,neurohack_cpu --wrap="/users/k1806347/oliverpainfel/Software/plink2 \
          --pfile /users/k1806347/oliverpainfel/Data/ukb/GenoPred/output/ukb/geno/ukb.ref.chr${chr} \
          --pheno /users/k1806347/oliverpainfel/Data/ukb/phenotypes/subsample/${pheno}.${n}.txt \
          --linear omit-ref cols=+a1freq,+ax \
          --maf 0.01 \
          --geno 0.05 \
          --out /users/k1806347/oliverpainfel/Data/ukb/gwas/${pheno}_subsampled/ukb.eur_train.${pheno}.${n}.chr${chr}"
      fi
      fi
    done
  done
done

# Once complete, merge results across chromosomes
for pheno in $(cat /users/k1806347/oliverpainfel/Analyses/crosspop/trait_subset.txt); do
  for n in $(echo 5 15 45 135); do
    if [ ! -f "/users/k1806347/oliverpainfel/Data/ukb/gwas/${pheno}_subsampled/ukb.eur_train.${pheno}.${n}.GW.txt.gz" ]; then

    head -n 1 /users/k1806347/oliverpainfel/Data/ukb/gwas/${pheno}_subsampled/ukb.eur_train.${pheno}.${n}.chr1.outcome.glm.linear > /users/k1806347/oliverpainfel/Data/ukb/gwas/${pheno}_subsampled/ukb.eur_train.${pheno}.${n}.GW.txt
      for chr in $(seq 1 22); do
        tail -n +2 /users/k1806347/oliverpainfel/Data/ukb/gwas/${pheno}_subsampled/ukb.eur_train.${pheno}.${n}.chr${chr}.outcome.glm.linear >> /users/k1806347/oliverpainfel/Data/ukb/gwas/${pheno}_subsampled/ukb.eur_train.${pheno}.${n}.GW.txt
      done
      
      # Remove REF and ALT columns and rename AX column to A2
      cut -f 4,5 --complement /users/k1806347/oliverpainfel/Data/ukb/gwas/${pheno}_subsampled/ukb.eur_train.${pheno}.${n}.GW.txt | awk 'BEGIN{FS=OFS="\t"} NR==1 {$5="A2"} 1' > temp.txt && mv temp.txt /users/k1806347/oliverpainfel/Data/ukb/gwas/${pheno}_subsampled/ukb.eur_train.${pheno}.${n}.GW.txt
  
      gzip /users/k1806347/oliverpainfel/Data/ukb/gwas/${pheno}_subsampled/ukb.eur_train.${pheno}.${n}.GW.txt
    fi
  done
done

# Delete per chromosome files
rm /users/k1806347/oliverpainfel/Data/ukb/gwas/*_subsampled/*chr*

Subset EUR test individuals in UKB data

To make this quicker, we focus on evaluating the PGS methods in the EUR test subset of UKB, which avoids reprocessing the full UKB data.

Show code
library(data.table)

selected_traits<-fread('/users/k1806347/oliverpainfel/Analyses/crosspop/trait_subset.txt', header=F)$V1

pheno_long <- NULL
for(i in selected_traits){
  pheno_i <- fread(paste0(
    '/users/k1806347/oliverpainfel/Data/ukb/phenotypes/prscsx/',
    i ,
    '.unrel.EUR_test.row_number.txt'
  ))
  
  pheno_long <- rbind(
    pheno_long, 
    pheno_i
  )
}

test_subset <- unique(pheno_long$FID)
keep <- data.frame(FID = test_subset,
                   IID = test_subset)

write.table(
  keep,
  '~/oliverpainfel/Data/ukb/eur_test.keep',
  row.names = F,
  col.names = F,
  quote = F
)
mkdir ~/oliverpainfel/Data/ukb/eur_test_subset

for chr in $(seq 1 22); do
  sbatch -p interruptible_cpu,cpu,neurohack_cpu -n 1 --mem 5G \
  --wrap="~/oliverpainfel/Software/plink2 \
    --pfile ~/oliverpainfel/Data/ukb/GenoPred/output/ukb/geno/ukb.ref.chr${chr} \
    --keep ~/oliverpainfel/Data/ukb/eur_test.keep \
    --make-pgen \
    --out ~/oliverpainfel/Data/ukb/eur_test_subset/ukb.chr${chr}"
done

PGS calculation

Show code

Prepare configuration

######
# gwas_list
######

gwas_list<-fread('/users/k1806347/oliverpainfel/Data/ukb/GenoPred/configs/crosspop/gwas_list.txt')
pheno<-gsub('_.*','', gwas_list$name)
gwas_list<-gwas_list[gwas_list$population == 'EUR',]
gwas_list$pheno<-gsub('_UKB','',gwas_list$name)

gwas_list_subsampled <- NULL
for(n in c(5, 15, 45, 135)){
  gwas_list_tmp<-gwas_list
  
  gwas_list_tmp$name <-
    paste0(
      gwas_list_tmp$name, '_', n, 'K'
    )
  gwas_list_tmp$path <-
    paste0(
      '/users/k1806347/oliverpainfel/Data/ukb/gwas/',
      gwas_list_tmp$pheno,
      '_subsampled/ukb.eur_train.',
      gwas_list_tmp$pheno,
      '.',
      n,
      '.GW.txt.gz'
    )
  gwas_list_tmp$label <-
    paste0(
      gsub("\\)", paste0(' - ', n,"K)"), gwas_list_tmp$label)
    )
  
  gwas_list_subsampled <- rbind(gwas_list_subsampled, gwas_list_tmp)
}

gwas_list_subsampled$pheno<-NULL

gwas_list_subsampled$label<-paste0('"', gwas_list_subsampled$label, '"')

write.table(gwas_list_subsampled, '/users/k1806347/oliverpainfel/Data/ukb/GenoPred/configs/crosspop/gwas_list_subsampled.txt', col.names = T, row.names = F, quote = F)

######
# target_list
######
target_list <- data.frame(
  name='ukb',
  path='/users/k1806347/oliverpainfel/Data/ukb/eur_test_subset/ukb',
  type='plink2',
  indiv_report=F,
  unrel='/users/k1806347/oliverpainfel/Data/ukb/phenotypes/unrelated.row_number.txt'
)

dir.create('/users/k1806347/oliverpainfel/Data/ukb/GenoPred/configs/eur_test_only')

write.table(target_list, '/users/k1806347/oliverpainfel/Data/ukb/GenoPred/configs/eur_test_only/target_list.txt', col.names=T, row.names=F, quote=F)

######
# config
######

config<-c(
  "outdir: /users/k1806347/oliverpainfel/Data/ukb/GenoPred/output_eur_test_only",
  "config_file: /users/k1806347/oliverpainfel/Data/ukb/GenoPred/configs/crosspop/config_subsampled.yaml",
  "gwas_list: /users/k1806347/oliverpainfel/Data/ukb/GenoPred/configs/crosspop/gwas_list_subsampled.txt",
  "target_list: /users/k1806347/oliverpainfel/Data/ukb/GenoPred/configs/eur_test_only/target_list.txt",
  "pgs_methods: ['quickprs','ptclump','dbslmm','ldpred2','sbayesrc']",
  "cores_prep_pgs: 10",
  "cores_target_pgs: 50",
  "ldpred2_inference: F",
  "ldpred2_ldref: /users/k1806347/oliverpainfel/Data/hgdp_1kg/ldpred2/hm3",
  "quickprs_ldref: /users/k1806347/oliverpainfel/Data/hgdp_1kg/quickprs/hm3",
  "quickprs_multi_ldref: /users/k1806347/oliverpainfel/Data/hgdp_1kg/quickprs/hm3_subset",
  "sbayesrc_ldref: /users/k1806347/oliverpainfel/Data/hgdp_1kg/sbayesrc/hm3"
)

write.table(config, '/users/k1806347/oliverpainfel/Data/ukb/GenoPred/configs/crosspop/config_subsampled.yaml', col.names = F, row.names = F, quote = F)

####
# Make a second configuration using the EAS GWAS and subsampled EUR GWAS with QuickPRS and LEOPARD
####

gwas_groups<-data.frame(
  name=paste0(gwas_list_subsampled$name, '_BBJ'),
  gwas=paste0(gwas_list_subsampled$name),
  label=gsub("\\)", " + BBJ)", gwas_list_subsampled$label)
)

gwas_groups$trait <- gsub('_.*','',gwas_groups$name)
gwas_groups$gwas<-paste0(gwas_groups$gwas,',',gwas_groups$trait,'_BBJ')

gwas_groups$trait<-NULL

write.table(gwas_groups, '/users/k1806347/oliverpainfel/Data/ukb/GenoPred/configs/crosspop/gwas_groups_subsampled.txt', col.names = T, row.names = F, quote = F)

gwas_list<-fread('/users/k1806347/oliverpainfel/Data/ukb/GenoPred/configs/crosspop/gwas_list.txt')
gwas_list_eas<-gwas_list[gwas_list$population == 'EAS',]
gwas_list_eas$label<-paste0('"', gwas_list_eas$label, '"')
gwas_list_subsampled <- rbind(gwas_list_subsampled, gwas_list_eas)

write.table(gwas_list_subsampled, '/users/k1806347/oliverpainfel/Data/ukb/GenoPred/configs/crosspop/gwas_list_subsampled.txt', col.names = T, row.names = F, quote = F)

config<-c(
  "outdir: /users/k1806347/oliverpainfel/Data/ukb/GenoPred/output_eur_test_only",
  "config_file: /users/k1806347/oliverpainfel/Data/ukb/GenoPred/configs/crosspop/config_subsampled.yaml",
  "gwas_list: /users/k1806347/oliverpainfel/Data/ukb/GenoPred/configs/crosspop/gwas_list_subsampled.txt",
  "gwas_groups: /users/k1806347/oliverpainfel/Data/ukb/GenoPred/configs/crosspop/gwas_groups_subsampled.txt",
  "target_list: /users/k1806347/oliverpainfel/Data/ukb/GenoPred/configs/eur_test_only/target_list.txt",
  "pgs_methods: ['quickprs']",
  "leopard_methods: ['quickprs']",
  "cores_prep_pgs: 10",
  "cores_target_pgs: 50",
  "ldpred2_inference: F",
  "ldpred2_ldref: /users/k1806347/oliverpainfel/Data/hgdp_1kg/ldpred2/hm3",
  "quickprs_ldref: /users/k1806347/oliverpainfel/Data/hgdp_1kg/quickprs/hm3",
  "quickprs_multi_ldref: /users/k1806347/oliverpainfel/Data/hgdp_1kg/quickprs/hm3_subset",
  "sbayesrc_ldref: /users/k1806347/oliverpainfel/Data/hgdp_1kg/sbayesrc/hm3"
)

write.table(config, '/users/k1806347/oliverpainfel/Data/ukb/GenoPred/configs/crosspop/config_subsampled.yaml', col.names = F, row.names = F, quote = F)

Run pipeline

snakemake \
  --profile slurm \
  --use-conda \
  --configfile=/users/k1806347/oliverpainfel/Data/ukb/GenoPred/configs/crosspop/config_subsampled.yaml \
  target_pgs -n
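# Note: -n performs a dry run; remove it to submit the jobs via the slurm profile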

PGS evaluation

Show code

Create predictor list

setwd('~/oliverpainfel/Software/MyGit/GenoPred/pipeline/')
source('../functions/misc.R')
source_all('../functions')
library(data.table)

# Get some key variables from config
config<-'/users/k1806347/oliverpainfel/Data/ukb/GenoPred/configs/crosspop/config_subsampled.yaml'
pgs_methods <- read_param(config = config, param = 'pgs_methods', return_obj = F)
outdir <- read_param(config = config, param = 'outdir', return_obj = F)

# Read in list of outcomes 
selected_traits<-fread('/users/k1806347/oliverpainfel/Analyses/crosspop/trait_subset.txt', header=F)$V1

# Get a list of score files
scores <- list_score_files(config)

# Get list of score files using full EUR GWAS
config_full<-'/users/k1806347/oliverpainfel/Data/ukb/GenoPred/configs/crosspop/config.yaml'
outdir_full <- read_param(config = config_full, param = 'outdir', return_obj = F)
scores_full <- list_score_files(config_full)
scores_full <- scores_full[grepl('UKB$', scores_full$name),]
scores_full <- scores_full[scores_full$method %in% pgs_methods,]

# Create predictor list files for the EUR target
for(trait_i in selected_traits){
  
  dir.create(
    paste0(
      '/users/k1806347/oliverpainfel/Analyses/crosspop/subsampled/targ_EUR.disc_EUR/', trait_i
    ),
    recursive = T
  )
  
  scores_i <- scores[grepl(paste0('^', trait_i,'_'), scores$name),]

  scores_i$predictor <- paste0(
    outdir,
    '/ukb/pgs/TRANS/',
    scores_i$method,
    '/',
    scores_i$name,
    '/ukb-',
    scores_i$name,
    '-TRANS.profiles'
  )
  
  scores_i$top1 <- paste0(scores_i$method,'.',gsub('.*_', '', scores_i$name))

  # Now for full GWAS
  scores_full_i <- scores_full[grepl(paste0('^', trait_i,'_'), scores_full$name),]

  scores_full_i$predictor <- paste0(
    outdir_full,
    '/ukb/pgs/TRANS/',
    scores_full_i$method,
    '/',
    scores_full_i$name,
    '/ukb-',
    scores_full_i$name,
    '-TRANS.profiles'
  )
  
  scores_full_i$top1 <- paste0(scores_full_i$method,'.full')

  ####
  # Make groups containing pseudo scores for single source methods
  ####

  # Extract the pseudo score for each method and specify as a separate group
  # This can be skipped as it was done before
  for(i in 1:nrow(scores_i)) {
    param <- find_pseudo(
      config = config,
      gwas = scores_i$name[i],
      pgs_method = scores_i$method[i],
      target_pop = 'EUR'
    )
    
    score_header <-
      fread(scores_i$predictor[i], nrows = 1)
    score_cols <-
      which(names(score_header) %in% c('FID', 'IID', paste0(scores_i$name[i], '_', param)))
    
    system(
      paste0(
        "cut -d' ' -f ", 
        paste0(score_cols, collapse=','),
        " ", 
        scores_i$predictor[i], 
        " > ", 
        gsub('.profiles',
             paste0('.EUR_pseudo.profiles'),
             scores_i$predictor[i])
      )
    )
  }
  
  scores_i_pseudo <- scores_i
  scores_i_pseudo$top1<-paste0(scores_i_pseudo$top1,'.pseudo')

  scores_i_pseudo$predictor <- gsub('.profiles', 
                                paste0('.EUR_pseudo.profiles'),
                                scores_i_pseudo$predictor)

  # Now for full GWAS - skip subsetting pseudo as done before
  scores_full_i_pseudo <- scores_full_i
  scores_full_i_pseudo$top1<-paste0(scores_full_i_pseudo$top1,'.pseudo')
  
  scores_full_i_pseudo$predictor <- gsub('.profiles', 
                                paste0('.EUR_pseudo.profiles'),
                                scores_full_i_pseudo$predictor)

  ####
  # Combine the different predictor groups
  ####
  predictors_i<- do.call(rbind, list(
    scores_i, 
    scores_full_i, 
    scores_i_pseudo, 
    scores_full_i_pseudo
  ))
  predictors_i$multi <- 'All'
  predictors_i <- predictors_i[, c('predictor', 'multi','top1'), with=F]
      
  write.table(
    predictors_i,
    paste0(
      '/users/k1806347/oliverpainfel/Analyses/crosspop/subsampled/targ_EUR.disc_EUR/', trait_i, '/predictor_list.txt'
    ),
    col.names = T,
    row.names = F,
    quote = F
  )
}

Run model_builder

cd /users/k1806347/oliverpainfel/Software/MyGit/GenoPred/pipeline
conda activate model_builder

#rm /users/k1806347/oliverpainfel/Analyses/crosspop/subsampled/targ_EUR.disc_EUR/*/res*

for pheno in $(cat /users/k1806347/oliverpainfel/Analyses/crosspop/trait_subset.txt); do
  if [ ! -f "/users/k1806347/oliverpainfel/Analyses/crosspop/subsampled/targ_EUR.disc_EUR/${pheno}/res.pred_comp.txt" ]; then
    sbatch --mem 10G -n 1 -p neurohack_cpu,interruptible_cpu,cpu -t 1:00:00 --wrap="Rscript ../Scripts/model_builder/model_builder_top1.R \
      --outcome /users/k1806347/oliverpainfel/Data/ukb/phenotypes/prscsx/${pheno}.unrel.EUR_test.row_number.txt \
      --predictors /users/k1806347/oliverpainfel/Analyses/crosspop/subsampled/targ_EUR.disc_EUR/${pheno}/predictor_list.txt \
      --out /users/k1806347/oliverpainfel/Analyses/crosspop/subsampled/targ_EUR.disc_EUR/${pheno}/res \
      --n_core 1"
  fi
done

Create predictor list

This time we create predictor lists using both the subsampled EUR (UKB) GWAS and the EAS (BBJ) GWAS.

setwd('~/oliverpainfel/Software/MyGit/GenoPred/pipeline/')
source('../functions/misc.R')
source_all('../functions')
library(data.table)

# Get some key variables from config
config<-'/users/k1806347/oliverpainfel/Data/ukb/GenoPred/configs/crosspop/config_subsampled.yaml'
pgs_methods <- read_param(config = config, param = 'pgs_methods', return_obj = F)
outdir <- read_param(config = config, param = 'outdir', return_obj = F)

# Read in list of outcomes 
selected_traits<-fread('/users/k1806347/oliverpainfel/Analyses/crosspop/trait_subset.txt', header=F)$V1

# Get a list of score files
scores <- list_score_files(config)

# Create predictor list files for the EUR target using EUR and EAS GWAS
targ_pop_i <- 'EUR'
disc_pop_j <-'BBJ'
disc_pop_j_2 <-'EAS'

for(trait_i in selected_traits){
  dir.create(paste0(
        '/users/k1806347/oliverpainfel/Analyses/crosspop/subsampled/targ_',
        targ_pop_i,
        '.disc_EUR_',
        disc_pop_j_2,
        '/',
        trait_i))
        
  for(n in c(5, 15, 45, 135)){
    scores_i <- scores[grepl(paste0('^', trait_i,'_'), scores$name),]
    scores_i <- scores_i[grepl(paste0('_', n, 'K_BBJ|_', n, 'K|',trait_i,'_BBJ'), scores_i$name),]
  
    scores_i$predictor <- paste0(
      outdir,
      '/ukb/pgs/TRANS/',
      scores_i$method,
      '/',
      scores_i$name,
      '/ukb-',
      scores_i$name,
      '-TRANS.profiles'
    )
    
    scores_i$multi <- scores_i$method

    scores_i_j <- scores_i

    # Insert path to score file
    scores_i_j$predictor <- paste0(
      outdir,
      '/ukb/pgs/TRANS/',
      scores_i_j$method,
      '/',
      scores_i_j$name,
      '/ukb-',
      scores_i_j$name,
      '-TRANS.profiles'
    )
    
    ####
    # Make groups for single-source methods
    ####
    
    scores_i_j_single_top1 <-
      scores_i_j[!(scores_i_j$method %in% pgs_group_methods) &
                   !grepl('_multi$', scores_i_j$method), ]

    # Create top1 column indicating the predictors from which top1 models should be derived
    scores_i_j_single_top1$top1[grepl('UKB', scores_i_j_single_top1$name, ignore.case = F)] <- 'EUR'
    scores_i_j_single_top1$top1[grepl(disc_pop_j, scores_i_j_single_top1$name, ignore.case = F)] <- disc_pop_j_2
    
    ####
    # Make groups containing pseudo scores for single source methods
    ####

    # Extract the pseudo score for each method and specify as a separate group
    for(i in 1:nrow(scores_i_j_single_top1)) {
      param <- find_pseudo(
        config = config,
        gwas = scores_i_j_single_top1$name[i],
        pgs_method = scores_i_j_single_top1$method[i],
        target_pop = targ_pop_i
      )
      
      score_header <-
        fread(scores_i_j_single_top1$predictor[i], nrows = 1)
      score_cols <-
        which(names(score_header) %in% c('FID', 'IID', paste0(scores_i_j_single_top1$name[i], '_', param)))
      
      system(
        paste0(
          "cut -d' ' -f ", 
          paste0(score_cols, collapse=','),
          " ", 
          scores_i_j_single_top1$predictor[i], 
          " > ", 
          gsub('.profiles',
               paste0('.', targ_pop_i, '_pseudo.profiles'),
               scores_i_j_single_top1$predictor[i])
        )
      )
    }
    
    scores_i_j_single_pseudo <- scores_i_j_single_top1
    scores_i_j_single_pseudo$multi <- paste0(scores_i_j_single_pseudo$multi, '.pseudo')

    scores_i_j_single_pseudo$predictor <- gsub('.profiles', 
                                  paste0('.', targ_pop_i, '_pseudo.profiles'),
                                  scores_i_j_single_pseudo$predictor)

    ####
    # Make groups for multi-single-source pseudo scores
    ####
    
    scores_i_j_multi_single_pseudo <- scores_i_j[grepl('_multi$', scores_i_j$method),]

    # Extract the pseudo score for each method and specify as a separate group
    for(i in 1:nrow(scores_i_j_multi_single_pseudo)) {
      param <- find_pseudo(
        config = config,
        gwas = scores_i_j_multi_single_pseudo$name[i],
        pgs_method = scores_i_j_multi_single_pseudo$method[i],
        target_pop = targ_pop_i
      )
      
      score_header <-
        fread(scores_i_j_multi_single_pseudo$predictor[i], nrows = 1)
      score_cols <-
        which(names(score_header) %in% c('FID', 'IID', paste0(scores_i_j_multi_single_pseudo$name[i], '_', param)))
      
      system(
        paste0(
          "cut -d' ' -f ", 
          paste0(score_cols, collapse=','),
          " ", 
          scores_i_j_multi_single_pseudo$predictor[i], 
          " > ", 
          gsub('.profiles',
               paste0('.', targ_pop_i, '_pseudo.profiles'),
               scores_i_j_multi_single_pseudo$predictor[i])
        )
      )
    }
    
    scores_i_j_multi_single_pseudo$multi <- paste0(scores_i_j_multi_single_pseudo$multi, '.pseudo')

    scores_i_j_multi_single_pseudo$predictor <- gsub('.profiles', 
                                  paste0('.', targ_pop_i, '_pseudo.profiles'),
                                  scores_i_j_multi_single_pseudo$predictor)
    
    scores_i_j_multi_single_pseudo$top1<-paste0('EUR_', disc_pop_j_2)

    ####
    # Combine the different predictor groups
    ####
    predictors_i<- do.call(rbind, list(
      scores_i_j_single_top1, 
      scores_i_j_single_pseudo, 
      scores_i_j_multi_single_pseudo
    ))
    
    predictors_i <- predictors_i[, c('predictor', 'multi','top1'), with=F]
    
    write.table(
      predictors_i,
      paste0(
        '/users/k1806347/oliverpainfel/Analyses/crosspop/subsampled/targ_',
        targ_pop_i,
        '.disc_EUR_',
        disc_pop_j_2,
        '/',
        trait_i,
        '/predictor_list_n', n, '.txt'
      ),
      col.names = T,
      row.names = F,
      quote = F
    )
  }
}

Run model_builder

cd /users/k1806347/oliverpainfel/Software/MyGit/GenoPred/pipeline
conda activate model_builder

#rm /users/k1806347/oliverpainfel/Analyses/crosspop/subsampled/targ_EUR.disc_EUR_EAS/*/res*

for pheno in $(cat /users/k1806347/oliverpainfel/Analyses/crosspop/trait_subset.txt); do
  for n in $(echo 5 15 45 135); do
    if [ ! -f "/users/k1806347/oliverpainfel/Analyses/crosspop/subsampled/targ_EUR.disc_EUR_EAS/${pheno}/res_n${n}.pred_comp.txt" ]; then
      sbatch --mem 10G -n 1 -p neurohack_cpu,interruptible_cpu,cpu -t 1:00:00 --wrap="Rscript ../Scripts/model_builder/model_builder_top1.R \
        --outcome /users/k1806347/oliverpainfel/Data/ukb/phenotypes/prscsx/${pheno}.unrel.EUR_test.row_number.txt \
        --predictors /users/k1806347/oliverpainfel/Analyses/crosspop/subsampled/targ_EUR.disc_EUR_EAS/${pheno}/predictor_list_n${n}.txt \
        --out /users/k1806347/oliverpainfel/Analyses/crosspop/subsampled/targ_EUR.disc_EUR_EAS/${pheno}/res_n${n} \
        --n_core 1"
    fi
  done
done

Plot results

setwd('/users/k1806347/oliverpainfel/Software/MyGit/GenoPred/pipeline/')

library(data.table)
library(ggplot2)
library(cowplot)

source('../functions/misc.R')
source_all('../functions')

selected_traits<-fread('/users/k1806347/oliverpainfel/Analyses/crosspop/trait_subset.txt', header=F)$V1

# Calculate correlation between all phenotypes in each target population
pheno_pop_i <- list()
for(pheno_i in selected_traits){
  pheno_pop_i[[pheno_i]] <- fread(paste0('/users/k1806347/oliverpainfel/Data/ukb/phenotypes/prscsx/', pheno_i, '.unrel.EUR_test.row_number.txt'))
  names(pheno_pop_i[[pheno_i]])[3] <- pheno_i
}

pheno_pop_i_merged <- Reduce(function(x, y) merge(x, y, all = TRUE, by = c('FID','IID')), pheno_pop_i)

cors <- abs(cor(as.matrix(pheno_pop_i_merged[,-1:-2, with=F]), use='p'))

# Read in results
res_eval <- list()
for(pheno_i in selected_traits){
  eval_i <-
    fread(
      paste0(
        '/users/k1806347/oliverpainfel/Analyses/crosspop/subsampled/targ_EUR.disc_EUR/', pheno_i,'/res.pred_eval.txt'
      )
    )

  eval_i$Ncase <- NULL
  eval_i$Ncont <- NULL
  eval_i$R2l <- NULL
  eval_i$R2o <- NULL
  
  eval_i <- eval_i[!grepl('\\.multi', eval_i$Group),]
  
  eval_i$Method<-sub('\\..*','',eval_i$Group)
  eval_i$Method<-gsub('.*-','', eval_i$Method)
  
  eval_i$GWAS_N <- gsub('K\\..*','',eval_i$Group)
  eval_i$GWAS_N <- gsub('.*\\.','',eval_i$GWAS_N)
  eval_i$GWAS_N <- paste0(eval_i$GWAS_N,'K')
  eval_i$GWAS_N[eval_i$GWAS_N == 'top1K'] <- '297K'
  
  eval_i$Model[grepl('top1$', eval_i$Group) &
                   !grepl('pseudo', eval_i$Group)]<-'IndivTune'
  eval_i$Model[grepl('top1$', eval_i$Group) &
                   grepl('pseudo', eval_i$Group)]<-'SumStatTune'

  eval_i$Method<-factor(eval_i$Method, levels=unique(eval_i$Method))
  eval_i$Model<-factor(eval_i$Model, levels=c('IndivTune','SumStatTune'))

  # Remove the IndivTune model for methods that only return one score (i.e. QuickPRS and SBayesRC)
  eval_i <- eval_i[
    !(eval_i$Method %in%  c('quickprs','sbayesrc') & 
      eval_i$Model %in% c('IndivTune')),]
  
  # Remove pseudo model for methods that don't really have one 
  eval_i <- eval_i[
    !(eval_i$Method %in%  c('ptclump') & 
      eval_i$Model %in% c('SumStatTune')),]
  
  res_eval[[pheno_i]]<-eval_i
  
}

# Create vector defining the order of methods in plots
model_order <- c("DBSLMM", "lassosum", "LDpred2", "MegaPRS", "PRS-CS", "pT+clump", "QuickPRS", "SBayesRC") 

res_eval_simp <- NULL
for(pheno_i in selected_traits){
  tmp <- res_eval[[pheno_i]]
  tmp$Trait <- pheno_i
  
  # Insert nice PGS method names
  tmp <- merge(tmp, pgs_method_labels, by.x = 'Method', by.y = 'method', all.x = T)
  tmp$label <- factor(tmp$label, levels = model_order)

  res_eval_simp <- rbind(res_eval_simp, tmp)
}

# Plot results for each phenotype separately
dir.create('~/oliverpainfel/Analyses/crosspop/plots_downsample')

tmp<-res_eval_simp
tmp$GWAS_N <- paste0('GWAS N = ', tmp$GWAS_N)
tmp$GWAS_N <-factor(tmp$GWAS_N, levels = unique(tmp$GWAS_N))

png(paste0('~/oliverpainfel/Analyses/crosspop/plots_downsample/per_trait_r.png'), res=100, width = 1000, height = 6000, units = 'px')
ggplot(tmp, aes(x=label, y=R , fill = Model)) +
    geom_errorbar(aes(ymin = R - SE, ymax = R + SE),
                  width = 0,
                  position = position_dodge(width = 1)) +
    geom_point(stat="identity", position=position_dodge(1), size=3, shape=23) +
    geom_vline(xintercept = seq(1.5, length(unique(tmp$label))), linetype="dotted") +
    labs(y = "R (SE)", x=NULL, fill = NULL) +
    facet_grid(Trait ~ GWAS_N, scales = 'free_y') +
    theme_half_open() +
    background_grid(major = 'y', minor = 'y') + 
    panel_border() + 
    theme(axis.text.x = element_text(angle = 45, vjust = 1, hjust=1),
          legend.position = "top",
          legend.key.spacing.x = unit(1, "cm"),
          legend.justification = "center")
dev.off()

####
# Average results across phenotypes
####

library(MAd)

# Average R across phenotypes
meta_res_eval <- NULL
# Subset res_eval for each scenario
res_eval_i <- do.call(rbind, lapply(seq_along(res_eval), function(i) {
  x <- res_eval[[i]]
  x$pheno <- names(res_eval)[i]
  return(x)
}))

# Average results for each test across phenotypes
# Use MAd to account for correlation between them
res_eval_i$Sample<-'A'

for(group_i in unique(res_eval_i$Group)){
  res_eval_group_i <- res_eval_i[res_eval_i$Group == group_i,]
  missing_pheno <-
    colnames(cors)[!(colnames(cors) %in% unique(res_eval_group_i$pheno))]
  
  cors_i <- cors[unique(res_eval_group_i$pheno), unique(res_eval_group_i$pheno)]
  
  meta_res_eval_i <-
    agg(
      id = Sample,
      es = R,
      var = SE ^ 2,
      cor = cors_i,
      method = "BHHR",
      mod = NULL,
      data = res_eval_group_i
    )
  
  tmp <- data.table(Group = group_i,
                    Method = res_eval_group_i$Method[1],
                    Model = res_eval_group_i$Model[1],
                    Source = res_eval_group_i$Source[1],
                    Discovery = 'EUR',
                    Target = 'EUR',
                    R = meta_res_eval_i$es,
                    SE = sqrt(meta_res_eval_i$var),
                    GWAS_N = res_eval_group_i$GWAS_N[1])
  
  meta_res_eval <- rbind(meta_res_eval, tmp)
}

tmp <- meta_res_eval
# Insert nice PGS method names
tmp <- merge(tmp, pgs_method_labels, by.x = 'Method', by.y = 'method', all.x = T)
tmp$label <- factor(tmp$label, levels = model_order)
tmp$GWAS_N <- paste0('GWAS N = ', tmp$GWAS_N)
tmp$GWAS_N <-factor(tmp$GWAS_N, levels = unique(tmp$GWAS_N))

png(paste0('~/oliverpainfel/Analyses/crosspop/plots_downsample/average_r.png'), res=100, width = 1000, height = 600, units = 'px')
ggplot(tmp, aes(x=label, y=R , fill = Model)) +
    geom_errorbar(aes(ymin = R - SE, ymax = R + SE),
                  width = 0,
                  position = position_dodge(width = 1)) +
    geom_point(stat="identity", position=position_dodge(1), size=3, shape=23) +
    geom_vline(xintercept = seq(1.5, length(unique(tmp$label))), linetype="dotted") +
    labs(y = "R (SE)", x=NULL, fill = NULL) +
    facet_grid(~ GWAS_N, scales = 'free_y') +
    theme_half_open() +
    background_grid(major = 'y', minor = 'y') + 
    panel_border() + 
    theme(axis.text.x = element_text(angle = 45, vjust = 1, hjust=1),
          legend.position = "top",
          legend.key.spacing.x = unit(1, "cm"),
          legend.justification = "center")
dev.off()
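
To make the aggregation step above concrete, below is a minimal, self-contained sketch of the MAd::agg() call, using hypothetical effect sizes and a hypothetical phenotype correlation matrix (trait names and values are illustrative only, not taken from the analysis).

# Minimal sketch of the BHHR aggregation used above (hypothetical values)
library(MAd)

toy <- data.frame(
  Sample = 'A',
  pheno  = c('trait1', 'trait2', 'trait3'),
  R      = c(0.30, 0.25, 0.28),  # hypothetical PGS-phenotype correlations
  SE     = c(0.01, 0.01, 0.01)   # hypothetical standard errors
)

# Hypothetical correlations between the three phenotypes (same order as toy$pheno)
toy_cors <- matrix(c(1.0, 0.4, 0.2,
                     0.4, 1.0, 0.3,
                     0.2, 0.3, 1.0), nrow = 3,
                   dimnames = list(toy$pheno, toy$pheno))

# Returns one pooled effect size (es) and variance (var) per Sample,
# accounting for the correlation between the trait-specific estimates
agg(id = Sample, es = R, var = SE^2, cor = toy_cors,
    method = "BHHR", mod = NULL, data = toy)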

Plot results

setwd('/users/k1806347/oliverpainfel/Software/MyGit/GenoPred/pipeline/')

library(data.table)
library(ggplot2)
library(cowplot)

source('../functions/misc.R')
source_all('../functions')

# Read in list of outcomes 
selected_traits<-fread('/users/k1806347/oliverpainfel/Analyses/crosspop/trait_subset.txt', header=F)$V1
info_all <- fread('~/oliverpainfel/Analyses/crosspop/gwas_descriptives.csv')

# Calculate correlation between all phenotypes in each target population
cors <- list()
for(pop_i in c('EUR','EAS','AFR','CSA','AMR')){
  if(pop_i == 'EUR'){
    pop_i_2 <- 'EUR_test'
  } else {
    pop_i_2 <- pop_i
  }
  pheno_pop_i <- list()
  for(pheno_i in selected_traits){
    pheno_pop_i[[pheno_i]] <- fread(paste0('/users/k1806347/oliverpainfel/Data/ukb/phenotypes/prscsx/', pheno_i, '.unrel.', pop_i_2, '.row_number.txt'))
    names(pheno_pop_i[[pheno_i]])[3] <- pheno_i
  }
  
  pheno_pop_i_merged <- Reduce(function(x, y) merge(x, y, all = TRUE, by = c('FID','IID')), pheno_pop_i)

  cors_i <- abs(cor(as.matrix(pheno_pop_i_merged[,-1:-2, with=F]), use='p'))
  cors[[pop_i]] <- cors_i
}

# Read in results
targ_pop_i = 'EUR'
disc_pop_i = 'EAS'
res_eval <- list()
for(pheno_i in selected_traits){
  res_eval_i <- NULL
  for (n_i in c(5, 15, 45, 135)) {
    eval_i <-
      fread(
        paste0(
          '/users/k1806347/oliverpainfel/Analyses/crosspop/subsampled/',
          'targ_',
          targ_pop_i,
          '.disc_EUR_',
          disc_pop_i,
          '/',
          pheno_i,
          '/res_n', n_i, '.pred_eval.txt'
        )
      )
    eval_i$Target<-targ_pop_i
    eval_i$gwas_group<-paste0('EUR+', disc_pop_i)
    eval_i$UKB_GWAS_N <- paste0(n_i,'k')
    res_eval_i<-rbind(res_eval_i, eval_i)
  }
  
  res_eval_i$Method<-sub('\\..*','',res_eval_i$Group)
  res_eval_i$Method<-gsub('-.*','', res_eval_i$Method)
  
  res_eval_i$Model[grepl('top1$', res_eval_i$Group) &
                   !grepl('pseudo', res_eval_i$Group)]<-'IndivTune'
  res_eval_i$Model[grepl('top1$', res_eval_i$Group) &
                   grepl('pseudo', res_eval_i$Group)]<-'SumStatTune'
  res_eval_i$Model[grepl('multi$', res_eval_i$Group) &
                   !grepl('pseudo', res_eval_i$Group)]<-'Multi-IndivTune'
  res_eval_i$Model[grepl('multi$', res_eval_i$Group) &
                   grepl('pseudo', res_eval_i$Group)]<-'Multi-SumStatTune'
  
  res_eval_i$Model[grepl('_multi', res_eval_i$Group)]<-'SumStatTune'
  res_eval_i$Model[res_eval_i$Group == 'prscsx.pseudo.multi']<-'SumStatTune'
  res_eval_i$Model[res_eval_i$Group == 'xwing.pseudo.multi']<-'SumStatTune'
  
  res_eval_i$Source<-ifelse(
    res_eval_i$Method %in% pgs_group_methods | grepl('_multi$', res_eval_i$Method) | 
    !grepl('EUR|EAS|AFR', res_eval_i$Group), 'Multi', 'Single')
  
  res_eval_i$Discovery[grepl('EUR', res_eval_i$Group)] <- 'EUR'
  res_eval_i$Discovery[grepl('EAS', res_eval_i$Group)] <- 'EAS'
  res_eval_i$Discovery[grepl('AFR', res_eval_i$Group)] <- 'AFR'
  res_eval_i$Discovery[res_eval_i$Source == 'Multi'] <- res_eval_i$gwas_group[res_eval_i$Source == 'Multi']
  
  res_eval_i$Method<-factor(res_eval_i$Method, levels=unique(res_eval_i$Method))
  res_eval_i$Model<-factor(res_eval_i$Model, levels=c('IndivTune','SumStatTune','Multi-IndivTune','Multi-SumStatTune'))
  res_eval_i$Discovery<-factor(res_eval_i$Discovery, levels=c('AFR','EAS','EUR','EUR+AFR','EUR+EAS'))

  # Remove IndivTune and Multi-IndivTune model for groups that contain one score (aka QuickPRS and SBayesRC)
  res_eval_i <- res_eval_i[
    !(res_eval_i$Method %in%  c('quickprs','sbayesrc') & 
      res_eval_i$Model %in% c('IndivTune','Multi-IndivTune')),]
  
  # Remove pseudo model for methods that don't really have one 
  res_eval_i <- res_eval_i[
    !(res_eval_i$Method %in%  c('ptclump','ptclump_multi') & 
      res_eval_i$Model %in% c('SumStatTune','Multi-SumStatTune')),]

  # Remove top1 models for *-Multi, PRS-CSx, X-wing
  res_eval_i <- res_eval_i[
    !((res_eval_i$Method %in%  c('prscsx', 'xwing') | grepl('_multi$', res_eval_i$Method)) & 
      grepl('top1', res_eval_i$Group)),]
  
  # Remove any duplicate models
  res_eval_i <- res_eval_i[!duplicated(res_eval_i[, c(
    "Target", "Method", "Model", "Source", "Discovery","gwas_group", "UKB_GWAS_N"
  )]),]
  
  res_eval[[pheno_i]]<-res_eval_i
  
}

# Create vector defining the order of methods in plots
model_order <- c("DBSLMM", "lassosum", "LDpred2", "MegaPRS", "PRS-CS", "pT+clump", "QuickPRS", "SBayesRC", "DBSLMM-multi", "lassosum-multi", "LDpred2-multi", "MegaPRS-multi", "PRS-CS-multi", "pT+clump-multi", "QuickPRS-multi", "SBayesRC-multi", "PRS-CSx", "X-Wing", "All") 

# Plot results for each phenotype separately
dir.create('~/oliverpainfel/Analyses/crosspop/subsampled/plots')

res_eval_simp <- NULL
for(pheno_i in selected_traits){
  tmp <- res_eval[[pheno_i]]
  tmp$Trait <- pheno_i
  
  # Insert nice PGS method names
  tmp <- merge(tmp, pgs_method_labels, by.x = 'Method', by.y = 'method', all.x = T)
  tmp$label[is.na(tmp$label)] <- 'All'
  tmp$label[grepl('Multi', tmp$Model) & !(tmp$Method %in% pgs_group_methods) & tmp$label != 'All'] <- paste0(tmp$label[grepl('Multi', tmp$Model) & !(tmp$Method %in% pgs_group_methods) & tmp$label != 'All'], '-multi')
  tmp$label <- factor(tmp$label, levels = model_order)
  
  # Simplify result to either SumStatTune or IndivTune
  tmp$Model[tmp$Model != 'SumStatTune'] <- 'IndivTune'
  tmp$Model[tmp$Model == 'SumStatTune'] <- 'SumStatTune'
  tmp <- tmp[!duplicated(tmp[, c('label','Target','Discovery','Model'), with=F]),]
  
  res_eval_simp <- rbind(res_eval_simp, tmp)
}

####
# Average results across phenotypes
####

library(MAd)

# Average R across phenotypes
meta_res_eval <- NULL
for (n_i in c(5, 15, 45, 135)) {
  # Subset res_eval for each scenario
  res_eval_i <- do.call(rbind, lapply(seq_along(res_eval), function(i) {
    x <- res_eval[[i]]
    x$pheno <- names(res_eval)[i]
    x <- x[x$Target == targ_pop_i]
    x <- x[x$gwas_group == paste0('EUR+', disc_pop_i)]
    x <- x[x$UKB_GWAS_N == paste0(n_i,'k')]
  }))
  
  # Average results for each test across phenotypes
  # Use MAd to account for correlation between them
  res_eval_i$Sample<-'A'

  for(group_i in unique(res_eval_i$Group)){
    res_eval_group_i <- res_eval_i[res_eval_i$Group == group_i,]
    missing_pheno <-
      colnames(cors[[targ_pop_i]])[!(colnames(cors[[targ_pop_i]]) %in% unique(res_eval_group_i$pheno))]
    
    if (!all(colnames(cors[[targ_pop_i]]) %in% unique(res_eval_group_i$pheno))) {
      print(paste0(
        'results missing for ',
        targ_pop_i,
        ' ',
        group_i,
        ' ',
        paste0(missing_pheno, collapse = ' ')
      ))
    }
    
    cors_i <- cors[[targ_pop_i]][unique(res_eval_group_i$pheno), unique(res_eval_group_i$pheno)]
    
    meta_res_eval_i <-
      agg(
        id = Sample,
        es = R,
        var = SE ^ 2,
        cor = cors_i,
        method = "BHHR",
        mod = NULL,
        data = res_eval_group_i
      )
    
    tmp <- data.table(Group = group_i,
                      Method = res_eval_group_i$Method[1],
                      Model = res_eval_group_i$Model[1],
                      Source = res_eval_group_i$Source[1],
                      Discovery = res_eval_group_i$Discovery[1],
                      gwas_group = res_eval_group_i$gwas_group[1],
                      UKB_GWAS_N = paste0(n_i,'k'),
                      Target = targ_pop_i,
                      R = meta_res_eval_i$es,
                      SE = sqrt(meta_res_eval_i$var))
    
    meta_res_eval <- rbind(meta_res_eval, tmp)
  }
}

meta_res_eval$Model<-factor(meta_res_eval$Model, levels=c('IndivTune','SumStatTune','Multi-IndivTune','Multi-SumStatTune'))
meta_res_eval$Discovery<-factor(meta_res_eval$Discovery, levels=c('AFR','EAS','EUR','EUR+AFR','EUR+EAS'))
meta_res_eval$UKB_GWAS_N<-factor(meta_res_eval$UKB_GWAS_N, levels=unique(meta_res_eval$UKB_GWAS_N))

write.csv(meta_res_eval, '~/oliverpainfel/Analyses/crosspop/subsampled/r_eval.csv', row.names = F)

# Plot average performance across phenotypes for the EUR target
tmp <- meta_res_eval
tmp <- merge(tmp, pgs_method_labels, by.x = 'Method', by.y = 'method', all.x = T)
tmp$label[is.na(tmp$label)] <- 'All'
tmp$label[grepl('Multi', tmp$Model) & !(tmp$Method %in% pgs_group_methods) & tmp$label != 'All'] <- paste0(tmp$label[grepl('Multi', tmp$Model) & !(tmp$Method %in% pgs_group_methods) & tmp$label != 'All'], '-multi')
tmp$label <- factor(tmp$label, levels = model_order)
tmp$Discovery_clean <- as.character(tmp$Discovery)
tmp$Discovery_clean[tmp$Discovery == 'EUR'] <- 'EUR GWAS'
tmp$Discovery_clean[tmp$Discovery != 'EUR' & tmp$Source == 'Single'] <- 'EAS GWAS'
tmp$Discovery_clean[tmp$Discovery != 'EUR' & tmp$Source == 'Multi'] <- 'Both'
tmp$Discovery_clean <- factor(tmp$Discovery_clean, 
                              levels = c('EUR GWAS',
                                         'EAS GWAS',
                                         'Both'))
tmp$Target <- paste0(tmp$Target, ' Target')
tmp$Model[tmp$Model != 'SumStatTune'] <- 'IndivTune'
tmp$Model[tmp$Model == 'SumStatTune'] <- 'SumStatTune'
tmp <- tmp[!duplicated(tmp[, c('label','Target','Discovery_clean','Model','UKB_GWAS_N'), with=F]),]

png(paste0('~/oliverpainfel/Analyses/crosspop/subsampled/plots/average_r.png'), res=300, width = 3500, height = 1200, units = 'px')
ggplot(tmp, aes(x=label, y=R , fill = Model)) +
    geom_errorbar(aes(ymin = R - SE, ymax = R + SE),
                  width = 0,
                  position = position_dodge(width = 1)) +
    geom_point(stat="identity", position=position_dodge(1), size=3, shape=23) +
    geom_vline(xintercept = seq(1.5, length(unique(tmp$label))), linetype="dotted") +
    labs(y = "R (SE)", x='Method') +
    facet_grid(. ~ UKB_GWAS_N + Discovery_clean, scales='free', space = 'free_x') +
    theme_half_open() +
    background_grid(major = 'y', minor = 'y') + 
    panel_border() + 
    theme(axis.text.x = element_text(angle = 45, vjust = 1, hjust=1),
          legend.position = "top",
          legend.key.spacing.x = unit(1, "cm"),
          legend.justification = "center")
dev.off()

# Plot performance of -multi models trained using LEOPARD vs using indiv-level data
tmp <- meta_res_eval
tmp <- merge(tmp, pgs_method_labels, by.x = 'Method', by.y = 'method')
tmp$label[grepl('Multi', tmp$Model) & !(tmp$Method %in% pgs_group_methods)] <- paste0(tmp$label[grepl('Multi', tmp$Model) & !(tmp$Method %in% pgs_group_methods)], '-multi')
tmp$label <- factor(tmp$label, levels = unique(tmp$label[order(!(grepl('Multi', tmp$label)), tmp$label)]))
tmp<-tmp[grepl('multi', tmp$label),]
tmp <- tmp[tmp$Model != 'Multi-IndivTune',]
tmp$Model<-as.character(tmp$Model)
tmp$Model[tmp$Model != 'SumStatTune']<-'IndivTune'
tmp$Model[tmp$Model == 'SumStatTune']<-'LEOPARD'
tmp$Target <- paste0(tmp$Target, ' Target')

png(paste0('~/oliverpainfel/Analyses/crosspop/subsampled/plots/average_r_leopard.png'), res=300, width = 1500, height = 1200, units = 'px')
ggplot(tmp, aes(x=label, y=R , fill = Model)) +
    geom_errorbar(aes(ymin = R - SE, ymax = R + SE),
                  width = 0,
                  position = position_dodge(width = 1)) +
    geom_point(stat="identity", position=position_dodge(1), size=3, shape=23) +
    labs(y = "R (SE)", x='Method') +
    facet_grid( ~ UKB_GWAS_N, scales='free', space = 'free_x') +
    theme_half_open() +
    background_grid(major = 'y', minor = 'y') + 
    panel_border() + 
    theme(axis.text.x = element_text(angle = 45, vjust = 1, hjust=1),
          legend.position = "top",
          legend.key.spacing.x = unit(1, "cm"),
          legend.justification = "center")
dev.off()

####
# Compare differences between all methods and models
####

# Create a function to mirror pred_comp results
mirror_comp<-function(x){
  x_sym <- x
  x_sym$Model_1 <- x$Model_2
  x_sym$Model_2 <- x$Model_1
  x_sym$Model_1_R <- x$Model_2_R
  x_sym$Model_2_R <- x$Model_1_R
  x_sym$R_diff <- -x_sym$R_diff
  x_mirrored <- rbind(x, x_sym)
  x_diag<-data.frame(
      Model_1=unique(x_mirrored$Model_1),
      Model_2=unique(x_mirrored$Model_1),
      Model_1_R=x_mirrored$Model_1_R,
      Model_2_R=x_mirrored$Model_1_R,
      R_diff=NA,
      R_diff_pval=NA
    )
  x_comp<-rbind(x_mirrored, x_diag)
  return(x_comp)
}
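
# Hypothetical usage example: a single comparison row is mirrored and a diagonal added
toy_comp <- data.frame(
  Model_1 = 'methodA', Model_2 = 'methodB',
  Model_1_R = 0.30, Model_2_R = 0.25,
  R_diff = 0.05, R_diff_pval = 0.04
)
mirror_comp(toy_comp)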
  
# Read in results
res_comp <- list()
for(pheno_i in selected_traits){
  res_comp_i<-NULL
  for (n_i in c(5, 15, 45, 135)) {
    comp_i <-
      fread(
        paste0(
          '/users/k1806347/oliverpainfel/Analyses/crosspop/subsampled/',
          'targ_',
          targ_pop_i,
          '.disc_EUR_',
          disc_pop_i,
          '/',
          pheno_i,
          '/res_n', n_i, '.pred_comp.txt'
        )
      )
    comp_i<-mirror_comp(comp_i)
    comp_i$Target<-targ_pop_i
    comp_i$gwas_group<-paste0('EUR+', disc_pop_i)
    comp_i$UKB_GWAS_N<-paste0(n_i,'k')
    res_comp_i<-rbind(res_comp_i, comp_i)
  }
  
  res_comp[[pheno_i]]<-res_comp_i
}

res_comp_all <- do.call(rbind, lapply(names(res_comp), function(name) {
  x <- res_comp[[name]]
  x$pheno <- name  # Add a new column with the name of the element
  x  # Return the updated dataframe
}))

# Annotate tests to get order correct
res_comp_all$Method1<-sub('\\..*','',res_comp_all$Model_1)
res_comp_all$Method1<-gsub('-.*','', res_comp_all$Method1)
res_comp_all$Method2<-sub('\\..*','',res_comp_all$Model_2)
res_comp_all$Method2<-gsub('-.*','', res_comp_all$Method2)

find_model<-function(x){
  mod <- x
  mod[grepl('top1$', x) & !grepl('pseudo', x)] <- 'IndivTune'
  mod[grepl('top1$', x) & grepl('pseudo', x)] <- 'SumStatTune'
  mod[grepl('multi$', x) & !grepl('pseudo', x)] <- 'Multi-IndivTune'
  mod[grepl('multi$', x) & grepl('pseudo', x)] <- 'Multi-SumStatTune'
  mod[grepl('_multi', x)] <- 'SumStatTune'
  mod[x == 'prscsx.pseudo.multi'] <- 'SumStatTune'
  mod[x == 'xwing.pseudo.multi'] <- 'SumStatTune'
  
  return(mod)
}

res_comp_all$Model1<-find_model(res_comp_all$Model_1)
res_comp_all$Model2<-find_model(res_comp_all$Model_2)

res_comp_all$Source1<-ifelse(res_comp_all$Method1 %in% pgs_group_methods | grepl('_multi$', res_comp_all$Method1) | !grepl('AFR|EAS|EUR', res_comp_all$Model_1), 'Multi', 'Single')
res_comp_all$Source2<-ifelse(res_comp_all$Method2 %in% pgs_group_methods | grepl('_multi$', res_comp_all$Method2) | !grepl('AFR|EAS|EUR', res_comp_all$Model_2), 'Multi', 'Single')
  
for(i in c('EUR','EAS','AFR')){
  res_comp_all$Discovery1[grepl(i, res_comp_all$Model_1)] <- i
  res_comp_all$Discovery2[grepl(i, res_comp_all$Model_2)] <- i
}
res_comp_all$Discovery1[res_comp_all$Source1 == 'Multi'] <- res_comp_all$gwas_group[res_comp_all$Source1 == 'Multi']
res_comp_all$Discovery2[res_comp_all$Source2 == 'Multi'] <- res_comp_all$gwas_group[res_comp_all$Source2 == 'Multi']

res_comp_all$Method1<-factor(res_comp_all$Method1, levels=unique(res_comp_all$Method1))
res_comp_all$Method2<-factor(res_comp_all$Method2, levels=unique(res_comp_all$Method2))
res_comp_all$Model1<-factor(res_comp_all$Model1, levels=c('IndivTune','SumStatTune','Multi-IndivTune','Multi-SumStatTune'))
res_comp_all$Model2<-factor(res_comp_all$Model2, levels=c('IndivTune','SumStatTune','Multi-IndivTune','Multi-SumStatTune'))
res_comp_all$Discovery1<-factor(res_comp_all$Discovery1, levels=rev(c('AFR','EAS','EUR','EUR+AFR','EUR+EAS')))
res_comp_all$Discovery2<-factor(res_comp_all$Discovery2, levels=c('AFR','EAS','EUR','EUR+AFR','EUR+EAS'))

# Remove IndivTune and Multi-IndivTune model for groups that contain one score (aka QuickPRS and SBayesRC)
res_comp_all <- res_comp_all[
!(res_comp_all$Method1 %in%  c('quickprs','sbayesrc') & 
  res_comp_all$Model1 %in% c('IndivTune','Multi-IndivTune')),]
res_comp_all <- res_comp_all[
!(res_comp_all$Method2 %in%  c('quickprs','sbayesrc') & 
  res_comp_all$Model2 %in% c('IndivTune','Multi-IndivTune')),]

# Remove pseudo model for methods that don't really have one 
res_comp_all <- res_comp_all[
!(res_comp_all$Method1 %in%  c('ptclump','ptclump_multi') & 
  res_comp_all$Model1 %in% c('SumStatTune','Multi-SumStatTune')),]
res_comp_all <- res_comp_all[
!(res_comp_all$Method2 %in%  c('ptclump','ptclump_multi') & 
  res_comp_all$Model2 %in% c('SumStatTune','Multi-SumStatTune')),]

# Remove top1 models for PRS-CSx
res_comp_all <- res_comp_all[
!(grepl('prscsx|xwing|_multi', res_comp_all$Method1) & 
  grepl('top1', res_comp_all$Model_1)),]
res_comp_all <- res_comp_all[
!(grepl('prscsx|xwing|_multi', res_comp_all$Method2) & 
  grepl('top1', res_comp_all$Model_2)),]

# Remove any duplicate comparisons
res_comp_all <- res_comp_all[!duplicated(res_comp_all[, c("Target", "gwas_group", "Method1", "Model1", "Source1", "Discovery1", "Method2", "Model2", "Source2", "Discovery2",'pheno','UKB_GWAS_N')]),]

res_comp_all$r_diff_rel <- res_comp_all$R_diff / res_comp_all$Model_2_R

#####
# Export a csv containing difference results for all traits
#####
# Simplify to contain only IndivTune or SumStatTune result
tmp <- res_comp_all
tmp <- merge(tmp, pgs_method_labels, by.x = 'Method1', by.y = 'method', all.x = T)
tmp$label[is.na(tmp$label)] <- 'All'
names(tmp)[names(tmp) == 'label'] <- 'label1'
tmp <- merge(tmp, pgs_method_labels, by.x = 'Method2', by.y = 'method', all.x = T)
tmp$label[is.na(tmp$label)] <- 'All'
names(tmp)[names(tmp) == 'label'] <- 'label2'

tmp$label1[grepl('Multi', tmp$Model1) & !(tmp$Method1 %in% pgs_group_methods) & tmp$label1 != 'All'] <- paste0(tmp$label1[grepl('Multi', tmp$Model1) & !(tmp$Method1 %in% pgs_group_methods) & tmp$label1 != 'All'], '-multi')
tmp$label2[grepl('Multi', tmp$Model2) & !(tmp$Method2 %in% pgs_group_methods) & tmp$label2 != 'All'] <- paste0(tmp$label2[grepl('Multi', tmp$Model2) & !(tmp$Method2 %in% pgs_group_methods) & tmp$label2 != 'All'], '-multi')

tmp$Model1[tmp$Model1 != 'SumStatTune'] <- 'IndivTune'
tmp$Model1[tmp$Model1 == 'SumStatTune'] <- 'SumStatTune'
tmp$Model2[tmp$Model2 != 'SumStatTune'] <- 'IndivTune'
tmp$Model2[tmp$Model2 == 'SumStatTune'] <- 'SumStatTune'

tmp<-tmp[tmp$Model_1 %in% res_eval_simp$Group,]
tmp<-tmp[tmp$Model_2 %in% res_eval_simp$Group,]

tmp$`Model 1` <- paste0(tmp$label1, ' - ', tmp$Model1, ' - ', tmp$Discovery1)
tmp$`Model 2` <- paste0(tmp$label2, ' - ', tmp$Model2, ' - ', tmp$Discovery2)

tmp <- tmp[, c('Target', 'pheno', 'Model 1', 'Model 2', 'Model_1_R', 'Model_2_R', 'R_diff', 'R_diff_pval'), with=F]
names(tmp) <- c('Target', 'Trait','Model 1', 'Model 2', "R (Model 1)", "R (Model 2)", "R difference (Model 1 R - Model 2 R)", "R difference p-value")

tmp<-tmp[order(tmp$Target, tmp$Trait, tmp$`Model 1`, tmp$`Model 2`),]
tmp$`R difference (Model 1 R - Model 2 R)` <- round(tmp$`R difference (Model 1 R - Model 2 R)`, 3)
tmp$`R (Model 1)` <- round(tmp$`R (Model 1)`, 3)
tmp$`R (Model 2)` <- round(tmp$`R (Model 2)`, 3)

write.csv(tmp, '~/oliverpainfel/Analyses/crosspop/subsampled/r_diff.csv', row.names=F)

###########

library(MAd)

# Average R across phenotypes
meta_res_comp <- NULL
for (n_i in c(5, 15, 45, 135)) {
  # Subset res_comp for each scenario
  res_comp_i <- res_comp_all[res_comp_all$Target == targ_pop_i & res_comp_all$gwas_group == paste0('EUR+', disc_pop_i)]
  res_comp_i <- res_comp_i[res_comp_i$UKB_GWAS_N == paste0(n_i,'k'), ]
  
  # Calculate diff SE based on p-value
  res_comp_i$R_diff_pval[res_comp_i$R_diff == 0] <- 1-0.001
  res_comp_i$R_diff_pval[res_comp_i$R_diff_pval == 1]<-1-0.001
  res_comp_i$R_diff_z<-qnorm(res_comp_i$R_diff_pval/2)
  res_comp_i$R_diff_SE<-abs(res_comp_i$R_diff/res_comp_i$R_diff_z)
      
  # Average results for each test across phenotypes
  # Use MAd to account for correlation between them
  res_comp_i$Sample<-'A'
  res_comp_i$Group <- paste0(res_comp_i$Model_1, '_vs_', res_comp_i$Model_2)

  for(group_i in unique(res_comp_i$Group)){
    res_comp_group_i <- res_comp_i[res_comp_i$Group == group_i,]
    cors_i <- cors[[targ_pop_i]][unique(res_comp_group_i$pheno), unique(res_comp_group_i$pheno)]
    
    if(res_comp_group_i$Model_1[1] != res_comp_group_i$Model_2[1]){
      
      meta_res_comp_i <-
        agg(
          id = Sample,
          es = R_diff,
          var = R_diff_SE ^ 2,
          cor = cors_i,
          method = "BHHR",
          mod = NULL,
          data = res_comp_group_i
        )
      
      tmp <- res_comp_group_i[1,]
      tmp$pheno <- NULL
      tmp$Model_1_R <-
        meta_res_eval$R[meta_res_eval$Group == tmp$Model_1 &
                          meta_res_eval$Target == targ_pop_i &
                          meta_res_eval$gwas_group == paste0('EUR+', disc_pop_i) & 
                          meta_res_eval$UKB_GWAS_N == paste0(n_i,'k')]
      tmp$Model_2_R <-
        meta_res_eval$R[meta_res_eval$Group == tmp$Model_2 &
                          meta_res_eval$Target == targ_pop_i &
                          meta_res_eval$gwas_group == paste0('EUR+', disc_pop_i) & 
                          meta_res_eval$UKB_GWAS_N == paste0(n_i,'k')]
      tmp$R_diff <- meta_res_comp_i$es
      tmp$R_diff_SE <- sqrt(meta_res_comp_i$var)
      tmp$R_diff_z <- tmp$R_diff / tmp$R_diff_SE
      tmp$R_diff_p <- 2*pnorm(-abs(tmp$R_diff_z))
    } else {
      tmp <- res_comp_group_i[1,]
      tmp$pheno <- NULL
      tmp$R_diff <- NA
      tmp$R_diff_SE <- NA
      tmp$R_diff_z <- NA
      tmp$R_diff_p <- NA
    }
    meta_res_comp <- rbind(meta_res_comp, tmp)
  }
}

meta_res_comp$R_diff_perc <- meta_res_comp$R_diff / meta_res_comp$Model_2_R
  
# Compare QuickPRS-Multi vs QuickPRS to evaluate LEOPARD performance
tmp_quickprs <- meta_res_comp[meta_res_comp$Model_1 == 'quickprs_multi.pseudo.multi' & 
                                meta_res_comp$Model_2 == 'quickprs.pseudo.multi' &
                    meta_res_comp$Target == 'EUR',]

tmp_quickprs[,c('UKB_GWAS_N', 'R_diff_perc', 'R_diff_p'), with = F]

#####
# Export a csv containing difference results averaged across traits
#####
# Simplify to contain only IndivTune or SumStatTune result
tmp <- meta_res_comp
tmp <- merge(tmp, pgs_method_labels, by.x = 'Method1', by.y = 'method', all.x = T)
tmp$label[is.na(tmp$label)] <- 'All'
names(tmp)[names(tmp) == 'label'] <- 'label1'
tmp <- merge(tmp, pgs_method_labels, by.x = 'Method2', by.y = 'method', all.x = T)
tmp$label[is.na(tmp$label)] <- 'All'
names(tmp)[names(tmp) == 'label'] <- 'label2'

tmp$label1[grepl('Multi', tmp$Model1) & !(tmp$Method1 %in% pgs_group_methods) & tmp$label1 != 'All'] <- paste0(tmp$label1[grepl('Multi', tmp$Model1) & !(tmp$Method1 %in% pgs_group_methods) & tmp$label1 != 'All'], '-multi')
tmp$label2[grepl('Multi', tmp$Model2) & !(tmp$Method2 %in% pgs_group_methods) & tmp$label2 != 'All'] <- paste0(tmp$label2[grepl('Multi', tmp$Model2) & !(tmp$Method2 %in% pgs_group_methods) & tmp$label2 != 'All'], '-multi')

tmp$Model1[tmp$Model1 != 'SumStatTune'] <- 'IndivTune'
tmp$Model1[tmp$Model1 == 'SumStatTune'] <- 'SumStatTune'
tmp$Model2[tmp$Model2 != 'SumStatTune'] <- 'IndivTune'
tmp$Model2[tmp$Model2 == 'SumStatTune'] <- 'SumStatTune'

tmp<-tmp[tmp$Model_1 %in% res_eval_simp$Group,]
tmp<-tmp[tmp$Model_2 %in% res_eval_simp$Group,]

tmp$`Model 1` <- paste0(tmp$label1, ' - ', tmp$Model1, ' - ', tmp$Discovery1)
tmp$`Model 2` <- paste0(tmp$label2, ' - ', tmp$Model2, ' - ', tmp$Discovery2)

tmp$`Percentage change (R difference / Model 2 R)` <- paste0(round(tmp$R_diff_perc * 100, 1), '%')

tmp <- tmp[, c('UKB_GWAS_N', 'Target', 'Model 1', 'Model 2', 'Model_1_R', 'Model_2_R', 'R_diff',"Percentage change (R difference / Model 2 R)", 'R_diff_p'), with=F]
names(tmp) <- c('UKB GWAS N', 'Target','Model 1', 'Model 2', "R (Model 1)", "R (Model 2)", "R difference (Model 1 R - Model 2 R)", "Percentage change (R difference / Model 2 R)", "R difference p-value")

tmp<-tmp[order(tmp$Target, tmp$`Model 1`, tmp$`Model 2`),]
tmp$`R difference (Model 1 R - Model 2 R)` <- round(tmp$`R difference (Model 1 R - Model 2 R)`, 3)
tmp$`R (Model 1)` <- round(tmp$`R (Model 1)`, 3)
tmp$`R (Model 2)` <- round(tmp$`R (Model 2)`, 3)

write.csv(tmp, '~/oliverpainfel/Analyses/crosspop/subsampled/r_diff_average.csv', row.names=F)

####
# Plot relative improvement of methods
####
# Use the QuickPRS-Multi (Multi-SumStatTune) model as the reference for each UKB_GWAS_N

meta_res_comp_ptclump_top1<-meta_res_comp[meta_res_comp$Method2 == 'quickprs' & meta_res_comp$Source2 == 'Multi' & meta_res_comp$Model2 == 'Multi-SumStatTune',]
meta_res_comp_ptclump_top1$reference_point<-F
meta_res_comp_ptclump_top1$reference_point[meta_res_comp_ptclump_top1$Method1 == 'quickprs' & meta_res_comp_ptclump_top1$Source1 == 'Multi' & meta_res_comp_ptclump_top1$Model1 == 'Multi-SumStatTune']<-T
meta_res_comp_ptclump_top1$R_diff[is.na(meta_res_comp_ptclump_top1$R_diff)]<-0
meta_res_comp_ptclump_top1$Discovery1 <- factor(meta_res_comp_ptclump_top1$Discovery1, levels=rev(levels(meta_res_comp_ptclump_top1$Discovery1)))

res_comp_all_ptclump_top1<-res_comp_all[res_comp_all$Method2 == 'quickprs' & res_comp_all$Source2 == 'Multi' & res_comp_all$Model2 == 'Multi-SumStatTune',]
res_comp_all_ptclump_top1$Discovery1 <-  factor(res_comp_all_ptclump_top1$Discovery1, levels=levels(meta_res_comp_ptclump_top1$Discovery1))

# Create data to plot reference points
meta_res_comp_reference <- meta_res_comp_ptclump_top1
meta_res_comp_reference$R_diff[meta_res_comp_ptclump_top1$reference_point == F] <- NA
meta_res_comp_reference$R_diff_SE [meta_res_comp_ptclump_top1$reference_point == F] <- NA
res_comp_all_ptclump_top1$reference_point<-F

meta_tmp <- meta_res_comp_ptclump_top1
meta_tmp <- merge(meta_tmp, pgs_method_labels, by.x = 'Method1', by.y = 'method', all.x = T)
meta_tmp$label[is.na(meta_tmp$label)] <- 'All'
meta_tmp$label[grepl('Multi', meta_tmp$Model1) & !(meta_tmp$Method1 %in% pgs_group_methods) & meta_tmp$label != 'All'] <- paste0(meta_tmp$label[grepl('Multi', meta_tmp$Model1) & !(meta_tmp$Method1 %in% pgs_group_methods) & meta_tmp$label != 'All'], '-multi')
meta_tmp$label <- factor(meta_tmp$label, levels = model_order)
meta_tmp$Discovery_clean <- as.character(meta_tmp$Discovery1)
meta_tmp$Discovery_clean[meta_tmp$Discovery1 == 'EUR'] <- 'EUR GWAS'
meta_tmp$Discovery_clean[meta_tmp$Discovery1 != 'EUR' & meta_tmp$Source1 == 'Single'] <- 'EAS GWAS'
meta_tmp$Discovery_clean[meta_tmp$Discovery1 != 'EUR' & meta_tmp$Source1 == 'Multi'] <- 'Both'
meta_tmp$Discovery_clean <- factor(meta_tmp$Discovery_clean, 
                              levels = c('EUR GWAS',
                                         'EAS GWAS',
                                         'Both'))
meta_tmp$Target <- paste0(meta_tmp$Target, ' Target')
meta_tmp$Model1 <- factor(meta_tmp$Model1, levels = c('IndivTune','SumStatTune','Multi-IndivTune','Multi-SumStatTune'))
meta_tmp$UKB_GWAS_N <- factor(meta_tmp$UKB_GWAS_N, levels = unique(meta_tmp$UKB_GWAS_N))

meta_tmp_ref <- meta_res_comp_reference
meta_tmp_ref <- merge(meta_tmp_ref, pgs_method_labels, by.x = 'Method1', by.y = 'method', all.x = T)
meta_tmp_ref$label[is.na(meta_tmp_ref$label)] <- 'All'
meta_tmp_ref$label[grepl('Multi', meta_tmp_ref$Model1) & !(meta_tmp_ref$Method1 %in% pgs_group_methods) & meta_tmp_ref$label != 'All'] <- paste0(meta_tmp_ref$label[grepl('Multi', meta_tmp_ref$Model1) & !(meta_tmp_ref$Method1 %in% pgs_group_methods) & meta_tmp_ref$label != 'All'], '-multi')
meta_tmp_ref$label <- factor(meta_tmp_ref$label, levels = model_order)
meta_tmp_ref$Discovery_clean <- as.character(meta_tmp_ref$Discovery1)
meta_tmp_ref$Discovery_clean[meta_tmp_ref$Discovery1 == 'EUR'] <- 'EUR GWAS'
meta_tmp_ref$Discovery_clean[meta_tmp_ref$Discovery1 != 'EUR' & meta_tmp_ref$Source1 == 'Single'] <- 'EAS GWAS'
meta_tmp_ref$Discovery_clean[meta_tmp_ref$Discovery1 != 'EUR' & meta_tmp_ref$Source1 == 'Multi'] <- 'Both'
meta_tmp_ref$Discovery_clean <- factor(meta_tmp_ref$Discovery_clean, 
                              levels = c('EUR GWAS',
                                         'EAS GWAS',
                                         'Both'))
meta_tmp_ref$Target <- paste0(meta_tmp_ref$Target, ' Target')
meta_tmp_ref$Model1 <- factor(meta_tmp_ref$Model1, levels = c('IndivTune','SumStatTune','Multi-IndivTune','Multi-SumStatTune'))
meta_tmp_ref$UKB_GWAS_N <- factor(meta_tmp_ref$UKB_GWAS_N, levels = unique(meta_tmp_ref$UKB_GWAS_N))

tmp <- res_comp_all_ptclump_top1
tmp <- merge(tmp, pgs_method_labels, by.x = 'Method1', by.y = 'method', all.x = T)
tmp$label[is.na(tmp$label)] <- 'All'
tmp$label[grepl('Multi', tmp$Model1) & !(tmp$Method1 %in% pgs_group_methods) & tmp$label != 'All'] <- paste0(tmp$label[grepl('Multi', tmp$Model1) & !(tmp$Method1 %in% pgs_group_methods) & tmp$label != 'All'], '-multi')
tmp$label <- factor(tmp$label, levels = model_order)
tmp$Discovery_clean <- as.character(tmp$Discovery1)
tmp$Discovery_clean[tmp$Discovery1 == 'EUR'] <- 'EUR GWAS'
tmp$Discovery_clean[tmp$Discovery1 != 'EUR' & tmp$Source1 == 'Single'] <- 'EAS GWAS'
tmp$Discovery_clean[tmp$Discovery1 != 'EUR' & tmp$Source1 == 'Multi'] <- 'Both'
tmp$Discovery_clean <- factor(tmp$Discovery_clean, 
                              levels = c('EUR GWAS',
                                         'EAS GWAS',
                                         'Both'))
tmp$Target <- paste0(tmp$Target, ' Target')
tmp$Model1 <- factor(tmp$Model1, levels = c('IndivTune','SumStatTune','Multi-IndivTune','Multi-SumStatTune'))
tmp$UKB_GWAS_N <- factor(tmp$UKB_GWAS_N, levels = unique(tmp$UKB_GWAS_N))

ggplot(meta_tmp, aes(x=label, y=R_diff , fill = Model1)) +
    geom_point(
        data = tmp,
        mapping = aes(x=label, y=R_diff, colour=Model1),
        position = position_jitterdodge(jitter.width = 0.2, dodge.width = 0.7),
        alpha = 0.3
      ) +
    geom_hline(yintercept = 0) +
      geom_errorbar(
        aes(
          ymin = R_diff - R_diff_SE,
          ymax = R_diff + R_diff_SE
        ),
        width = 0,
        position = position_dodge(width = 0.7)
      ) +    
    geom_point(
      stat = "identity",
      position = position_dodge(0.7),
      size = 3,
      shape = 23
    ) +
    geom_point(
        data = meta_tmp_ref,
        aes(x = label, y = R_diff, fill = Model1),
        stat = "identity",
        position = position_dodge(0.7), # Ensure same dodge as other points
        size = 3,    # Increase size for emphasis
        shape = 22,
        stroke = 1.5,
        show.legend=F
    ) +
    geom_vline(xintercept = seq(1.5, length(unique(meta_tmp$label))), linetype="dotted") +
    labs(y = "R_diff (SE)") +
    facet_grid(UKB_GWAS_N ~ Discovery_clean, scales = 'free_x', space = 'free_x') +
    theme_half_open() +
    background_grid() + 
    panel_border() + 
    theme(axis.text.x = element_text(angle = 45, vjust = 1, hjust=1))


# Plot as % change
meta_tmp$R_diff_perc <- meta_tmp$R_diff / meta_tmp$Model_2_R
meta_tmp_ref$R_diff_perc <- meta_tmp_ref$R_diff / meta_tmp_ref$Model_2_R
tmp$R_diff_perc <- tmp$R_diff / tmp$Model_2_R

meta_tmp$R_diff_perc_SE <- meta_tmp$R_diff_SE / meta_tmp$Model_2_R

library(scales)
ggplot(meta_tmp, aes(x=label, y=R_diff_perc , fill = Model1)) +
    geom_point(
        data = tmp,
        mapping = aes(x=label, y=R_diff_perc, colour=Model1),
        position = position_jitterdodge(jitter.width = 0.2, dodge.width = 0.7),
        alpha = 0.3
      ) +
    geom_hline(yintercept = 0) +
      geom_errorbar(
        aes(
          ymin = R_diff_perc - R_diff_perc_SE,
          ymax = R_diff_perc + R_diff_perc_SE
        ),
        width = 0,
        position = position_dodge(width = 0.7)
      ) +    
    geom_point(
      stat = "identity",
      position = position_dodge(0.7),
      size = 3,
      shape = 23
    ) +
    geom_point(
        data = meta_tmp_ref,
        aes(x = label, y = R_diff_perc, fill = Model1),
        stat = "identity",
        position = position_dodge(0.7), # Ensure same dodge as other points
        size = 3,    # Increase size for emphasis
        shape = 22,
        stroke = 1.5,
        show.legend=F
    ) +
    scale_y_continuous(labels = percent_format()) +
    geom_vline(xintercept = seq(1.5, length(unique(meta_tmp$label))), linetype="dotted") +
    labs(y = "R diff. (SE)") +
    facet_grid(UKB_GWAS_N ~ Discovery_clean, scales = 'free_x', space = 'free_x') +
    theme_half_open() +
    background_grid() + 
    panel_border() + 
    theme(axis.text.x = element_text(angle = 45, vjust = 1, hjust=1))

# Simplify results showing results only with or without training data
meta_tmp_simple <- meta_tmp
meta_tmp_simple$Model1[meta_tmp_simple$Model1 != 'SumStatTune'] <- 'IndivTune'
meta_tmp_simple$Model1[meta_tmp_simple$Model1 == 'SumStatTune'] <- 'SumStatTune'
meta_tmp_simple$Model2[meta_tmp_simple$Model2 != 'SumStatTune'] <- 'IndivTune'
meta_tmp_simple$Model2[meta_tmp_simple$Model2 == 'SumStatTune'] <- 'SumStatTune'
meta_tmp_simple<-meta_tmp_simple[meta_tmp_simple$Model_1 %in% res_eval_simp$Group,]
meta_tmp_simple<-meta_tmp_simple[meta_tmp_simple$Model_2 %in% res_eval_simp$Group,]

meta_tmp_ref_simple <- meta_tmp_ref
meta_tmp_ref_simple$Model1[meta_tmp_ref_simple$Model1 != 'SumStatTune'] <- 'IndivTune'
meta_tmp_ref_simple$Model1[meta_tmp_ref_simple$Model1 == 'SumStatTune'] <- 'SumStatTune'
meta_tmp_ref_simple$Model2[meta_tmp_ref_simple$Model2 != 'SumStatTune'] <- 'IndivTune'
meta_tmp_ref_simple$Model2[meta_tmp_ref_simple$Model2 == 'SumStatTune'] <- 'SumStatTune'
meta_tmp_ref_simple<-meta_tmp_ref_simple[meta_tmp_ref_simple$Model_1 %in% res_eval_simp$Group,]
meta_tmp_ref_simple<-meta_tmp_ref_simple[meta_tmp_ref_simple$Model_2 %in% res_eval_simp$Group,]

tmp_simple <- tmp
tmp_simple$Model1[tmp_simple$Model1 != 'SumStatTune'] <- 'IndivTune'
tmp_simple$Model1[tmp_simple$Model1 == 'SumStatTune'] <- 'SumStatTune'
tmp_simple$Model2[tmp_simple$Model2 != 'SumStatTune'] <- 'IndivTune'
tmp_simple$Model2[tmp_simple$Model2 == 'SumStatTune'] <- 'SumStatTune'
tmp_simple<-tmp_simple[tmp_simple$Model_1 %in% res_eval_simp$Group,]
tmp_simple<-tmp_simple[tmp_simple$Model_2 %in% res_eval_simp$Group,]

# Export plot for manuscript
png('~/oliverpainfel/Analyses/crosspop/subsampled/plots/average_r.perc_improv.png', width = 3200, height = 2000, res= 300, units = 'px')
ggplot(meta_tmp_simple[meta_tmp_simple$Target == 'EUR Target',], aes(x=label, y=R_diff_perc , fill = Model1)) +
#    geom_boxplot(
#      data = tmp_simple[tmp_simple$Target != 'EUR Target',],
#        mapping = aes(x=label, y=R_diff_perc, colour=Model1),
#        position = position_dodge(0.7),
#        alpha = 0.3
#      ) +
    geom_point(
        data = tmp_simple[tmp_simple$Target != 'EUR Target',],
        mapping = aes(x=label, y=R_diff_perc, colour=Model1),
        position = position_jitterdodge(jitter.width = 0.2, dodge.width = 0.7),
        alpha = 0.3
      ) +
    geom_hline(yintercept = 0) +
      geom_errorbar(
        aes(
          ymin = R_diff_perc - R_diff_perc_SE,
          ymax = R_diff_perc + R_diff_perc_SE
        ),
        width = 0,
        position = position_dodge(width = 0.7)
      ) +    
    geom_point(
      stat = "identity",
      position = position_dodge(0.7),
      size = 3,
      shape = 23
    ) +
    geom_point(
        data = meta_tmp_ref_simple[meta_tmp_ref_simple$Target != 'EUR Target',],
        aes(x = label, y = R_diff_perc, fill = Model1),
        stat = "identity",
        position = position_dodge(0.7), # Ensure same dodge as other points
        size = 4,
        shape = 22,
        show.legend=F
    ) +
    geom_vline(xintercept = seq(1.5, length(unique(meta_tmp_simple$label))), linetype="dotted") +
    scale_y_continuous(labels = percent_format()) +
    labs(y = "Relative Improvement (SE)", fill = NULL, colour = NULL, x = NULL) +
    facet_grid(UKB_GWAS_N ~ Discovery_clean, scales = 'free_x', space = 'free_x') +
    theme_half_open() +
    background_grid(major = 'y', minor = 'y') + 
    panel_border() + 
    theme(
        axis.text.x = element_text(angle = 45, vjust = 1, hjust = 1),  # Increase x-axis labels
        legend.position = "top",
        legend.key.spacing.x = unit(2, "cm"),
        legend.justification = "center"
    )
dev.off()

# Export plot for manuscript comparing SumStatTune vs IndivTune for QuickPRS
png('~/oliverpainfel/Analyses/crosspop/subsampled/plots/average_r.perc_improv.quickprs.png', width = 1500, height = 1200, res = 300, units = 'px')
ggplot(meta_tmp_simple[meta_tmp_simple$Target == 'EUR Target' & meta_tmp_simple$Discovery_clean == 'Both',], aes(x=label, y=R_diff_perc , fill = Model1)) +
#    geom_boxplot(
#      data = tmp_simple[tmp_simple$Target != 'EUR Target',],
#        mapping = aes(x=label, y=R_diff_perc, colour=Model1),
#        position = position_dodge(0.7),
#        alpha = 0.3
#      ) +
    geom_point(
        data = tmp_simple[tmp_simple$Target != 'EUR Target',],
        mapping = aes(x=label, y=R_diff_perc, colour=Model1),
        position = position_jitterdodge(jitter.width = 0.2, dodge.width = 0.7),
        alpha = 0.3
      ) +
    geom_hline(yintercept = 0) +
      geom_errorbar(
        aes(
          ymin = R_diff_perc - R_diff_perc_SE,
          ymax = R_diff_perc + R_diff_perc_SE
        ),
        width = 0,
        position = position_dodge(width = 0.7)
      ) +    
    geom_point(
      stat = "identity",
      position = position_dodge(0.7),
      size = 3,
      shape = 23
    ) +
    geom_point(
        data = meta_tmp_ref_simple[meta_tmp_ref_simple$Target != 'EUR Target',],
        aes(x = label, y = R_diff_perc, fill = Model1),
        stat = "identity",
        position = position_dodge(0.7), # Ensure same dodge as other points
        size = 4,
        shape = 22,
        show.legend=F
    ) +
    scale_y_continuous(labels = percent_format()) +
    labs(y = "Relative Improvement (SE)", fill = NULL, colour = NULL, x = NULL) +
    facet_grid(. ~ UKB_GWAS_N, scales = 'free_x', space = 'free_x') +
    theme_half_open() +
    background_grid(major = 'y', minor = 'y') + 
    panel_border() + 
    theme(
        axis.text.x = element_text(angle = 45, vjust = 1, hjust = 1),  # Rotate x-axis labels
        legend.position = "top",
        legend.key.spacing.x = unit(2, "cm"),
        legend.justification = "center"
    )
dev.off()
Show results


LEOPARD+QuickPRS

Here we will compare the LEOPARD-estimated weights for population-specific PGS to the weights estimated using observed individual-level data in the UKB target sample.
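
To make the comparison concrete, these weights are linear mixing coefficients applied to the population-specific PGS. The sketch below is illustrative only (not pipeline code), assuming two hypothetical standardised score columns and an example weight.

# Minimal sketch of how mixing weights combine population-specific PGS.
# pgs_eur, pgs_eas and w_eur are hypothetical placeholders.
set.seed(1)
toy <- data.frame(
  IID     = paste0('id', 1:5),
  pgs_eur = rnorm(5), # standardised PGS from the EUR GWAS
  pgs_eas = rnorm(5)  # standardised PGS from the EAS GWAS
)

w_eur <- 0.7       # weight estimated by LEOPARD or from observed target data
w_eas <- 1 - w_eur # weights sum to 1

toy$pgs_combined <- w_eur * toy$pgs_eur + w_eas * toy$pgs_eas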

Show code
setwd('/users/k1806347/oliverpainfel/Software/MyGit/GenoPred/pipeline/')

library(data.table)
library(ggplot2)
library(cowplot)
library(scales) # for hue_pal()

source('../functions/misc.R')
source_all('../functions')

# Read in list of outcomes 
selected_traits<-fread('/users/k1806347/oliverpainfel/Analyses/crosspop/trait_subset.txt', header=F)$V1

# Get some key variables from config
config<-'/users/k1806347/oliverpainfel/Data/ukb/GenoPred/configs/crosspop/config_subsampled.yaml'
outdir <- read_param(config = config, param = 'outdir', return_obj = F)

# Get a list of score files
scores <- list_score_files(config)

###
# Read in weights estimated by LEOPARD (QuickPRS)
###

leopard_weights<-NULL
scores_quickprs <- scores$name[scores$method == 'quickprs_multi']
for(i in selected_traits){
  for(n in c('5','15','45','135')){
    scores_i <- scores_quickprs[grepl(paste0('^', i,'_'), scores_quickprs) & grepl(paste0('_', n,'K_'), scores_quickprs)]
    for(j in scores_i){
        weights_file <- readRDS(paste0(outdir, '/reference/pgs_score_files/leopard/', j, '/ref-', j, '.weights.rds'))
        weights_file <- data.frame(weights_file)
        
        weights <-
          data.table(
            Target = do.call(c, lapply(names(weights_file), function(x) rep(x, 2))),
            Discovery = names(weights_file),
            Weight = do.call(c, lapply(weights_file, function(x) x)),
            UKB_GWAS_N = paste0(n, 'k'),
            Trait = i,
            Method = 'LEOPARD'
          )
        
        leopard_weights <- rbind(leopard_weights, weights)
    }
  }
}

leopard_weights<-leopard_weights[leopard_weights$Target == 'EUR' & leopard_weights$Discovery == 'EUR',]

#####
# Read in the PGS weights estimated using UKB data
#####
# Read in the final model coefficients for multi-source methods

obs_weights<-NULL
for(method_i in unique(scores$method)[!(unique(scores$method) %in% pgs_group_methods)]){
  scores_method<-scores$name[scores$method == method_i]
  method_i <- gsub('_multi','', method_i)

  for(i in selected_traits){
    for(j in c('EUR')){
      if(j == 'EUR'){
        pops <- c('EAS')
      } else {
        pops <- j
      }
      
      for(k in pops){
          for(n in c('5','15','45','135')){
          model <- fread(paste0('~/oliverpainfel/Analyses/crosspop/subsampled/targ_', j, '.disc_EUR_', k, '/', i, '/res', n, '_final_models/', method_i, '.pseudo.multi.final_model.txt'))
          model<-model[-1,]
          
          # If any weight is negative, set negative weights to zero and positive weights to one, as this matches LEOPARD's handling of negative weights
          if(any(model$V2 < 0)){
            model$V2[model$V2 < 0] <- 0
            model$V2[model$V2 > 0] <- 1
          }
          
          names(model) <- c('x', 'BETA')
          model$Discovery[grepl('UKB', model$x)]<-'EUR'
          model$Discovery[grepl('BBJ', model$x)]<-'EAS'
          model$Discovery[grepl('UGR', model$x)]<-'AFR'
          model$UKB_GWAS_N<-paste0(n,'k')
          model$Target <- j
          model$Weight <- model$BETA/sum(model$BETA)
          model$Trait <- i
          model$Method <- method_i
          model<-model[,c('Target','Discovery','Weight','Method','UKB_GWAS_N','Trait'), with=F]
          obs_weights<-rbind(obs_weights, model)
        }
      }
    }
  }
}

obs_weights<-obs_weights[obs_weights$Target == 'EUR' & obs_weights$Discovery == 'EUR',]

###
# Estimate weights if using the inverse variance weighting
###
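# Note: for standardised effect estimates the sampling variance is roughly
# proportional to 1/N, so inverse-variance weights reduce to sample-size
# weights, i.e. w_EUR = N_EUR / (N_EUR + N_EAS), as computed below.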

# Read in GWAS descriptives
gwas_desc<-fread('/users/k1806347/oliverpainfel/Analyses/crosspop/gwas_descriptives.csv')
gwas_desc <- gwas_desc[, c('Trait Label','Ancestry','GWAS N'), with=F]
names(gwas_desc)<-c('trait','ancestry','n')
gwas_desc<-gwas_desc[gwas_desc$trait %in% selected_traits,]

gwas_desc_eas <- gwas_desc[gwas_desc$ancestry == 'EAS',]
gwas_desc_eas$name<-'BBJ'

gwas_desc_eur <- data.frame(
  trait = gwas_desc_eas$trait,
  ancestry = 'EUR',
  n = c(rep(5000, 10), rep(15000, 10), rep(45000, 10), rep(135000, 10))
)

gwas_desc<-merge(gwas_desc_eas, gwas_desc_eur, by='trait')
gwas_desc$inverse_var <- gwas_desc$n.y / (gwas_desc$n.y + gwas_desc$n.x)

gwas_desc$Target <- 'EUR'
gwas_desc$Discovery <- 'EUR'
gwas_desc$Weight <- gwas_desc$inverse_var
gwas_desc$Method <- 'inverse_var'
gwas_desc$UKB_GWAS_N <- paste0(gwas_desc$n.y/1000,'k')
gwas_desc$Trait <- gwas_desc$trait

gwas_desc <- gwas_desc[, names(obs_weights), with=F]

###
# Combine and compare
###

both <- do.call(rbind, list(obs_weights, leopard_weights, gwas_desc))

both<-merge(both, pgs_method_labels, by.x = 'Method', by.y = 'method', all.x=T, sort = F)
both$label[is.na(both$label)] <- both$Method[is.na(both$label)]
both$label <- factor(both$label, levels=unique(both$label))

# Plot EUR target
tmp <- both[both$Target == 'EUR',]
tmp <- tmp[tmp$Discovery == 'EUR',]

# Set LEOPARD to black fill
default_colors <- hue_pal()(10)
names(default_colors) <- levels(tmp$label)
default_colors["LEOPARD"] <- "black"

tmp$UKB_GWAS_N <- factor(tmp$UKB_GWAS_N, levels = unique(tmp$UKB_GWAS_N))

# Plot the estimated and observed weights
png('~/oliverpainfel/Analyses/crosspop/subsampled/plots/leopard_weights_eur.png', units = 'px', res = 300, width = 3500, height = 1500)
ggplot(tmp, aes(x = Trait, y = Weight, fill = label)) +
  geom_bar(width= 0.7, position=position_dodge(0.7), stat="identity", colour = 'black', size = 0.1) +
  scale_fill_manual(values = default_colors) +
  facet_grid(. ~ UKB_GWAS_N) +
  theme_half_open() +
  labs(title = 'Weight of EUR PGS for EUR Target', fill = NULL) +
  background_grid(major = 'y', minor = 'y') + 
  panel_border() +
  theme(axis.text.x = element_text(angle = 45, hjust = 1)) +
  ylim(c(0,1))
dev.off()

###
# Check calibration of LEOPARD compared to QuickPRS observed weights
###

tmp <- both[both$Target == 'EUR',]
tmp <- tmp[tmp$Discovery == 'EUR',]
tmp$Target<-NULL
tmp_wide <- reshape(tmp, 
                     idvar = c("Trait", "Discovery","UKB_GWAS_N"), 
                     timevar = "label", 
                     direction = "wide")

names(tmp_wide) <- gsub('Weight.', '', names(tmp_wide))
tmp_wide<-tmp_wide[, !(grepl('Method', names(tmp_wide))), with = F]

# Calculate metrics
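# RMSE = sqrt(mean((QuickPRS - LEOPARD)^2)) summarises the overall discrepancy;
# ME = mean(QuickPRS - LEOPARD), so a positive ME indicates the LEOPARD weights
# are lower than the observed (QuickPRS) weights on average.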
metrics <- NULL
for(n in c('5','15','45','135')){
  tmp_wide_n <- tmp_wide[tmp_wide$UKB_GWAS_N == paste0(n,'k'),]
  metrics<-rbind(metrics, 
                 data.frame(
                   n = paste0(n, 'k'),
                   rmse = sqrt(mean((tmp_wide_n$QuickPRS - tmp_wide_n$LEOPARD)^2)),
                   me = mean(tmp_wide_n$QuickPRS - tmp_wide_n$LEOPARD)
                 ))
}

# Create annotation data.frame
metrics_df <- data.frame(
  UKB_GWAS_N = metrics$n,
  x = 0.1,
  y = 0.15,
  label = paste0("RMSE = ", round(metrics$rmse, 2), "\nME = ", round(metrics$me, 2))
)

tmp_wide$UKB_GWAS_N <- factor(tmp_wide$UKB_GWAS_N, levels = unique(tmp_wide$UKB_GWAS_N))
metrics_df$UKB_GWAS_N <- factor(metrics_df$UKB_GWAS_N, levels = unique(tmp_wide$UKB_GWAS_N))

png('~/oliverpainfel/Analyses/crosspop/subsampled/plots/leopard_weights_calibration.png', units = 'px', width = 2000, height = 2000, res = 300)
ggplot(tmp_wide, aes(x = LEOPARD, y = QuickPRS)) +
  geom_abline(slope = 1, intercept = 0, linetype = "dashed", colour = "grey40") +  # Perfect calibration
  geom_smooth(method = "lm", se = TRUE, colour = "blue") +  # Regression line
  geom_point(alpha = 0.7) +
  geom_text(data = metrics_df, aes(x = x, y = y, label = label), inherit.aes = FALSE, hjust = 0, size = 3.5) +
  labs(
    x = "LEOPARD weight",
    y = "Observed weight"
  ) +
  theme_half_open() +
  panel_border() + 
  theme(
    axis.text.x = element_text(angle = 90, vjust = 0.5, hjust=1)
  ) +
  facet_grid(. ~ UKB_GWAS_N) +
  coord_fixed()    
dev.off()

###
# Check calibration of inverse_var compared to QuickPRS observed weights
###

tmp <- both[both$Target == 'EUR',]
tmp <- tmp[tmp$Discovery == 'EUR',]
tmp$Target<-NULL
tmp_wide <- reshape(tmp, 
                     idvar = c("Trait", "Discovery","UKB_GWAS_N"), 
                     timevar = "label", 
                     direction = "wide")

names(tmp_wide) <- gsub('Weight.', '', names(tmp_wide))
tmp_wide<-tmp_wide[, !(grepl('Method', names(tmp_wide))), with = F]

# Calculate metrics
metrics <- NULL
for(n in c('5','15','45','135')){
  tmp_wide_n <- tmp_wide[tmp_wide$UKB_GWAS_N == paste0(n,'k'),]
  metrics<-rbind(metrics, 
                 data.frame(
                   n = paste0(n, 'k'),
                   rmse = sqrt(mean((tmp_wide_n$QuickPRS - tmp_wide_n$inverse_var)^2)),
                   me = mean(tmp_wide_n$QuickPRS - tmp_wide_n$inverse_var)
                 ))
}

# Create annotation data.frame
metrics_df <- data.frame(
  UKB_GWAS_N = metrics$n,
  x = 0.3,
  y = 0.25,
  label = paste0("RMSE = ", round(metrics$rmse, 2), "\nME = ", round(metrics$me, 2))
)

tmp_wide$UKB_GWAS_N <- factor(tmp_wide$UKB_GWAS_N, levels = unique(tmp_wide$UKB_GWAS_N))
metrics_df$UKB_GWAS_N <- factor(metrics_df$UKB_GWAS_N, levels = unique(tmp_wide$UKB_GWAS_N))

png('~/oliverpainfel/Analyses/crosspop/subsampled/plots/leopard_weights_calibration_inverse_var.png', units = 'px', width = 2000, height = 2000, res = 300)
ggplot(tmp_wide, aes(x = inverse_var, y = QuickPRS)) +
  geom_abline(slope = 1, intercept = 0, linetype = "dashed", colour = "grey40") +  # Perfect calibration
  geom_smooth(method = "lm", se = TRUE, colour = "blue") +  # Regression line
  geom_point(alpha = 0.7) +
  geom_text(data = metrics_df, aes(x = x, y = y, label = label), inherit.aes = FALSE, hjust = 0, size = 3.5) +
  labs(
    x = "inverse_var weight",
    y = "Observed weight"
  ) +
  theme_half_open() +
  panel_border() + 
  theme(
    axis.text.x = element_text(angle = 90, vjust = 0.5, hjust=1)
  ) +
  facet_grid(. ~ UKB_GWAS_N) +
  coord_fixed()    
dev.off()

Using MVP sumstats

Download MVP sumstats

Show code
library(data.table)

mvp <- fread('~/oliverpainfel/Data/GWAS_sumstats/MVP/MVP_sumstats.txt')
mvp_afr <- mvp[grepl('Afr', mvp$discoverySampleAncestry),]
mvp_afr <- mvp_afr[!grepl('Eur|Asi|His', mvp_afr$discoverySampleAncestry),]

# Identify traits in common with UKB, UGR, EAS
prscsx_dat<-fread('/users/k1806347/oliverpainfel/Data/ukb/phenotypes/prscsx/prscsx_data.csv')
selected_traits <-
  fread('/users/k1806347/oliverpainfel/Analyses/crosspop/trait_subset.txt',
        header = F)$V1

prscsx_dat <- prscsx_dat[prscsx_dat$labels %in% selected_traits, ]

# Subset MVP to selected traits
mvp_afr_subset <- mvp_afr[
  mvp_afr$accessionId %in% 
    c(
      'GCST90475361', 'GCST90475375', 'GCST90476298', 'GCST90475155', 'GCST90476462', 'GCST90475457', 'GCST90476423', 'GCST90475528', 'GCST90475351', 'GCST90476402'
    )
, ]

# Insert labels
mvp_afr_subset$labels <- NA
mvp_afr_subset$labels[mvp_afr_subset$efoTraits == 'body mass index'] <- 'BMI'
mvp_afr_subset$labels[mvp_afr_subset$efoTraits == 'body weight'] <- 'BWT'
mvp_afr_subset$labels[mvp_afr_subset$efoTraits == 'high density lipoprotein cholesterol measurement'] <- 'HDL'
mvp_afr_subset$labels[mvp_afr_subset$efoTraits == 'body height'] <- 'HT'
mvp_afr_subset$labels[mvp_afr_subset$efoTraits == 'hemoglobin measurement'] <- 'HB'
mvp_afr_subset$labels[mvp_afr_subset$efoTraits == 'mean corpuscular hemoglobin concentration'] <- 'MCHC'
mvp_afr_subset$labels[mvp_afr_subset$efoTraits == 'neutrophil count'] <- 'NEU'
mvp_afr_subset$labels[mvp_afr_subset$efoTraits == 'platelet count'] <- 'PLT'
mvp_afr_subset$labels[mvp_afr_subset$efoTraits == 'systolic blood pressure'] <- 'SBP'
mvp_afr_subset$labels[mvp_afr_subset$efoTraits == 'total cholesterol measurement'] <- 'TC'

mvp_afr_subset$url <-paste0(mvp_afr_subset$summaryStatistics, '/', mvp_afr_subset$accessionId, '.tsv.gz')

dir.create('~/oliverpainfel/Data/GWAS_sumstats/MVP/AFR')

write.table(
  mvp_afr_subset[, c('url', 'labels'), with = F],
  '~/oliverpainfel/Data/GWAS_sumstats/MVP/AFR/urls.txt',
  row.names = F,
  quote = F,
  col.names = F
)

write.table(
  mvp_afr_subset,
  '~/oliverpainfel/Data/GWAS_sumstats/MVP/AFR/info.txt',
  row.names = F,
  quote = T,
  col.names = T
)

write.table(mvp_afr_subset$labels, '/users/k1806347/oliverpainfel/Analyses/crosspop/sensitivity_mvp/trait_subset.txt', col.names = F, row.names = F, quote = F)
for pheno in $(cat ~/oliverpainfel/Data/GWAS_sumstats/MVP/AFR/urls.txt | cut -d' ' -f 2); do
  url=$(awk -v var="$pheno" '$2 == var {print $1}' ~/oliverpainfel/Data/GWAS_sumstats/MVP/AFR/urls.txt)
  sbatch -p interruptible_cpu,neurohack_cpu -t 1:00:00 --wrap="wget -O ~/oliverpainfel/Data/GWAS_sumstats/MVP/AFR/${pheno}.txt.gz ${url}"
done

PGS calculation

Run leading single-source PGS methods using MVP GWAS sumstats.

Show code

Prepare configuration

library(data.table)

# Subset original gwas_list to include AFR traits
gwas_list<-fread('/users/k1806347/oliverpainfel/Data/ukb/GenoPred/configs/crosspop/gwas_list.txt')
gwas_list <- gwas_list[gwas_list$population == 'AFR',]
selected_traits <- gsub('_UGR', '', gwas_list$name)
gwas_list$name <- gsub('UGR', 'MVP_AFR', gwas_list$name)
gwas_list$label <- gsub('UGR', 'MVP_AFR', gwas_list$label)
gwas_list$path <-
  paste0('/users/k1806347/oliverpainfel/Data/GWAS_sumstats/MVP/AFR/',
         selected_traits,'.txt.gz')

gwas_list_eur<-fread('/users/k1806347/oliverpainfel/Data/ukb/GenoPred/configs/crosspop/gwas_list.txt')
gwas_list_eur<-gwas_list_eur[gwas_list_eur$population == 'EUR', ]

gwas_list <- rbind(gwas_list, gwas_list_eur)

gwas_list$label <- paste0('"', gwas_list$label, '"')

write.table(gwas_list, '/users/k1806347/oliverpainfel/Data/ukb/GenoPred/configs/crosspop/gwas_list_mvp.txt', col.names = T, row.names = F, quote = F)

######
# gwas_groups
######

gwas_groups<-data.frame(
  name=paste0(selected_traits, '_UKB_MVP_AFR'),
  gwas=sapply(selected_traits, function(x) paste0(x,'_UKB,',x,'_MVP_AFR')),
  label=paste0('"', selected_traits, " (UKB+MVP_AFR)", '"')
)

write.table(gwas_groups, '/users/k1806347/oliverpainfel/Data/ukb/GenoPred/configs/crosspop/gwas_groups_mvp.txt', col.names = T, row.names = F, quote = F)

######
# config
######

config<-c(
  "outdir: /users/k1806347/oliverpainfel/Data/ukb/GenoPred/output",
  "config_file: /users/k1806347/oliverpainfel/Data/ukb/GenoPred/configs/crosspop/config_mvp.yaml",
  "gwas_list: /users/k1806347/oliverpainfel/Data/ukb/GenoPred/configs/crosspop/gwas_list_mvp.txt",
  "gwas_groups: /users/k1806347/oliverpainfel/Data/ukb/GenoPred/configs/crosspop/gwas_groups_mvp.txt",
  "target_list: /users/k1806347/oliverpainfel/Data/ukb/GenoPred/configs/basic/target_list.txt",
  "pgs_methods: ['ptclump','quickprs','dbslmm','lassosum','megaprs','prscs','ldpred2','sbayesrc','prscsx','xwing']",
  "leopard_methods: ['ptclump','quickprs','dbslmm','lassosum','megaprs','prscs','ldpred2','sbayesrc']",
  "cores_prep_pgs: 10", # xwing run with 20 cores
  "cores_target_pgs: 50",
  "ldpred2_inference: F",
  "ldpred2_ldref: /users/k1806347/oliverpainfel/Data/hgdp_1kg/ldpred2/hm3",
  "quickprs_ldref: /users/k1806347/oliverpainfel/Data/hgdp_1kg/quickprs/hm3",
  "quickprs_multi_ldref: /users/k1806347/oliverpainfel/Data/hgdp_1kg/quickprs/hm3_subset",
  "sbayesrc_ldref: /users/k1806347/oliverpainfel/Data/hgdp_1kg/sbayesrc/hm3"
)

write.table(config, '/users/k1806347/oliverpainfel/Data/ukb/GenoPred/configs/crosspop/config_mvp.yaml', col.names = F, row.names = F, quote = F)

# Make a version of the gwas_list without NEU
gwas_list <- gwas_list[!grepl('NEU', gwas_list$name),]
write.table(gwas_list, '/users/k1806347/oliverpainfel/Data/ukb/GenoPred/configs/crosspop/gwas_list_mvp_noneu.txt', col.names = T, row.names = F, quote = F)

gwas_groups <- gwas_groups[!grepl('NEU', gwas_groups$name),]
write.table(gwas_groups, '/users/k1806347/oliverpainfel/Data/ukb/GenoPred/configs/crosspop/gwas_groups_mvp_noneu.txt', col.names = T, row.names = F, quote = F)

config<-c(
  "outdir: /users/k1806347/oliverpainfel/Data/ukb/GenoPred/output",
  "config_file: /users/k1806347/oliverpainfel/Data/ukb/GenoPred/configs/crosspop/config_mvp.yaml",
  "gwas_list: /users/k1806347/oliverpainfel/Data/ukb/GenoPred/configs/crosspop/gwas_list_mvp_noneu.txt",
  "gwas_groups: /users/k1806347/oliverpainfel/Data/ukb/GenoPred/configs/crosspop/gwas_groups_mvp_noneu.txt",
  "target_list: /users/k1806347/oliverpainfel/Data/ukb/GenoPred/configs/basic/target_list.txt",
  "pgs_methods: ['ptclump','quickprs','dbslmm','lassosum','megaprs','prscs','ldpred2','sbayesrc','prscsx','xwing']",
  "leopard_methods: ['ptclump','quickprs','dbslmm','lassosum','megaprs','prscs','ldpred2','sbayesrc']",
  "cores_prep_pgs: 10", # xwing run with 20 cores
  "cores_target_pgs: 50",
  "ldpred2_inference: F",
  "ldpred2_ldref: /users/k1806347/oliverpainfel/Data/hgdp_1kg/ldpred2/hm3",
  "quickprs_ldref: /users/k1806347/oliverpainfel/Data/hgdp_1kg/quickprs/hm3",
  "quickprs_multi_ldref: /users/k1806347/oliverpainfel/Data/hgdp_1kg/quickprs/hm3_subset",
  "sbayesrc_ldref: /users/k1806347/oliverpainfel/Data/hgdp_1kg/sbayesrc/hm3"
)

write.table(config, '/users/k1806347/oliverpainfel/Data/ukb/GenoPred/configs/crosspop/config_mvp.yaml', col.names = F, row.names = F, quote = F)

Run pipeline

snakemake \
  --profile slurm \
  --use-conda \
  --configfile=/users/k1806347/oliverpainfel/Data/ukb/GenoPred/configs/crosspop/config_mvp.yaml \
  target_pgs -n
  
# Note. xwing fails for PLT. Remove PLT from gwas_list to get scores for other traits.
# Save PLT results for other methods to check the pattern is similar.
snakemake \
  --profile slurm \
  --use-conda \
  --configfile=/users/k1806347/oliverpainfel/Data/ukb/GenoPred/configs/crosspop/config_mvp.yaml \
  target_pgs -n
  

PGS evaluation

Compare the single-source PGS within AFR ancestry target individuals.
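
The code below writes, for each trait, a predictor list for model_builder with three columns: the path to a .profiles score file (predictor), the group the score belongs to (multi), and a top1 label indicating from which predictors top1 models are derived. A toy illustration of this layout, with hypothetical paths and score names, is:

# Toy illustration of the predictor list layout (paths and names are hypothetical)
toy_predictors <- data.frame(
  predictor = c('<outdir>/ukb/pgs/TRANS/ldpred2/HT_UKB/ukb-HT_UKB-TRANS.profiles',
                '<outdir>/ukb/pgs/TRANS/ldpred2/HT_MVP_AFR/ukb-HT_MVP_AFR-TRANS.profiles'),
  multi = 'ldpred2',
  top1 = c('EUR', 'AFR')
)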

Show code

Create predictor list

setwd('~/oliverpainfel/Software/MyGit/GenoPred/pipeline/')
source('../functions/misc.R')
source_all('../functions')
library(data.table)

# Get some key variables from config
config<-'/users/k1806347/oliverpainfel/Data/ukb/GenoPred/configs/crosspop/config_mvp.yaml'
pgs_methods <- read_param(config = config, param = 'pgs_methods', return_obj = F)
outdir <- read_param(config = config, param = 'outdir', return_obj = F)

# Read in list of outcomes 
selected_traits<-fread('/users/k1806347/oliverpainfel/Analyses/crosspop/sensitivity_mvp/trait_subset.txt', header=F)$V1
selected_traits<-selected_traits[selected_traits != 'NEU']

# Get a list of score files
scores <- list_score_files(config)

# Create files for AFR targets
targ_pop <- c('AFR')
for(trait_i in selected_traits){
  scores_i <- scores[grepl(trait_i, scores$name),]
  scores_i$multi <- scores_i$method
  
  for(targ_pop_i in targ_pop){
    if(targ_pop_i == 'EAS'){
      disc_pop <- 'BBJ'
    }
    if(targ_pop_i == 'AFR'){
      disc_pop <- 'MVP_AFR'
    }
    if(targ_pop_i == 'EUR'){
      disc_pop <- c('BBJ','UGR')
    }
    
    for(disc_pop_j in disc_pop){
      if(disc_pop_j == 'BBJ'){
        disc_pop_j_2 <- 'EAS'
      }
      if(disc_pop_j == 'MVP_AFR'){
        disc_pop_j_2 <- 'AFR'
      }

      dir.create(
        paste0(
          '/users/k1806347/oliverpainfel/Analyses/crosspop/sensitivity_mvp/targ_',
          targ_pop_i,
          '.disc_EUR_',
          disc_pop_j_2,
          '/',
          trait_i
        ),
        recursive = T
      )
      
      scores_i_j <- scores_i[
        (grepl('UKB$', scores_i$name, ignore.case = F) | 
         grepl(paste0(disc_pop_j, '$'), scores_i$name, ignore.case = T)),]

      # Insert path to score file
      scores_i_j$predictor <- paste0(
        outdir,
        '/ukb/pgs/TRANS/',
        scores_i_j$method,
        '/',
        scores_i_j$name,
        '/ukb-',
        scores_i_j$name,
        '-TRANS.profiles'
      )
      
      ####
      # Make groups for single-source methods
      ####
      
      scores_i_j_single_top1 <-
        scores_i_j[!(scores_i_j$method %in% pgs_group_methods) &
                     !grepl('_multi$', scores_i_j$method), ]

      # Create top1 column indicating from which predictors the top1 models should be derived
      scores_i_j_single_top1$top1[grepl('UKB', scores_i_j_single_top1$name, ignore.case = F)] <- 'EUR'
      scores_i_j_single_top1$top1[grepl(disc_pop_j, scores_i_j_single_top1$name, ignore.case = F)] <- disc_pop_j_2
      
      ####
      # Make groups containing pseudo scores for single source methods
      ####

      # Extract the pseudo score for each method and specify as a separate group
      for(i in 1:nrow(scores_i_j_single_top1)) {
        param <- find_pseudo(
          config = config,
          gwas = scores_i_j_single_top1$name[i],
          pgs_method = scores_i_j_single_top1$method[i],
          target_pop = targ_pop_i
        )
        
        score_header <-
          fread(scores_i_j_single_top1$predictor[i], nrows = 1)
        score_cols <-
          which(names(score_header) %in% c('FID', 'IID', paste0(scores_i_j_single_top1$name[i], '_', param)))
        
        system(
          paste0(
            "cut -d' ' -f ", 
            paste0(score_cols, collapse=','),
            " ", 
            scores_i_j_single_top1$predictor[i], 
            " > ", 
            gsub('.profiles',
                 paste0('.', targ_pop_i, '_pseudo.profiles'),
                 scores_i_j_single_top1$predictor[i])
          )
        )
      }
      
      scores_i_j_single_pseudo <- scores_i_j_single_top1
      scores_i_j_single_pseudo$multi <- paste0(scores_i_j_single_pseudo$multi, '.pseudo')

      scores_i_j_single_pseudo$predictor <- gsub('.profiles', 
                                    paste0('.', targ_pop_i, '_pseudo.profiles'),
                                    scores_i_j_single_pseudo$predictor)

      ####
      # Make groups for multi-single-source pseudo scores
      ####
      
      scores_i_j_multi_single_pseudo <- scores_i_j[grepl('_multi$', scores_i_j$method),]

      # Extract the pseudo score for each method and specify as a separate group
      for(i in 1:nrow(scores_i_j_multi_single_pseudo)) {
        param <- find_pseudo(
          config = config,
          gwas = scores_i_j_multi_single_pseudo$name[i],
          pgs_method = scores_i_j_multi_single_pseudo$method[i],
          target_pop = targ_pop_i
        )
        
        score_header <-
          fread(scores_i_j_multi_single_pseudo$predictor[i], nrows = 1)
        score_cols <-
          which(names(score_header) %in% c('FID', 'IID', paste0(scores_i_j_multi_single_pseudo$name[i], '_', param)))
        
        system(
          paste0(
            "cut -d' ' -f ", 
            paste0(score_cols, collapse=','),
            " ", 
            scores_i_j_multi_single_pseudo$predictor[i], 
            " > ", 
            gsub('.profiles',
                 paste0('.', targ_pop_i, '_pseudo.profiles'),
                 scores_i_j_multi_single_pseudo$predictor[i])
          )
        )
      }
      
      scores_i_j_multi_single_pseudo$multi <- paste0(scores_i_j_multi_single_pseudo$multi, '.pseudo')

      scores_i_j_multi_single_pseudo$predictor <- gsub('.profiles', 
                                    paste0('.', targ_pop_i, '_pseudo.profiles'),
                                    scores_i_j_multi_single_pseudo$predictor)
      
      scores_i_j_multi_single_pseudo$top1<-paste0('EUR_', disc_pop_j_2)

      ####
      # Make groups for the Multi-Source methods
      ####
      
      scores_i_j_multi <- scores_i_j[(scores_i_j$method %in% pgs_group_methods),]

      # Split top1 scores by target population
      # This doesn't apply to xwing because it only has pop-specific pseudo scores
      scores_i_j_multi_top1<-NULL
      for(i in which(scores_i_j_multi$method %in% c('prscsx'))){
        score_header<-fread(scores_i_j_multi$predictor[i], nrow = 1)
        
        for(pop in c('EUR', disc_pop_j_2)){
          
          if(scores_i_j_multi$method[i] == 'prscsx'){
            score_cols <-
              which(grepl(paste0('^FID$|^IID$|_', pop, '_phi'), names(score_header)))
          }
          if(scores_i_j_multi$method[i] == 'xwing'){
            score_cols <-
              which(grepl(paste0('^FID$|^IID$|_targ_', pop, '_pst'), names(score_header)))
          }
          
          system(
            paste0(
              "cut -d' ' -f ", 
              paste0(score_cols, collapse=','),
              " ", 
              scores_i_j_multi$predictor[i], 
              " > ", 
              gsub('.profiles',
                   paste0('.', pop, '_grid.profiles'),
                   scores_i_j_multi$predictor[i])
            )
          )
          
          tmp <- scores_i_j_multi[i,]
          tmp$multi <- paste0(tmp$multi, '.grid')
          tmp$top1 <- pop
          tmp$predictor <-
              gsub('.profiles',
                   paste0('.', pop, '_grid.profiles'),
                   scores_i_j_multi$predictor[i])
          
          scores_i_j_multi_top1 <- rbind(scores_i_j_multi_top1, tmp)
        }
      }

      # Split pop-specific pseudo scores by target population
      scores_i_j_multi_pop_pseudo<-NULL
      for(i in 1:nrow(scores_i_j_multi)){
        score_header<-fread(scores_i_j_multi$predictor[i], nrow = 1)
        
        for(pop in c('EUR', disc_pop_j_2)){
          if(scores_i_j_multi$method[i] == 'prscsx'){
            score_cols <-
              which(grepl(paste0('^FID$|^IID$|_', pop, '_phi_auto'), names(score_header)))
          }
          if(scores_i_j_multi$method[i] == 'xwing'){
            score_cols <-
              which(grepl(paste0('^FID$|^IID$|_targ_', pop, '_pst_', pop), names(score_header)))
          }
          
          system(
            paste0(
              "cut -d' ' -f ", 
              paste0(score_cols, collapse=','),
              " ", 
              scores_i_j_multi$predictor[i], 
              " > ", 
              gsub('.profiles',
                   paste0('.', pop, '_pseudo.profiles'),
                   scores_i_j_multi$predictor[i])
            )
          )
          
          tmp <- scores_i_j_multi[i,]
          tmp$multi <- paste0(tmp$multi, '.pop_pseudo')
          tmp$top1 <- pop
          tmp$predictor <-
              gsub('.profiles',
                   paste0('.', pop, '_pseudo.profiles'),
                   scores_i_j_multi$predictor[i])
          
          scores_i_j_multi_pop_pseudo <- rbind(scores_i_j_multi_pop_pseudo, tmp)
        }
      }
      
      # Create pseudo score for multi-source methods
      scores_i_j_multi_pseudo<-NULL
      for(i in 1:nrow(scores_i_j_multi)) {
        param <- find_pseudo(
          config = config,
          gwas = scores_i_j_multi$name[i],
          pgs_method = scores_i_j_multi$method[i],
          target_pop = targ_pop_i
        )
        
        score_header <-
          fread(scores_i_j_multi$predictor[i], nrows = 1)
        score_cols <-
          which(names(score_header) %in% c('FID', 'IID', paste0(scores_i_j_multi$name[i], '_', param)))

        system(
          paste0(
            "cut -d' ' -f ", 
            paste0(score_cols, collapse=','),
            " ", 
            scores_i_j_multi$predictor[i], 
            " > ", 
            gsub('.profiles',
                 paste0('.pseudo.targ_', targ_pop_i,'.profiles'),
                 scores_i_j_multi$predictor[i])
          )
        )
        
        tmp <- scores_i_j_multi[i,]
        tmp$multi <- paste0(tmp$multi, '.pseudo')
        tmp$top1 <- paste0('EUR_', disc_pop_j_2)
        tmp$predictor <-
            gsub('.profiles',
                 paste0('.pseudo.targ_', targ_pop_i,'.profiles'),
                 scores_i_j_multi$predictor[i])
        
        scores_i_j_multi_pseudo <- rbind(scores_i_j_multi_pseudo, tmp)
      }
      
      ####
      # Combine the different predictor groups
      ####
      predictors_i<- do.call(rbind, list(
        scores_i_j_single_top1, 
        scores_i_j_single_pseudo,
        scores_i_j_multi_single_pseudo,
        scores_i_j_multi_top1,
        scores_i_j_multi_pop_pseudo,
        scores_i_j_multi_pseudo
      ))
      
      predictors_i <- predictors_i[, c('predictor', 'multi','top1'), with=F]
      
      ####
      # Make a group that combines all population-specific PGS
      ####
      
      predictors_i_all <- predictors_i[predictors_i$top1 %in% c('EUR','AFR','EAS'),]
      predictors_i_all$multi <- 'all'
      predictors_i<-rbind(predictors_i, predictors_i_all)
      
      write.table(
        predictors_i,
        paste0(
          '/users/k1806347/oliverpainfel/Analyses/crosspop/sensitivity_mvp/targ_',
          targ_pop_i,
          '.disc_EUR_',
          disc_pop_j_2,
          '/',
          trait_i,
          '/predictor_list.txt'
        ),
        col.names = T,
        row.names = F,
        quote = F
      )
    }
  }
}

Run model_builder

cd /users/k1806347/oliverpainfel/Software/MyGit/GenoPred/pipeline
conda activate model_builder

#rm /users/k1806347/oliverpainfel/Analyses/crosspop/sensitivity_mvp/targ_*.disc_EUR_*/*/res*

for targ_pop in $(echo AFR); do
  if [ "$targ_pop" == "EUR" ]; then
      targ_pop2="EUR_test"
  else
      targ_pop2=$targ_pop
  fi
  
  if [ "$targ_pop" == "EUR" ]; then
    disc_pop=$(echo EAS AFR)
  fi
  
  if [ "$targ_pop" == "EAS" ]; then
    disc_pop="EAS"
  fi
  
  if [ "$targ_pop" == "AFR" ]; then
    disc_pop="AFR"
  fi
  
  for disc_pop_i in ${disc_pop}; do
    for pheno in $(cat /users/k1806347/oliverpainfel/Analyses/crosspop/sensitivity_mvp/trait_subset.txt); do
      if [ ! -f "/users/k1806347/oliverpainfel/Analyses/crosspop/sensitivity_mvp/targ_${targ_pop}.disc_EUR_${disc_pop_i}/${pheno}/res.pred_comp.txt" ]; then
        sbatch --mem 10G -n 5 --exclude=erc-hpc-comp058 -p neurohack_cpu,interruptible_cpu -t 1:00:00 --wrap="Rscript ../Scripts/model_builder/model_builder_top1.R \
          --outcome /users/k1806347/oliverpainfel/Data/ukb/phenotypes/prscsx/${pheno}.unrel.${targ_pop2}.row_number.txt \
          --predictors /users/k1806347/oliverpainfel/Analyses/crosspop/sensitivity_mvp/targ_${targ_pop}.disc_EUR_${disc_pop_i}/${pheno}/predictor_list.txt \
          --out /users/k1806347/oliverpainfel/Analyses/crosspop/sensitivity_mvp/targ_${targ_pop}.disc_EUR_${disc_pop_i}/${pheno}/res \
          --n_core 5"
      fi
    done
  done
done

Plot results

setwd('/users/k1806347/oliverpainfel/Software/MyGit/GenoPred/pipeline/')

library(data.table)
library(ggplot2)
library(cowplot)

source('../functions/misc.R')
source_all('../functions')

# Read in list of outcomes 
selected_traits<-fread('/users/k1806347/oliverpainfel/Analyses/crosspop/sensitivity_mvp/trait_subset.txt', header=F)$V1
selected_traits <- selected_traits[selected_traits != 'NEU']
info_all <- fread('~/oliverpainfel/Analyses/crosspop/gwas_descriptives.csv')

# Calculate correlation between all phenotypes in each target population
cors <- list()
for(pop_i in c('EUR','EAS','AFR','CSA','AMR')){
  if(pop_i == 'EUR'){
    pop_i_2 <- 'EUR_test'
  } else {
    pop_i_2 <- pop_i
  }
  pheno_pop_i <- list()
  for(pheno_i in selected_traits){
    pheno_pop_i[[pheno_i]] <- fread(paste0('/users/k1806347/oliverpainfel/Data/ukb/phenotypes/prscsx/', pheno_i, '.unrel.', pop_i_2, '.row_number.txt'))
    names(pheno_pop_i[[pheno_i]])[3] <- pheno_i
  }
  
  pheno_pop_i_merged <- Reduce(function(x, y) merge(x, y, all = TRUE, by = c('FID','IID')), pheno_pop_i)

  cors_i <- abs(cor(as.matrix(pheno_pop_i_merged[,-1:-2, with=F]), use='p'))
  cors[[pop_i]] <- cors_i
}

# Read in results
targ_pop = c('AFR')
res_eval <- list()
for(pheno_i in selected_traits){
  res_eval_i<-NULL
  for(targ_pop_i in targ_pop){
    if(targ_pop_i == 'EAS'){
      disc_pop <- 'EAS'
    }
    if(targ_pop_i == 'AFR'){
      disc_pop <- 'AFR'
    }
    if(targ_pop_i == 'EUR'){
      disc_pop <- c('EAS','AFR')
    }
    for(disc_pop_i in disc_pop){
      eval_i <-
        fread(
          paste0(
            '/users/k1806347/oliverpainfel/Analyses/crosspop/sensitivity_mvp/',
            'targ_',
            targ_pop_i,
            '.disc_EUR_',
            disc_pop_i,
            '/',
            pheno_i,
            '/res.pred_eval.txt'
          )
        )
      eval_i$Target<-targ_pop_i
      eval_i$gwas_group<-paste0('EUR+', disc_pop_i)
      res_eval_i<-rbind(res_eval_i, eval_i)
    }
  }
  
  res_eval_i$Method<-sub('\\..*','',res_eval_i$Group)
  res_eval_i$Method<-gsub('-.*','', res_eval_i$Method)
  
  res_eval_i$Model[grepl('top1$', res_eval_i$Group) &
                   !grepl('pseudo', res_eval_i$Group)]<-'IndivTune'
  res_eval_i$Model[grepl('top1$', res_eval_i$Group) &
                   grepl('pseudo', res_eval_i$Group)]<-'SumStatTune'
  res_eval_i$Model[grepl('multi$', res_eval_i$Group) &
                   !grepl('pseudo', res_eval_i$Group)]<-'Multi-IndivTune'
  res_eval_i$Model[grepl('multi$', res_eval_i$Group) &
                   grepl('pseudo', res_eval_i$Group)]<-'Multi-SumStatTune'
  
  res_eval_i$Model[grepl('_multi', res_eval_i$Group)]<-'SumStatTune'
  res_eval_i$Model[res_eval_i$Group == 'prscsx.pseudo.multi']<-'SumStatTune'
  res_eval_i$Model[res_eval_i$Group == 'xwing.pseudo.multi']<-'SumStatTune'
  
  res_eval_i$Source<-ifelse(
    res_eval_i$Method %in% pgs_group_methods | grepl('_multi$', res_eval_i$Method) | 
    !grepl('EUR|EAS|AFR', res_eval_i$Group), 'Multi', 'Single')
  
  res_eval_i$Discovery[grepl('EUR', res_eval_i$Group)] <- 'EUR'
  res_eval_i$Discovery[grepl('EAS', res_eval_i$Group)] <- 'EAS'
  res_eval_i$Discovery[grepl('AFR', res_eval_i$Group)] <- 'AFR'
  res_eval_i$Discovery[res_eval_i$Source == 'Multi'] <- res_eval_i$gwas_group[res_eval_i$Source == 'Multi']
  
  res_eval_i$Method<-factor(res_eval_i$Method, levels=unique(res_eval_i$Method))
  res_eval_i$Model<-factor(res_eval_i$Model, levels=c('IndivTune','SumStatTune','Multi-IndivTune','Multi-SumStatTune'))
  res_eval_i$Discovery<-factor(res_eval_i$Discovery, levels=c('AFR','EAS','EUR','EUR+AFR','EUR+EAS'))

  # Remove IndivTune and Multi-IndivTune model for groups that contain one score (aka QuickPRS and SBayesRC)
  res_eval_i <- res_eval_i[
    !(res_eval_i$Method %in%  c('quickprs','sbayesrc') & 
      res_eval_i$Model %in% c('IndivTune','Multi-IndivTune')),]
  
  # Remove pseudo model for methods that don't really have one 
  res_eval_i <- res_eval_i[
    !(res_eval_i$Method %in%  c('ptclump','ptclump_multi') & 
      res_eval_i$Model %in% c('SumStatTune','Multi-SumStatTune')),]

  # Remove top1 models for *-Multi, PRS-CSx, X-wing
  res_eval_i <- res_eval_i[
    !((res_eval_i$Method %in%  c('prscsx', 'xwing') | grepl('_multi$', res_eval_i$Method)) & 
      grepl('top1', res_eval_i$Group)),]
  
  # Remove any duplicate models
  res_eval_i <- res_eval_i[!duplicated(res_eval_i[, c(
    "Target", "Method", "Model", "Source", "Discovery","gwas_group"
  )]),]
  
  res_eval[[pheno_i]]<-res_eval_i
  
}

# Create vector defining the order of methods in the plots
model_order <- c("DBSLMM", "lassosum", "LDpred2", "MegaPRS", "PRS-CS", "pT+clump", "QuickPRS", "SBayesRC", "DBSLMM-multi", "lassosum-multi", "LDpred2-multi", "MegaPRS-multi", "PRS-CS-multi", "pT+clump-multi", "QuickPRS-multi", "SBayesRC-multi", "PRS-CSx", "X-Wing", "All") 

res_eval_simp <- NULL
for(pheno_i in selected_traits){
  tmp <- res_eval[[pheno_i]]
  tmp$Trait <- pheno_i
  
  # Insert nice PGS method names
  tmp <- merge(tmp, pgs_method_labels, by.x = 'Method', by.y = 'method', all.x = T)
  tmp$label[is.na(tmp$label)] <- 'All'
  tmp$label[grepl('Multi', tmp$Model) & !(tmp$Method %in% pgs_group_methods) & tmp$label != 'All'] <- paste0(tmp$label[grepl('Multi', tmp$Model) & !(tmp$Method %in% pgs_group_methods) & tmp$label != 'All'], '-multi')
  tmp$label <- factor(tmp$label, levels = model_order)
  
  # Simplify result to either SumStatTune or IndivTune
  tmp$Model[tmp$Model != 'SumStatTune'] <- 'IndivTune'
  tmp$Model[tmp$Model == 'SumStatTune'] <- 'SumStatTune'
  tmp <- tmp[!duplicated(tmp[, c('label','Target','Discovery','Model'), with=F]),]
  
  res_eval_simp <- rbind(res_eval_simp, tmp)
}

# Plot results for each phenotype separately
dir.create('~/oliverpainfel/Analyses/crosspop/sensitivity_mvp/plots')

per_trait_plot <- list()
for(pheno_i in selected_traits){
  tmp <- res_eval_simp[res_eval_simp$Trait == pheno_i,]
  #tmp <- tmp[tmp$Target != 'EUR',]
  tmp$Discovery_clean <- as.character(tmp$Discovery)
  tmp$Discovery_clean <- paste0(tmp$Discovery_clean, ' GWAS')
  tmp$Target <- paste0(tmp$Target, ' Target')

  png(paste0('~/oliverpainfel/Analyses/crosspop/sensitivity_mvp/plots/', pheno_i,'.png'), res=300, width = 3400, height = 2000, units = 'px')
  plot_tmp<-ggplot(tmp, aes(x=label, y=R , fill = Model)) +
    geom_errorbar(aes(ymin = R - SE, ymax = R + SE),
                  width = 0,
                  position = position_dodge(width = 1)) +
    geom_point(stat="identity", position=position_dodge(1), size=3, shape=23) +
    geom_vline(xintercept = seq(1.5, length(unique(tmp$label))), linetype="dotted") +
    labs(y = "R (SE)", x=NULL, fill = NULL, title = info_all$`Trait Description`[info_all$`Trait Label` == pheno_i]) +
    facet_grid(Target ~ Discovery_clean, scales='free', space = 'free_x') +
    theme_half_open() +
    background_grid(major = 'y', minor = 'y') + 
    panel_border() + 
    theme(axis.text.x = element_text(angle = 45, vjust = 1, hjust=1),
          legend.position = "top",
          legend.key.spacing.x = unit(1, "cm"),
          legend.justification = "center")
  print(plot_tmp)
  dev.off()
  per_trait_plot[[pheno_i]]<-plot_tmp
}

tmp <- res_eval_simp
tmp$Discovery_clean <- as.character(tmp$Discovery)
tmp$Discovery_clean <- paste0(tmp$Discovery_clean, ' GWAS')

png(paste0('~/oliverpainfel/Analyses/crosspop/sensitivity_mvp/plots/per_trait_part1.png'), res=300, width = 3000, height = 3800, units = 'px')
ggplot(tmp[tmp$Trait %in% selected_traits[order(selected_traits)][1:5],], aes(x=label, y=R , fill = Model)) +
    geom_errorbar(aes(ymin = R - SE, ymax = R + SE),
                  width = 0,
                  position = position_dodge(width = 1)) +
    geom_point(stat="identity", position=position_dodge(1), size=3, shape=23) +
    geom_vline(xintercept = seq(1.5, length(unique(tmp$label))), linetype="dotted") +
    labs(y = "R (SE)", x=NULL, fill = NULL) +
    facet_grid(Trait ~ Discovery_clean, scales='free', space = 'free_x') +
    theme_half_open() +
    background_grid(major = 'y', minor = 'y') + 
    panel_border() + 
    theme(axis.text.x = element_text(angle = 45, vjust = 1, hjust=1),
          legend.position = "top",
          legend.key.spacing.x = unit(1, "cm"),
          legend.justification = "center")
dev.off()

png(paste0('~/oliverpainfel/Analyses/crosspop/sensitivity_mvp/plots/per_trait_part2.png'), res=300, width = 3000, height = 3800, units = 'px')
ggplot(tmp[tmp$Trait %in% selected_traits[order(selected_traits)][6:10],], aes(x=label, y=R , fill = Model)) +
    geom_errorbar(aes(ymin = R - SE, ymax = R + SE),
                  width = 0,
                  position = position_dodge(width = 1)) +
    geom_point(stat="identity", position=position_dodge(1), size=3, shape=23) +
    geom_vline(xintercept = seq(1.5, length(unique(tmp$label))), linetype="dotted") +
    labs(y = "R (SE)", x=NULL, fill = NULL) +
    facet_grid(Trait ~ Discovery_clean, scales='free', space = 'free_x') +
    theme_half_open() +
    background_grid(major = 'y', minor = 'y') + 
    panel_border() + 
    theme(axis.text.x = element_text(angle = 45, vjust = 1, hjust=1),
          legend.position = "top",
          legend.key.spacing.x = unit(1, "cm"),
          legend.justification = "center")
dev.off()


####
# Average results across phenotypes
####

library(MAd)

# Average R across phenotypes
meta_res_eval <- NULL
for(targ_pop_i in targ_pop){
  if(targ_pop_i == 'EAS'){
    disc_pop <- 'EAS'
  }
  if(targ_pop_i == 'AFR'){
    disc_pop <- 'AFR'
  }
  if(targ_pop_i == 'EUR'){
    disc_pop <- c('EAS','AFR')
  }
  for(disc_pop_i in disc_pop){
  
    # Subset res_eval for each scenario
    res_eval_i <- do.call(rbind, lapply(seq_along(res_eval), function(i) {
      x <- res_eval[[i]]
      x$pheno <- names(res_eval)[i]
      x <- x[x$Target == targ_pop_i]
      x <- x[x$gwas_group == paste0('EUR+', disc_pop_i)]
    }))
    
    # Average results for each test across phenotypes
    # Use MAd to account for correlation between them
    res_eval_i$Sample<-'A'
  
    for(group_i in unique(res_eval_i$Group)){
      res_eval_group_i <- res_eval_i[res_eval_i$Group == group_i,]
      missing_pheno <-
        colnames(cors[[targ_pop_i]])[!(colnames(cors[[targ_pop_i]]) %in% unique(res_eval_group_i$pheno))]
      
      if (!all(colnames(cors[[targ_pop_i]]) %in% unique(res_eval_group_i$pheno))) {
        print(paste0(
          'Results missing for ',
          targ_pop_i,
          ' ',
          group_i,
          ' ',
          paste0(missing_pheno, collapse = ' ')
        ))
      }
      
      cors_i <- cors[[targ_pop_i]][unique(res_eval_group_i$pheno), unique(res_eval_group_i$pheno)]
      
      meta_res_eval_i <-
        agg(
          id = Sample,
          es = R,
          var = SE ^ 2,
          cor = cors_i,
          method = "BHHR",
          mod = NULL,
          data = res_eval_group_i
        )
      
      tmp <- data.table(Group = group_i,
                        Method = res_eval_group_i$Method[1],
                        Model = res_eval_group_i$Model[1],
                        Source = res_eval_group_i$Source[1],
                        Discovery = res_eval_group_i$Discovery[1],
                        gwas_group = res_eval_group_i$gwas_group[1],
                        Target = targ_pop_i,
                        R = meta_res_eval_i$es,
                        SE = sqrt(meta_res_eval_i$var))
      
      meta_res_eval <- rbind(meta_res_eval, tmp)
    }
  }
}

meta_res_eval$Model<-factor(meta_res_eval$Model, levels=c('IndivTune','SumStatTune','Multi-IndivTune','Multi-SumStatTune'))
meta_res_eval$Discovery<-factor(meta_res_eval$Discovery, levels=c('AFR','EAS','EUR','EUR+AFR','EUR+EAS'))

write.csv(meta_res_eval, '~/oliverpainfel/Analyses/crosspop/sensitivity_mvp/r_eval.csv', row.names = F)

# Plot average performance across phenotypes for the AFR target
tmp <- meta_res_eval
tmp <- tmp[tmp$Target != 'EUR',]
tmp <- merge(tmp, pgs_method_labels, by.x = 'Method', by.y = 'method', all.x = T)
tmp$label[is.na(tmp$label)] <- 'All'
tmp$label[grepl('Multi', tmp$Model) & !(tmp$Method %in% pgs_group_methods) & tmp$label != 'All'] <- paste0(tmp$label[grepl('Multi', tmp$Model) & !(tmp$Method %in% pgs_group_methods) & tmp$label != 'All'], '-multi')
tmp$label <- factor(tmp$label, levels = model_order)
tmp$Discovery_clean <- as.character(tmp$Discovery)
tmp$Discovery_clean[tmp$Discovery == 'EUR'] <- 'EUR GWAS'
tmp$Discovery_clean[tmp$Discovery != 'EUR' & tmp$Source == 'Single'] <- 'Target-matched GWAS'
tmp$Discovery_clean[tmp$Discovery != 'EUR' & tmp$Source == 'Multi'] <- 'Both'
tmp$Discovery_clean <- factor(tmp$Discovery_clean, 
                              levels = c('Target-matched GWAS',
                                         'EUR GWAS',
                                         'Both'))
tmp$Target <- paste0(tmp$Target, ' Target')
tmp$Model[tmp$Model != 'SumStatTune'] <- 'IndivTune'
tmp$Model[tmp$Model == 'SumStatTune'] <- 'SumStatTune'
tmp <- tmp[!duplicated(tmp[, c('label','Target','Discovery_clean','Model'), with=F]),]

png(paste0('~/oliverpainfel/Analyses/crosspop/sensitivity_mvp/plots/average_r.png'), res=300, width = 3200, height = 2000, units = 'px')
ggplot(tmp, aes(x=label, y=R , fill = Model)) +
    geom_errorbar(aes(ymin = R - SE, ymax = R + SE),
                  width = 0,
                  position = position_dodge(width = 1)) +
    geom_point(stat="identity", position=position_dodge(1), size=3, shape=23) +
    geom_vline(xintercept = seq(1.5, length(unique(tmp$label))), linetype="dotted") +
    labs(y = "R (SE)", x='Method') +
    facet_grid(Target ~ Discovery_clean, scales='free', space = 'free_x') +
    theme_half_open() +
    background_grid(major = 'y', minor = 'y') + 
    panel_border() + 
    theme(axis.text.x = element_text(angle = 45, vjust = 1, hjust=1),
          legend.position = "top",
          legend.key.spacing.x = unit(1, "cm"),
          legend.justification = "center")
dev.off()

# Plot performance of -multi models trained using LEOPARD vs using indiv-level data
tmp <- meta_res_eval
tmp <- tmp[tmp$Target != 'EUR',]
tmp <- merge(tmp, pgs_method_labels, by.x = 'Method', by.y = 'method')
tmp$label[grepl('Multi', tmp$Model) & !(tmp$Method %in% pgs_group_methods)] <- paste0(tmp$label[grepl('Multi', tmp$Model) & !(tmp$Method %in% pgs_group_methods)], '-multi')
tmp$label <- factor(tmp$label, levels = unique(tmp$label[order(!(grepl('Multi', tmp$label)), tmp$label)]))
tmp<-tmp[grepl('multi', tmp$label),]
tmp <- tmp[tmp$Model != 'Multi-IndivTune',]
tmp$Model<-as.character(tmp$Model)
tmp$Model[tmp$Model != 'SumStatTune']<-'IndivTune'
tmp$Model[tmp$Model == 'SumStatTune']<-'LEOPARD'
tmp$Target <- paste0(tmp$Target, ' Target')

png(paste0('~/oliverpainfel/Analyses/crosspop/sensitivity_mvp/plots/average_r_leopard.png'), res=300, width = 1500, height = 2000, units = 'px')
ggplot(tmp, aes(x=label, y=R , fill = Model)) +
    geom_errorbar(aes(ymin = R - SE, ymax = R + SE),
                  width = 0,
                  position = position_dodge(width = 1)) +
    geom_point(stat="identity", position=position_dodge(1), size=3, shape=23) +
    geom_vline(xintercept = seq(1.5, length(unique(tmp$label))), linetype="dotted") +
    labs(y = "R (SE)", x='Method') +
    facet_grid(Target ~., scales='free', space = 'free_x') +
    theme_half_open() +
    background_grid(major = 'y', minor = 'y') + 
    panel_border() + 
    theme(axis.text.x = element_text(angle = 45, vjust = 1, hjust=1),
          legend.position = "top",
          legend.key.spacing.x = unit(1, "cm"),
          legend.justification = "center")
dev.off()

# Make simplified plot
# Show only the IndivTune models using both GWAS, and remove the 'All' group
tmp <- meta_res_eval
tmp <- tmp[tmp$Target != 'EUR',]
tmp <- tmp[tmp$Method != 'all',]
tmp <- tmp[tmp$Source  == 'Multi',]
tmp <- merge(tmp, pgs_method_labels, by.x = 'Method', by.y = 'method', all.x = T)
tmp$label[grepl('Multi', tmp$Model) & !(tmp$Method %in% pgs_group_methods) & tmp$label != 'All'] <- paste0(tmp$label[grepl('Multi', tmp$Model) & !(tmp$Method %in% pgs_group_methods) & tmp$label != 'All'], '-multi')
tmp$label <- factor(tmp$label, levels = model_order)
tmp$Discovery_clean <- as.character(tmp$Discovery)
tmp$Discovery_clean[tmp$Discovery == 'EUR'] <- 'EUR GWAS'
tmp$Discovery_clean[tmp$Discovery != 'EUR' & tmp$Source == 'Single'] <- 'Target-matched GWAS'
tmp$Discovery_clean[tmp$Discovery != 'EUR' & tmp$Source == 'Multi'] <- 'Both'
tmp$Discovery_clean <- factor(tmp$Discovery_clean, 
                              levels = c('Target-matched GWAS',
                                         'EUR GWAS',
                                         'Both'))
tmp$Target <- paste0(tmp$Target, ' Target')
tmp$Model[tmp$Model != 'SumStatTune'] <- 'IndivTune'
tmp$Model[tmp$Model == 'SumStatTune'] <- 'SumStatTune'
tmp <- tmp[!duplicated(tmp[, c('label','Target','Discovery_clean','Model'), with=F]),]
tmp<-tmp[tmp$Model == 'IndivTune',]

png(paste0('~/oliverpainfel/Analyses/crosspop/sensitivity_mvp/plots/average_r_simple.png'), res=300, width = 3200, height = 2000, units = 'px')
ggplot(tmp, aes(x=label, y=R)) +
    geom_errorbar(aes(ymin = R - SE, ymax = R + SE),
                  width = 0,
                  position = position_dodge(width = 1)) +
    geom_point(stat="identity", position=position_dodge(1), size=3, shape=23, fill = 'black') +
    geom_vline(xintercept = seq(1.5, length(unique(tmp$label))), linetype="dotted") +
    labs(y = "R (SE)", x='Method') +
    facet_grid(Target ~ ., scales='free', space = 'free_x') +
    theme_half_open() +
    background_grid(major = 'y', minor = 'y') + 
    panel_border() + 
    theme(axis.text.x = element_text(angle = 45, vjust = 1, hjust=1))
dev.off()

tmp<-tmp[tmp$Method %in% c('ldpred2','prscsx','xwing'),]
png(paste0('~/oliverpainfel/Analyses/crosspop/sensitivity_mvp/plots/average_r_simple_ldpred2.png'), res=300, width = 500, height = 500, units = 'px')
ggplot(tmp, aes(x=label, y=R)) +
    geom_errorbar(aes(ymin = R - SE, ymax = R + SE),
                  width = 0,
                  position = position_dodge(width = 1)) +
  #  geom_point(stat="identity", position=position_dodge(1), fill = '#3399FF') +
    geom_point(stat="identity", position=position_dodge(1), size=3, shape=23, fill = '#3399FF') +
    geom_vline(xintercept = seq(1.5, length(unique(tmp$label))), linetype="dotted") +
    labs(y = "R (SE)", x='Method') +
    facet_grid(Target ~ ., scales='free', space = 'free_x') +
    theme_half_open() +
    background_grid(major = 'y', minor = 'y') + 
    panel_border() + 
    theme(axis.text.x = element_text(angle = 45, vjust = 1, hjust=1))
dev.off()


####
# Create heatmap showing difference between all methods and models
####

# Create a function to mirror pred_comp results
mirror_comp<-function(x){
  x_sym <- x
  x_sym$Model_1 <- x$Model_2
  x_sym$Model_2 <- x$Model_1
  x_sym$Model_1_R <- x$Model_2_R
  x_sym$Model_2_R <- x$Model_1_R
  x_sym$R_diff <- -x_sym$R_diff
  x_mirrored <- rbind(x, x_sym)
  x_diag<-data.frame(
      Model_1=unique(x_mirrored$Model_1),
      Model_2=unique(x_mirrored$Model_1),
      Model_1_R=x_mirrored$Model_1_R,
      Model_2_R=x_mirrored$Model_1_R,
      R_diff=NA,
      R_diff_pval=NA
    )
  x_comp<-rbind(x_mirrored, x_diag)
  return(x_comp)
}
  
# Read in results
targ_pop=c('AFR')
res_comp <- list()
for(pheno_i in selected_traits){
  res_comp_i<-NULL
  for(targ_pop_i in targ_pop){
    if(targ_pop_i == 'EAS'){
      disc_pop <- 'EAS'
    }
    if(targ_pop_i == 'AFR'){
      disc_pop <- 'AFR'
    }
    if(targ_pop_i == 'EUR'){
      disc_pop <- c('EAS','AFR')
    }
    for(disc_pop_i in disc_pop){
      comp_i <-
        fread(
          paste0(
            '/users/k1806347/oliverpainfel/Analyses/crosspop/sensitivity_mvp/',
            'targ_',
            targ_pop_i,
            '.disc_EUR_',
            disc_pop_i,
            '/',
            pheno_i,
            '/res.pred_comp.txt'
          )
        )
      comp_i<-mirror_comp(comp_i)
      comp_i$Target<-targ_pop_i
      comp_i$gwas_group<-paste0('EUR+', disc_pop_i)
      res_comp_i<-rbind(res_comp_i, comp_i)
    }
  }
  
  res_comp[[pheno_i]]<-res_comp_i
}

res_comp_all <- do.call(rbind, lapply(names(res_comp), function(name) {
  x <- res_comp[[name]]
  x$pheno <- name  # Add a new column with the name of the element
  x  # Return the updated dataframe
}))

# Annotate tests to get order correct
res_comp_all$Method1<-sub('\\..*','',res_comp_all$Model_1)
res_comp_all$Method1<-gsub('-.*','', res_comp_all$Method1)
res_comp_all$Method2<-sub('\\..*','',res_comp_all$Model_2)
res_comp_all$Method2<-gsub('-.*','', res_comp_all$Method2)

find_model<-function(x){
  mod <- x
  mod[grepl('top1$', x) & !grepl('pseudo', x)] <- 'IndivTune'
  mod[grepl('top1$', x) & grepl('pseudo', x)] <- 'SumStatTune'
  mod[grepl('multi$', x) & !grepl('pseudo', x)] <- 'Multi-IndivTune'
  mod[grepl('multi$', x) & grepl('pseudo', x)] <- 'Multi-SumStatTune'
  mod[grepl('_multi', x)] <- 'SumStatTune'
  mod[x == 'prscsx.pseudo.multi'] <- 'SumStatTune'
  mod[x == 'xwing.pseudo.multi'] <- 'SumStatTune'
  
  return(mod)
}

res_comp_all$Model1<-find_model(res_comp_all$Model_1)
res_comp_all$Model2<-find_model(res_comp_all$Model_2)

res_comp_all$Source1<-ifelse(res_comp_all$Method1 %in% pgs_group_methods | grepl('_multi$', res_comp_all$Method1) | !grepl('AFR|EAS|EUR', res_comp_all$Model_1), 'Multi', 'Single')
res_comp_all$Source2<-ifelse(res_comp_all$Method2 %in% pgs_group_methods | grepl('_multi$', res_comp_all$Method2) | !grepl('AFR|EAS|EUR', res_comp_all$Model_2), 'Multi', 'Single')
  
for(i in c('EUR','EAS','AFR')){
  res_comp_all$Discovery1[grepl(i, res_comp_all$Model_1)] <- i
  res_comp_all$Discovery2[grepl(i, res_comp_all$Model_2)] <- i
}
res_comp_all$Discovery1[res_comp_all$Source1 == 'Multi'] <- res_comp_all$gwas_group[res_comp_all$Source1 == 'Multi']
res_comp_all$Discovery2[res_comp_all$Source2 == 'Multi'] <- res_comp_all$gwas_group[res_comp_all$Source2 == 'Multi']

res_comp_all$Method1<-factor(res_comp_all$Method1, levels=unique(res_comp_all$Method1))
res_comp_all$Method2<-factor(res_comp_all$Method2, levels=unique(res_comp_all$Method2))
res_comp_all$Model1<-factor(res_comp_all$Model1, levels=c('IndivTune','SumStatTune','Multi-IndivTune','Multi-SumStatTune'))
res_comp_all$Model2<-factor(res_comp_all$Model2, levels=c('IndivTune','SumStatTune','Multi-IndivTune','Multi-SumStatTune'))
res_comp_all$Discovery1<-factor(res_comp_all$Discovery1, levels=rev(c('AFR','EAS','EUR','EUR+AFR','EUR+EAS')))
res_comp_all$Discovery2<-factor(res_comp_all$Discovery2, levels=c('AFR','EAS','EUR','EUR+AFR','EUR+EAS'))

# Remove IndivTune and Multi-IndivTune model for groups that contain one score (aka QuickPRS and SBayesRC)
res_comp_all <- res_comp_all[
!(res_comp_all$Method1 %in%  c('quickprs','sbayesrc') & 
  res_comp_all$Model1 %in% c('IndivTune','Multi-IndivTune')),]
res_comp_all <- res_comp_all[
!(res_comp_all$Method2 %in%  c('quickprs','sbayesrc') & 
  res_comp_all$Model2 %in% c('IndivTune','Multi-IndivTune')),]

# Remove pseudo model for methods that don't really have one 
res_comp_all <- res_comp_all[
!(res_comp_all$Method1 %in%  c('ptclump','ptclump_multi') & 
  res_comp_all$Model1 %in% c('SumStatTune','Multi-SumStatTune')),]
res_comp_all <- res_comp_all[
!(res_comp_all$Method2 %in%  c('ptclump','ptclump_multi') & 
  res_comp_all$Model2 %in% c('SumStatTune','Multi-SumStatTune')),]

# Remove top1 models for *-Multi, PRS-CSx, X-wing
res_comp_all <- res_comp_all[
!(grepl('prscsx|xwing|_multi', res_comp_all$Method1) & 
  grepl('top1', res_comp_all$Model_1)),]
res_comp_all <- res_comp_all[
!(grepl('prscsx|xwing|_multi', res_comp_all$Method2) & 
  grepl('top1', res_comp_all$Model_2)),]

# Remove any duplicate comparisons
res_comp_all <- res_comp_all[!duplicated(res_comp_all[, c("Target", "gwas_group", "Method1", "Model1", "Source1", "Discovery1", "Method2", "Model2", "Source2", "Discovery2",'pheno')]),]

res_comp_all$r_diff_rel <- res_comp_all$R_diff / res_comp_all$Model_2_R

# Calculate relative improvement for ldpred2-multi vs ldpred2 as example
tmp_ldpred2 <- res_comp_all[res_comp_all$Model_1 == 'ldpred2.multi' & 
                    grepl('ldpred2-', res_comp_all$Model_2) &
                    res_comp_all$Target == 'AFR',]
tmp_ldpred2 <- tmp_ldpred2[order(-tmp_ldpred2$Model_2_R),]
tmp_ldpred2 <- tmp_ldpred2[!duplicated(tmp_ldpred2$pheno),]
round(min(tmp_ldpred2$r_diff_rel)*100, 1)
round(max(tmp_ldpred2$r_diff_rel)*100, 1)

tmp_ldpred2 <- res_comp_all[res_comp_all$Model_1 == 'ldpred2.multi' & 
                    grepl('ldpred2-', res_comp_all$Model_2) &
                    res_comp_all$Target == 'EAS',]
tmp_ldpred2 <- tmp_ldpred2[order(-tmp_ldpred2$Model_2_R),]
tmp_ldpred2 <- tmp_ldpred2[!duplicated(tmp_ldpred2$pheno),]
round(min(tmp_ldpred2$r_diff_rel)*100, 1)
round(max(tmp_ldpred2$r_diff_rel)*100, 1)

# Calculate relative improvement for sbayesrc-multi vs sbayesrc in EUR target as example
tmp_sbayesrc <- res_comp_all[res_comp_all$Model_1 == 'sbayesrc.pseudo.multi' & 
                    grepl('sbayesrc.pseudo-', res_comp_all$Model_2) &
                    res_comp_all$Target == 'EUR' &
                    res_comp_all$Discovery1 == 'EUR+EAS' &
                    res_comp_all$Discovery2 == 'EUR',]
tmp_sbayesrc <- tmp_sbayesrc[order(-tmp_sbayesrc$Model_2_R),]
round(min(tmp_sbayesrc$r_diff_rel)*100, 1)
round(max(tmp_sbayesrc$r_diff_rel)*100, 1)

tmp_sbayesrc <- res_comp_all[res_comp_all$Model_1 == 'sbayesrc.pseudo.multi' & 
                    grepl('sbayesrc.pseudo-', res_comp_all$Model_2) &
                    res_comp_all$Target == 'EUR' &
                    res_comp_all$Discovery1 == 'EUR+AFR' &
                    res_comp_all$Discovery2 == 'EUR',]
tmp_sbayesrc <- tmp_sbayesrc[order(-tmp_sbayesrc$Model_2_R),]
round(min(tmp_sbayesrc$r_diff_rel)*100, 1)
round(max(tmp_sbayesrc$r_diff_rel)*100, 1)

#####
# Export a csv containing difference results for all traits
#####
# Simplify to contain only IndivTune or SumStatTune result
tmp <- res_comp_all
tmp <- merge(tmp, pgs_method_labels, by.x = 'Method1', by.y = 'method', all.x = T)
tmp$label[is.na(tmp$label)] <- 'All'
names(tmp)[names(tmp) == 'label'] <- 'label1'
tmp <- merge(tmp, pgs_method_labels, by.x = 'Method2', by.y = 'method', all.x = T)
tmp$label[is.na(tmp$label)] <- 'All'
names(tmp)[names(tmp) == 'label'] <- 'label2'

tmp$label1[grepl('Multi', tmp$Model1) & !(tmp$Method1 %in% pgs_group_methods) & tmp$label1 != 'All'] <- paste0(tmp$label1[grepl('Multi', tmp$Model1) & !(tmp$Method1 %in% pgs_group_methods) & tmp$label1 != 'All'], '-multi')
tmp$label2[grepl('Multi', tmp$Model2) & !(tmp$Method2 %in% pgs_group_methods) & tmp$label2 != 'All'] <- paste0(tmp$label2[grepl('Multi', tmp$Model2) & !(tmp$Method2 %in% pgs_group_methods) & tmp$label2 != 'All'], '-multi')

tmp$Model1[tmp$Model1 != 'SumStatTune'] <- 'IndivTune'
tmp$Model1[tmp$Model1 == 'SumStatTune'] <- 'SumStatTune'
tmp$Model2[tmp$Model2 != 'SumStatTune'] <- 'IndivTune'
tmp$Model2[tmp$Model2 == 'SumStatTune'] <- 'SumStatTune'

tmp<-tmp[tmp$Model_1 %in% res_eval_simp$Group,]
tmp<-tmp[tmp$Model_2 %in% res_eval_simp$Group,]

tmp$`Model 1` <- paste0(tmp$label1, ' - ', tmp$Model1, ' - ', tmp$Discovery1)
tmp$`Model 2` <- paste0(tmp$label2, ' - ', tmp$Model2, ' - ', tmp$Discovery2)

tmp <- tmp[, c('Target', 'pheno', 'Model 1', 'Model 2', 'Model_1_R', 'Model_2_R', 'R_diff', 'R_diff_pval'), with=F]
names(tmp) <- c('Target', 'Trait','Model 1', 'Model 2', "R (Model 1)", "R (Model 2)", "R difference (Model 1 R - Model 2 R)", "R difference p-value")

tmp<-tmp[order(tmp$Target, tmp$Trait, tmp$`Model 1`, tmp$`Model 2`),]
tmp$`R difference (Model 1 R - Model 2 R)` <- round(tmp$`R difference (Model 1 R - Model 2 R)`, 3)
tmp$`R (Model 1)` <- round(tmp$`R (Model 1)`, 3)
tmp$`R (Model 2)` <- round(tmp$`R (Model 2)`, 3)

write.csv(tmp, '~/oliverpainfel/Analyses/crosspop/sensitivity_mvp/r_diff.csv', row.names=F)

###########

library(MAd)

# Average R differences across phenotypes
meta_res_comp <- NULL
for(targ_pop_i in targ_pop){
  if(targ_pop_i == 'EAS'){
    disc_pop <- 'EAS'
  }
  if(targ_pop_i == 'AFR'){
    disc_pop <- 'AFR'
  }
  if(targ_pop_i == 'EUR'){
    disc_pop <- c('EAS','AFR')
  }
  for(disc_pop_i in disc_pop){
  
    # Subset res_comp for each scenario
    res_comp_i <- res_comp_all[res_comp_all$Target == targ_pop_i & res_comp_all$gwas_group == paste0('EUR+', disc_pop_i)]
  
    # Calculate diff SE based on p-value
    res_comp_i$R_diff_pval[res_comp_i$R_diff == 0] <- 1-0.001
    res_comp_i$R_diff_pval[res_comp_i$R_diff_pval == 1]<-1-0.001
    res_comp_i$R_diff_z<-qnorm(res_comp_i$R_diff_pval/2)
    res_comp_i$R_diff_SE<-abs(res_comp_i$R_diff/res_comp_i$R_diff_z)
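    # SE is recovered from the two-sided p-value: z = qnorm(p/2), SE = |R_diff / z|.
    # e.g. R_diff = 0.02 with p = 0.045 gives z = qnorm(0.0225) ~ -2.0, so SE ~ 0.01.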
        
    # Average results for each test across phenotypes
    # Use MAd to account for correlation between them
    res_comp_i$Sample<-'A'
    res_comp_i$Group <- paste0(res_comp_i$Model_1, '_vs_', res_comp_i$Model_2)
  
    for(group_i in unique(res_comp_i$Group)){
      res_comp_group_i <- res_comp_i[res_comp_i$Group == group_i,]
      cors_i <- cors[[targ_pop_i]][unique(res_comp_group_i$pheno), unique(res_comp_group_i$pheno)]
      
      if(res_comp_group_i$Model_1[1] != res_comp_group_i$Model_2[1]){
        
        meta_res_comp_i <-
          agg(
            id = Sample,
            es = R_diff,
            var = R_diff_SE ^ 2,
            cor = cors_i,
            method = "BHHR",
            mod = NULL,
            data = res_comp_group_i
          )
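        # agg() returns the aggregated effect size (es) and its sampling variance (var),
        # combining per-phenotype R differences while accounting for their correlation (cors_i)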
        
        tmp <- res_comp_group_i[1,]
        tmp$pheno <- NULL
        tmp$Model_1_R <-
          meta_res_eval$R[meta_res_eval$Group == tmp$Model_1 &
                            meta_res_eval$Target == targ_pop_i &
                            meta_res_eval$gwas_group == paste0('EUR+', disc_pop_i)]
        tmp$Model_2_R <-
          meta_res_eval$R[meta_res_eval$Group == tmp$Model_2 &
                            meta_res_eval$Target == targ_pop_i &
                            meta_res_eval$gwas_group == paste0('EUR+', disc_pop_i)]
        tmp$R_diff <- meta_res_comp_i$es
        tmp$R_diff_SE <- sqrt(meta_res_comp_i$var)
        tmp$R_diff_z <- tmp$R_diff / tmp$R_diff_SE
        tmp$R_diff_p <- 2*pnorm(-abs(tmp$R_diff_z))
      } else {
        tmp <- res_comp_group_i[1,]
        tmp$pheno <- NULL
        tmp$R_diff <- NA
        tmp$R_diff_SE <- NA
        tmp$R_diff_z <- NA
        tmp$R_diff_p <- NA
      }
      meta_res_comp <- rbind(meta_res_comp, tmp)
    }
  }
}

meta_res_comp$R_diff_perc <- meta_res_comp$R_diff / meta_res_comp$Model_2_R
  
# Extract average improvement for ldpred2-multi vs ldpred2 as example
tmp_ldpred2 <- meta_res_comp[meta_res_comp$Model_1 == 'ldpred2.multi' & 
                    grepl('ldpred2-', meta_res_comp$Model_2) &
                    meta_res_comp$Target == 'AFR',]
tmp_ldpred2 <- tmp_ldpred2[order(-tmp_ldpred2$Model_2_R),]
round(min(tmp_ldpred2$R_diff_perc)*100, 1)

tmp_ldpred2 <- meta_res_comp[meta_res_comp$Model_1 == 'ldpred2.multi' & 
                    grepl('ldpred2-', meta_res_comp$Model_2) &
                    meta_res_comp$Target == 'EAS',]
tmp_ldpred2 <- tmp_ldpred2[order(-tmp_ldpred2$Model_2_R),]
round(min(tmp_ldpred2$R_diff_perc)*100, 1)

# Extract average improvement for sbayesrc-multi vs sbayesrc in EUR as example
tmp_sbayesrc <- meta_res_comp[meta_res_comp$Model_1 == 'sbayesrc.pseudo.multi' & 
                    grepl('sbayesrc.pseudo', meta_res_comp$Model_2) &
                    meta_res_comp$Target == 'EUR' &
                    meta_res_comp$Discovery1 == 'EUR+AFR' &
                    meta_res_comp$Discovery2 == 'EUR',]
round(tmp_sbayesrc$R_diff_perc*100, 1)

tmp_sbayesrc <- meta_res_comp[meta_res_comp$Model_1 == 'sbayesrc.pseudo.multi' & 
                    grepl('sbayesrc.pseudo', meta_res_comp$Model_2) &
                    meta_res_comp$Target == 'EUR' &
                    meta_res_comp$Discovery1 == 'EUR+EAS' &
                    meta_res_comp$Discovery2 == 'EUR',]
round(tmp_sbayesrc$R_diff_perc*100, 1)

# Extract average improvement of the All model over SBayesRC when both use only the EUR GWAS (AFR and EAS targets)
tmp_sbayesrc <- meta_res_comp[meta_res_comp$Model_2 == 'sbayesrc.pseudo-EUR.top1' &
                    meta_res_comp$Model_1 == 'all-EUR.top1' &
                    meta_res_comp$Target == 'AFR',]
round(tmp_sbayesrc$R_diff_perc*100, 1)
tmp_sbayesrc$R_diff_p

tmp_sbayesrc <- meta_res_comp[meta_res_comp$Model_2 == 'sbayesrc.pseudo-EUR.top1' &
                    meta_res_comp$Model_1 == 'all-EUR.top1' &
                    meta_res_comp$Target == 'EAS',]
round(tmp_sbayesrc$R_diff_perc*100, 1)
tmp_sbayesrc$R_diff_p


# Compare QuickPRS-Multi vs QuickPRS to evaluate LEOPARD performance
tmp_quickprs <- meta_res_comp[meta_res_comp$Model_1 == 'quickprs_multi.pseudo.multi' & 
                                meta_res_comp$Model_2 == 'quickprs.pseudo.multi' &
                    meta_res_comp$Target == 'AFR',]
round(min(tmp_quickprs$R_diff_perc)*100, 1)

tmp_quickprs <- meta_res_comp[meta_res_comp$Model_1 == 'quickprs_multi.pseudo.multi' & 
                                meta_res_comp$Model_2 == 'quickprs.pseudo.multi' &
                    meta_res_comp$Target == 'EAS',]
round(min(tmp_quickprs$R_diff_perc)*100, 1)

# Compare all.multi method to next best method
tmp_all <- meta_res_comp[meta_res_comp$Model_1 == 'all.multi' &
                    meta_res_comp$Target == 'AFR' &
                    meta_res_comp$Source2 == 'Multi',]
tmp_all <- tmp_all[order(tmp_all$R_diff),]
tmp_all <- tmp_all[1,]
round(tmp_all$R_diff_perc*100, 1)
tmp_all$R_diff_p

tmp_all <- meta_res_comp[meta_res_comp$Model_1 == 'all.multi' &
                    meta_res_comp$Target == 'EAS' &
                    meta_res_comp$Source2 == 'Multi',]
tmp_all <- tmp_all[order(tmp_all$R_diff),]
tmp_all <- tmp_all[1,]
round(tmp_all$R_diff_perc*100, 1)
tmp_all$R_diff_p

# Compare the All model restricted to the target-matched GWAS to the next best method using that GWAS
tmp_all <- meta_res_comp[meta_res_comp$Model_1 == 'all-AFR.top1' &
                    meta_res_comp$Target == 'AFR' &
                    meta_res_comp$Discovery1 == 'AFR' &
                    meta_res_comp$Discovery2 == 'AFR',]
tmp_all <- tmp_all[order(tmp_all$R_diff),]
tmp_all <- tmp_all[1,]
round(tmp_all$R_diff_perc*100, 1)
tmp_all$R_diff_p

tmp_all <- meta_res_comp[meta_res_comp$Model_1 == 'all-EAS.top1' &
                    meta_res_comp$Target == 'EAS' &
                    meta_res_comp$Discovery1 == 'EAS' &
                    meta_res_comp$Discovery2 == 'EAS',]
tmp_all <- tmp_all[order(tmp_all$R_diff),]
tmp_all <- tmp_all[1,]
round(tmp_all$R_diff_perc*100, 1)
tmp_all$R_diff_p

#####
# Export a csv containing difference results averaged across traits
#####
# Simplify to contain only IndivTune or SumStatTune result
tmp <- meta_res_comp
tmp <- merge(tmp, pgs_method_labels, by.x = 'Method1', by.y = 'method', all.x = T)
tmp$label[is.na(tmp$label)] <- 'All'
names(tmp)[names(tmp) == 'label'] <- 'label1'
tmp <- merge(tmp, pgs_method_labels, by.x = 'Method2', by.y = 'method', all.x = T)
tmp$label[is.na(tmp$label)] <- 'All'
names(tmp)[names(tmp) == 'label'] <- 'label2'

tmp$label1[grepl('Multi', tmp$Model1) & !(tmp$Method1 %in% pgs_group_methods) & tmp$label1 != 'All'] <- paste0(tmp$label1[grepl('Multi', tmp$Model1) & !(tmp$Method1 %in% pgs_group_methods) & tmp$label1 != 'All'], '-multi')
tmp$label2[grepl('Multi', tmp$Model2) & !(tmp$Method2 %in% pgs_group_methods) & tmp$label2 != 'All'] <- paste0(tmp$label2[grepl('Multi', tmp$Model2) & !(tmp$Method2 %in% pgs_group_methods) & tmp$label2 != 'All'], '-multi')

tmp$Model1[tmp$Model1 != 'SumStatTune'] <- 'IndivTune'
tmp$Model1[tmp$Model1 == 'SumStatTune'] <- 'SumStatTune'
tmp$Model2[tmp$Model2 != 'SumStatTune'] <- 'IndivTune'
tmp$Model2[tmp$Model2 == 'SumStatTune'] <- 'SumStatTune'

tmp<-tmp[tmp$Model_1 %in% res_eval_simp$Group,]
tmp<-tmp[tmp$Model_2 %in% res_eval_simp$Group,]

tmp$`Model 1` <- paste0(tmp$label1, ' - ', tmp$Model1, ' - ', tmp$Discovery1)
tmp$`Model 2` <- paste0(tmp$label2, ' - ', tmp$Model2, ' - ', tmp$Discovery2)

tmp$`Percentage change (R difference / Model 2 R)` <- paste0(round(tmp$R_diff_perc * 100, 1), '%')

tmp <- tmp[, c('Target', 'Model 1', 'Model 2', 'Model_1_R', 'Model_2_R', 'R_diff',"Percentage change (R difference / Model 2 R)", 'R_diff_p'), with=F]
names(tmp) <- c('Target','Model 1', 'Model 2', "R (Model 1)", "R (Model 2)", "R difference (Model 1 R - Model 2 R)", "Percentage change (R difference / Model 2 R)", "R difference p-value")

tmp<-tmp[order(tmp$Target, tmp$`Model 1`, tmp$`Model 2`),]
tmp$`R difference (Model 1 R - Model 2 R)` <- round(tmp$`R difference (Model 1 R - Model 2 R)`, 3)
tmp$`R (Model 1)` <- round(tmp$`R (Model 1)`, 3)
tmp$`R (Model 2)` <- round(tmp$`R (Model 2)`, 3)

write.csv(tmp, '~/oliverpainfel/Analyses/crosspop/sensitivity_mvp/r_diff_average.csv', row.names=F)

############

# Bin the R differences into categories for plotting
meta_res_comp$R_diff_catagory <- cut(
    meta_res_comp$R_diff,
    breaks = c(-Inf, -0.08, -0.025, -0.002, 0.002, 0.025, 0.08, Inf),
    labels = c('< -0.08', '-0.08 - -0.025', '-0.025 - -0.002', '-0.002 - 0.002', '0.002 - 0.025', '0.025 - 0.08', '> 0.08'),
    right = FALSE
)
meta_res_comp$R_diff_catagory <- factor(meta_res_comp$R_diff_catagory, levels = rev(levels(meta_res_comp$R_diff_catagory)))

# Assign significance stars
meta_res_comp$indep_star<-' '
meta_res_comp$indep_star[meta_res_comp$R_diff_p < 0.05]<-'*'
meta_res_comp$indep_star[meta_res_comp$R_diff_p < 1e-3]<-'**'
# meta_res_comp$indep_star[meta_res_comp$R_diff_p < 1e-6]<-'***'

meta_res_comp<-meta_res_comp[order(meta_res_comp$Discovery1, meta_res_comp$Discovery2, meta_res_comp$Method1),]

for(targ_pop_i in targ_pop){
  if(targ_pop_i == 'EAS'){
    disc_pop <- 'EAS'
  }
  if(targ_pop_i == 'AFR'){
    disc_pop <- 'AFR'
  }
  if(targ_pop_i == 'EUR'){
    disc_pop <- c('EAS','AFR')
  }
  for(disc_pop_i in disc_pop){

    tmp <- meta_res_comp[meta_res_comp$Target == targ_pop_i, ]

    tmp <- merge(tmp, pgs_method_labels, by.x = 'Method1', by.y = 'method', all.x = T)
    tmp$label[is.na(tmp$label)] <- 'All'
    names(tmp)[names(tmp) == 'label'] <- 'label1'
    tmp <- merge(tmp, pgs_method_labels, by.x = 'Method2', by.y = 'method', all.x = T)
    tmp$label[is.na(tmp$label)] <- 'All'
    names(tmp)[names(tmp) == 'label'] <- 'label2'
    
    tmp$label1[grepl('Multi', tmp$Model1) & !(tmp$Method1 %in% pgs_group_methods) & tmp$label1 != 'All'] <- paste0(tmp$label1[grepl('Multi', tmp$Model1) & !(tmp$Method1 %in% pgs_group_methods) & tmp$label1 != 'All'], '-multi')
    tmp$label2[grepl('Multi', tmp$Model2) & !(tmp$Method2 %in% pgs_group_methods) & tmp$label2 != 'All'] <- paste0(tmp$label2[grepl('Multi', tmp$Model2) & !(tmp$Method2 %in% pgs_group_methods) & tmp$label2 != 'All'], '-multi')
    
    tmp$Model1[tmp$Model1 != 'SumStatTune'] <- 'IndivTune'
    tmp$Model1[tmp$Model1 == 'SumStatTune'] <- 'SumStatTune'
    tmp$Model2[tmp$Model2 != 'SumStatTune'] <- 'IndivTune'
    tmp$Model2[tmp$Model2 == 'SumStatTune'] <- 'SumStatTune'
    
    tmp<-tmp[tmp$Model_1 %in% res_eval_simp$Group,]
    tmp<-tmp[tmp$Model_2 %in% res_eval_simp$Group,]

    tmp$label1 <- factor(tmp$label1, levels = model_order)
    tmp$label2 <- factor(tmp$label2, levels = model_order)

    tmp<-tmp[order(tmp$label1, tmp$label2),]
    
    tmp$label1 <- paste0(tmp$label1," (", ifelse(tmp$Model1 == 'SumStatTune', 'ST', 'IT'), ")")
    tmp$label2 <- paste0(tmp$label2," (", ifelse(tmp$Model2 == 'SumStatTune', 'ST', 'IT'), ")")

    tmp$label1 <- factor(tmp$label1, levels = unique(tmp$label1))
    tmp$label2 <- factor(tmp$label2, levels = unique(tmp$label2))
    
    tmp <- tmp[tmp$gwas_group == paste0('EUR+', disc_pop_i), ]
    
    plot_tmp <- ggplot(data = tmp, aes(label2, label1, fill = R_diff_catagory)) +
      geom_tile(color = "white", show.legend = TRUE) +
      labs(y = 'Test', x = 'Comparison', fill = 'R difference', title = paste0('Target: ', targ_pop_i)) +
      facet_grid(Discovery1 ~ Discovery2, scales = 'free', space = 'free', switch="both") +
      geom_text(
        data = tmp,
        aes(label2, label1, label = indep_star),
        color = "black",
        size = 4,
        angle = 0,
        vjust = 0.8
      ) +
      scale_fill_brewer(
        breaks = levels(tmp$R_diff_catagory),
        palette = "RdBu",
        drop = F,
        na.value = 'grey'
      ) +
      theme_half_open() +
      background_grid() +
      panel_border() +
      theme(axis.text.x = element_text(
        angle = 45,
        vjust = 1,
        hjust = 1
      ))
    
    png(paste0('~/oliverpainfel/Analyses/crosspop/sensitivity_mvp/plots/average_r_diff.Discovery_EUR_', disc_pop_i,'.Target_', targ_pop_i, '.png'), res=300, width = 4400, height = 3200, units = 'px')
      print(plot_tmp)
    dev.off()
  }
}

####
# Plot relative improvement of methods
####
# Plot each model's R difference relative to the 'All' model combining scores across all methods and GWAS (Source = Multi)

meta_res_comp_ptclump_top1<-meta_res_comp[meta_res_comp$Method2 == 'all' & meta_res_comp$Source2 == 'Multi',]
meta_res_comp_ptclump_top1$reference_point<-F
meta_res_comp_ptclump_top1$reference_point[meta_res_comp_ptclump_top1$Method1 == 'all' & meta_res_comp_ptclump_top1$Source1 == 'Multi']<-T
meta_res_comp_ptclump_top1$R_diff[is.na(meta_res_comp_ptclump_top1$R_diff)]<-0
meta_res_comp_ptclump_top1$Discovery1 <- factor(meta_res_comp_ptclump_top1$Discovery1, levels=rev(levels(meta_res_comp_ptclump_top1$Discovery1)))

res_comp_all_ptclump_top1<-res_comp_all[res_comp_all$Method2 == 'all' & res_comp_all$Source2 == 'Multi',]
res_comp_all_ptclump_top1$Discovery1 <-  factor(res_comp_all_ptclump_top1$Discovery1, levels=levels(meta_res_comp_ptclump_top1$Discovery1))

# Create data to plot reference points
meta_res_comp_reference <- meta_res_comp_ptclump_top1
meta_res_comp_reference$R_diff[meta_res_comp_ptclump_top1$reference_point == F] <- NA
meta_res_comp_reference$R_diff_SE [meta_res_comp_ptclump_top1$reference_point == F] <- NA
res_comp_all_ptclump_top1$reference_point<-F

meta_tmp <- meta_res_comp_ptclump_top1
meta_tmp <- merge(meta_tmp, pgs_method_labels, by.x = 'Method1', by.y = 'method', all.x = T)
meta_tmp$label[is.na(meta_tmp$label)] <- 'All'
meta_tmp$label[grepl('Multi', meta_tmp$Model1) & !(meta_tmp$Method1 %in% pgs_group_methods) & meta_tmp$label != 'All'] <- paste0(meta_tmp$label[grepl('Multi', meta_tmp$Model1) & !(meta_tmp$Method1 %in% pgs_group_methods) & meta_tmp$label != 'All'], '-multi')
meta_tmp$label <- factor(meta_tmp$label, levels = model_order)
meta_tmp$Discovery_clean <- as.character(meta_tmp$Discovery1)
meta_tmp$Discovery_clean[meta_tmp$Discovery1 == 'EUR'] <- 'EUR GWAS'
meta_tmp$Discovery_clean[meta_tmp$Discovery1 != 'EUR' & meta_tmp$Source1 == 'Single'] <- 'AFR GWAS'
meta_tmp$Discovery_clean[meta_tmp$Discovery1 != 'EUR' & meta_tmp$Source1 == 'Multi'] <- 'EUR + AFR GWAS'
meta_tmp$Discovery_clean <- factor(meta_tmp$Discovery_clean, 
                              levels = c('AFR GWAS',
                                         'EUR GWAS',
                                         'EUR + AFR GWAS'))
meta_tmp$Target <- paste0(meta_tmp$Target, ' Target')
meta_tmp$Model1 <- factor(meta_tmp$Model1, levels = c('IndivTune','SumStatTune','Multi-IndivTune','Multi-SumStatTune'))

meta_tmp_ref <- meta_res_comp_reference
meta_tmp_ref <- merge(meta_tmp_ref, pgs_method_labels, by.x = 'Method1', by.y = 'method', all.x = T)
meta_tmp_ref$label[is.na(meta_tmp_ref$label)] <- 'All'
meta_tmp_ref$label[grepl('Multi', meta_tmp_ref$Model1) & !(meta_tmp_ref$Method1 %in% pgs_group_methods) & meta_tmp_ref$label != 'All'] <- paste0(meta_tmp_ref$label[grepl('Multi', meta_tmp_ref$Model1) & !(meta_tmp_ref$Method1 %in% pgs_group_methods) & meta_tmp_ref$label != 'All'], '-multi')
meta_tmp_ref$label <- factor(meta_tmp_ref$label, levels = model_order)
meta_tmp_ref$Discovery_clean <- as.character(meta_tmp_ref$Discovery1)
meta_tmp_ref$Discovery_clean[meta_tmp_ref$Discovery1 == 'EUR'] <- 'EUR GWAS'
meta_tmp_ref$Discovery_clean[meta_tmp_ref$Discovery1 != 'EUR' & meta_tmp_ref$Source1 == 'Single'] <- 'AFR GWAS'
meta_tmp_ref$Discovery_clean[meta_tmp_ref$Discovery1 != 'EUR' & meta_tmp_ref$Source1 == 'Multi'] <- 'EUR + AFR GWAS'
meta_tmp_ref$Discovery_clean <- factor(meta_tmp_ref$Discovery_clean, 
                              levels = c('AFR GWAS',
                                         'EUR GWAS',
                                         'EUR + AFR GWAS'))
meta_tmp_ref$Target <- paste0(meta_tmp_ref$Target, ' Target')
meta_tmp_ref$Model1 <- factor(meta_tmp_ref$Model1, levels = c('IndivTune','SumStatTune','Multi-IndivTune','Multi-SumStatTune'))

tmp <- res_comp_all_ptclump_top1
tmp <- merge(tmp, pgs_method_labels, by.x = 'Method1', by.y = 'method', all.x = T)
tmp$label[is.na(tmp$label)] <- 'All'
tmp$label[grepl('Multi', tmp$Model1) & !(tmp$Method1 %in% pgs_group_methods) & tmp$label != 'All'] <- paste0(tmp$label[grepl('Multi', tmp$Model1) & !(tmp$Method1 %in% pgs_group_methods) & tmp$label != 'All'], '-multi')
tmp$label <- factor(tmp$label, levels = model_order)
tmp$Discovery_clean <- as.character(tmp$Discovery1)
tmp$Discovery_clean[tmp$Discovery1 == 'EUR'] <- 'EUR GWAS'
tmp$Discovery_clean[tmp$Discovery1 != 'EUR' & tmp$Source1 == 'Single'] <- 'AFR GWAS'
tmp$Discovery_clean[tmp$Discovery1 != 'EUR' & tmp$Source1 == 'Multi'] <- 'EUR + AFR GWAS'
tmp$Discovery_clean <- factor(tmp$Discovery_clean, 
                              levels = c('AFR GWAS',
                                         'EUR GWAS',
                                         'EUR + AFR GWAS'))
tmp$Target <- paste0(tmp$Target, ' Target')
tmp$Model1 <- factor(tmp$Model1, levels = c('IndivTune','SumStatTune','Multi-IndivTune','Multi-SumStatTune'))

ggplot(meta_tmp, aes(x=label, y=R_diff , fill = Model1)) +
    geom_point(
        data = tmp,
        mapping = aes(x=label, y=R_diff, colour=Model1),
        position = position_jitterdodge(jitter.width = 0.2, dodge.width = 0.7),
        alpha = 0.3
      ) +
    geom_hline(yintercept = 0) +
      geom_errorbar(
        aes(
          ymin = R_diff - R_diff_SE,
          ymax = R_diff + R_diff_SE
        ),
        width = 0,
        position = position_dodge(width = 0.7)
      ) +    
    geom_point(
      stat = "identity",
      position = position_dodge(0.7),
      size = 3,
      shape = 23
    ) +
    geom_point(
        data = meta_tmp_ref,
        aes(x = label, y = R_diff, fill = Model1),
        stat = "identity",
        position = position_dodge(0.7), # Ensure same dodge as other points
        size = 3,    # Increase size for emphasis
        shape = 22,
        stroke = 1.5,
        show.legend=F
    ) +
    geom_vline(xintercept = seq(1.5, length(unique(meta_tmp$label))), linetype="dotted") +
    labs(y = "R_diff (SE)") +
    facet_grid(Target ~ Discovery_clean, scales='free', space = 'free_x') +
    theme_half_open() +
    background_grid() + 
    panel_border() + 
    theme(axis.text.x = element_text(angle = 45, vjust = 1, hjust=1))


# Plot as % change
meta_tmp$R_diff_perc <- meta_tmp$R_diff / meta_tmp$Model_2_R
meta_tmp_ref$R_diff_perc <- meta_tmp_ref$R_diff / meta_tmp_ref$Model_2_R
tmp$R_diff_perc <- tmp$R_diff / tmp$Model_2_R

meta_tmp$R_diff_perc_SE <- meta_tmp$R_diff_SE / meta_tmp$Model_2_R

library(scales)
ggplot(meta_tmp, aes(x=label, y=R_diff_perc , fill = Model1)) +
    geom_point(
        data = tmp,
        mapping = aes(x=label, y=R_diff_perc, colour=Model1),
        position = position_jitterdodge(jitter.width = 0.2, dodge.width = 0.7),
        alpha = 0.3
      ) +
    geom_hline(yintercept = 0) +
      geom_errorbar(
        aes(
          ymin = R_diff_perc - R_diff_perc_SE,
          ymax = R_diff_perc + R_diff_perc_SE
        ),
        width = 0,
        position = position_dodge(width = 0.7)
      ) +    
    geom_point(
      stat = "identity",
      position = position_dodge(0.7),
      size = 3,
      shape = 23
    ) +
    geom_point(
        data = meta_tmp_ref,
        aes(x = label, y = R_diff_perc, fill = Model1),
        stat = "identity",
        position = position_dodge(0.7), # Ensure same dodge as other points
        size = 3,    # Increase size for emphasis
        shape = 22,
        stroke = 1.5,
        show.legend=F
    ) +
    scale_y_continuous(labels = percent_format()) +
    geom_vline(xintercept = seq(1.5, length(unique(meta_tmp$label))), linetype="dotted") +
    labs(y = "R diff. (SE)") +
    facet_grid(Target ~ Discovery_clean, scales='free', space = 'free_x') +
    theme_half_open() +
    background_grid() + 
    panel_border() + 
    theme(axis.text.x = element_text(angle = 45, vjust = 1, hjust=1))

# Simplify results to show only the IndivTune and SumStatTune models (i.e. with or without individual-level tuning data)
meta_tmp_simple <- meta_tmp
meta_tmp_simple$Model1[meta_tmp_simple$Model1 != 'SumStatTune'] <- 'IndivTune'
meta_tmp_simple$Model1[meta_tmp_simple$Model1 == 'SumStatTune'] <- 'SumStatTune'
meta_tmp_simple$Model2[meta_tmp_simple$Model2 != 'SumStatTune'] <- 'IndivTune'
meta_tmp_simple$Model2[meta_tmp_simple$Model2 == 'SumStatTune'] <- 'SumStatTune'
meta_tmp_simple<-meta_tmp_simple[meta_tmp_simple$Model_1 %in% res_eval_simp$Group,]
meta_tmp_simple<-meta_tmp_simple[meta_tmp_simple$Model_2 %in% res_eval_simp$Group,]

meta_tmp_ref_simple <- meta_tmp_ref
meta_tmp_ref_simple$Model1[meta_tmp_ref_simple$Model1 != 'SumStatTune'] <- 'IndivTune'
meta_tmp_ref_simple$Model1[meta_tmp_ref_simple$Model1 == 'SumStatTune'] <- 'SumStatTune'
meta_tmp_ref_simple$Model2[meta_tmp_ref_simple$Model2 != 'SumStatTune'] <- 'IndivTune'
meta_tmp_ref_simple$Model2[meta_tmp_ref_simple$Model2 == 'SumStatTune'] <- 'SumStatTune'
meta_tmp_ref_simple<-meta_tmp_ref_simple[meta_tmp_ref_simple$Model_1 %in% res_eval_simp$Group,]
meta_tmp_ref_simple<-meta_tmp_ref_simple[meta_tmp_ref_simple$Model_2 %in% res_eval_simp$Group,]

tmp_simple <- tmp
tmp_simple$Model1[tmp_simple$Model1 != 'SumStatTune'] <- 'IndivTune'
tmp_simple$Model1[tmp_simple$Model1 == 'SumStatTune'] <- 'SumStatTune'
tmp_simple$Model2[tmp_simple$Model2 != 'SumStatTune'] <- 'IndivTune'
tmp_simple$Model2[tmp_simple$Model2 == 'SumStatTune'] <- 'SumStatTune'
tmp_simple<-tmp_simple[tmp_simple$Model_1 %in% res_eval_simp$Group,]
tmp_simple<-tmp_simple[tmp_simple$Model_2 %in% res_eval_simp$Group,]

# Export plot for manuscript
png('~/oliverpainfel/Analyses/crosspop/sensitivity_mvp/plots/average_r.perc_improv.png', width = 3200, height = 1500, res= 300, units = 'px')
ggplot(meta_tmp_simple[meta_tmp_simple$Target != 'EUR Target',], aes(x=label, y=R_diff_perc , fill = Model1)) +
#    geom_boxplot(
#      data = tmp_simple[tmp_simple$Target != 'EUR Target',],
#        mapping = aes(x=label, y=R_diff_perc, colour=Model1),
#        position = position_dodge(0.7),
#        alpha = 0.3
#      ) +
    geom_point(
        data = tmp_simple[tmp_simple$Target != 'EUR Target',],
        mapping = aes(x=label, y=R_diff_perc, colour=Model1),
        position = position_jitterdodge(jitter.width = 0.2, dodge.width = 0.7),
        alpha = 0.3
      ) +
    geom_hline(yintercept = 0) +
      geom_errorbar(
        aes(
          ymin = R_diff_perc - R_diff_perc_SE,
          ymax = R_diff_perc + R_diff_perc_SE
        ),
        width = 0,
        position = position_dodge(width = 0.7)
      ) +    
    geom_point(
      stat = "identity",
      position = position_dodge(0.7),
      size = 3,
      shape = 23
    ) +
    geom_point(
        data = meta_tmp_ref_simple[meta_tmp_ref_simple$Target != 'EUR Target',],
        aes(x = label, y = R_diff_perc, fill = Model1),
        stat = "identity",
        position = position_dodge(0.7), # Ensure same dodge as other points
        size = 4,
        shape = 22,
        show.legend=F
    ) +
    geom_vline(xintercept = seq(1.5, length(unique(meta_tmp_simple$label))), linetype="dotted") +
    scale_y_continuous(labels = percent_format()) +
    labs(y = "Relative Improvement (SE)", fill = NULL, colour = NULL, x = NULL) +
    facet_grid(Target ~ Discovery_clean, scales='free', space = 'free_x') +
    theme_half_open() +
    background_grid(major = 'y', minor = 'y') + 
    panel_border() + 
    theme(
        axis.text.x = element_text(angle = 45, vjust = 1, hjust = 1),  # Rotate x-axis labels
        legend.position = "top",
        legend.key.spacing.x = unit(2, "cm"),
        legend.justification = "center"
    )
dev.off()

########
# Plot relative improvement of LEOPARD-weighted scores over the Multi-SumStatTune combination of each method's scores
########

# meta res
meta_res_comp_ref <- meta_res_comp[meta_res_comp$Model2 == 'Multi-SumStatTune',]
meta_res_comp_ref <- meta_res_comp_ref[meta_res_comp_ref$Method1 != 'all' & meta_res_comp_ref$Method2 != 'all',]
meta_res_comp_ref <- meta_res_comp_ref[meta_res_comp_ref$Model1 == 'SumStatTune' & meta_res_comp_ref$Source1 == 'Multi',]
meta_res_comp_ref <- meta_res_comp_ref[gsub('_multi','', meta_res_comp_ref$Method1) == gsub('_multi','', meta_res_comp_ref$Method2),]
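# i.e. keep only pairs where the LEOPARD-weighted score is compared against the
# Multi-SumStatTune combination of the same base method (e.g. quickprs_multi vs quickprs)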

meta_res_comp_ref$R_diff_perc <- meta_res_comp_ref$R_diff / meta_res_comp_ref$Model_2_R
meta_res_comp_ref$R_diff_perc_SE <- meta_res_comp_ref$R_diff_SE / meta_res_comp_ref$Model_2_R

meta_res_comp_ref$Discovery_clean <- paste0(meta_res_comp_ref$Discovery1,' GWAS')
meta_res_comp_ref$Discovery_clean[meta_res_comp_ref$Target != 'EUR'] <- 'EUR + Target-matched GWAS'

meta_res_comp_ref <- merge(meta_res_comp_ref, pgs_method_labels, by.x = 'Method1', by.y = 'method', all.x = T)
meta_res_comp_ref$label[grepl('Multi', meta_res_comp_ref$Model1) & !(meta_res_comp_ref$Method1 %in% pgs_group_methods)] <- paste0(meta_res_comp_ref$label[grepl('Multi', meta_res_comp_ref$Model1) & !(meta_res_comp_ref$Method1 %in% pgs_group_methods)], '-multi')
meta_res_comp_ref$label <- factor(meta_res_comp_ref$label, levels = model_order)

meta_res_comp_ref$Target_clean <- paste0(meta_res_comp_ref$Target,' Target')

# trait-specific res
res_comp_all_ref <- res_comp_all[res_comp_all$Model2 == 'Multi-SumStatTune',]
res_comp_all_ref <- res_comp_all_ref[res_comp_all_ref$Method1 != 'all' & res_comp_all_ref$Method2 != 'all',]
res_comp_all_ref <- res_comp_all_ref[res_comp_all_ref$Model1 == 'SumStatTune' & res_comp_all_ref$Source1 == 'Multi',]
res_comp_all_ref <- res_comp_all_ref[gsub('_multi','', res_comp_all_ref$Method1) == gsub('_multi','', res_comp_all_ref$Method2),]

res_comp_all_ref$R_diff_perc <- res_comp_all_ref$R_diff / res_comp_all_ref$Model_2_R
res_comp_all_ref$R_diff_perc_SE <- res_comp_all_ref$R_diff_SE / res_comp_all_ref$Model_2_R

res_comp_all_ref$Discovery_clean <- paste0(res_comp_all_ref$Discovery1,' GWAS')
res_comp_all_ref$Discovery_clean[res_comp_all_ref$Target != 'EUR'] <- 'EUR + Target-matched GWAS'

res_comp_all_ref <- merge(res_comp_all_ref, pgs_method_labels, by.x = 'Method1', by.y = 'method', all.x = T)
res_comp_all_ref$label[grepl('Multi', res_comp_all_ref$Model1) & !(res_comp_all_ref$Method1 %in% pgs_group_methods)] <- paste0(res_comp_all_ref$label[grepl('Multi', res_comp_all_ref$Model1) & !(res_comp_all_ref$Method1 %in% pgs_group_methods)], '-multi')
res_comp_all_ref$label <- factor(res_comp_all_ref$label, levels = model_order)

res_comp_all_ref$Target_clean <- paste0(res_comp_all_ref$Target,' Target')

tmp_meta<-meta_res_comp_ref
tmp_all<-res_comp_all_ref

tmp_meta<-tmp_meta[!(tmp_meta$Method1 %in% c('prscsx','xwing')),]
tmp_meta<-tmp_meta[tmp_meta$Target != 'EUR',]

tmp_all<-tmp_all[!(tmp_all$Method1 %in% c('prscsx','xwing')),]
tmp_all<-tmp_all[tmp_all$Target != 'EUR',]

library(ggrepel)

# plot
png('~/oliverpainfel/Analyses/crosspop/sensitivity_mvp/plots/leopard_perc_improv.png', width = 1800, height = 1100, res= 300, units = 'px')

ggplot(tmp_meta, aes(x=label, y=R_diff_perc , fill = Model1)) +
    geom_hline(yintercept = 0) +
      geom_errorbar(
        aes(
          ymin = R_diff_perc - R_diff_perc_SE,
          ymax = R_diff_perc + R_diff_perc_SE
        ),
        width = 0,
        position = position_dodge(width = 0.7)
      ) +    
    geom_point(
      stat = "identity",
      position = position_dodge(0.7),
      size = 3,
      shape = 23
    ) +
    geom_vline(xintercept = seq(1.5, length(unique(tmp_meta$label))), linetype="dotted") +
    scale_y_continuous(labels = percent_format()) +
    labs(y = "Relative Difference (SE)", fill = NULL, colour = NULL, x = NULL) +
    facet_grid(. ~ Target_clean) +
    theme_half_open() +
    background_grid(major = 'y', minor = 'y') + 
    panel_border() + 
    theme(
        axis.text.x = element_text(angle = 45, vjust = 1, hjust = 1),  # Rotate x-axis labels
        legend.position = 'none'
    )
dev.off()

# Now compare only QuickPRS-Multi and PRS-CSx, labelling outlying traits
tmp_meta<-meta_res_comp_ref
tmp_all<-res_comp_all_ref

tmp_meta<- tmp_meta[tmp_meta$Target != 'EUR' & tmp_meta$Method1 %in% c('quickprs_multi','prscsx'),]
tmp_all<- tmp_all[tmp_all$Target != 'EUR' & tmp_all$Method1 %in% c('quickprs_multi','prscsx'),]

library(ggrepel)

png('~/oliverpainfel/Analyses/crosspop/sensitivity_mvp/plots/leopard_perc_improv_restricted.png', width = 1500, height = 1500, res= 300, units = 'px')
ggplot(tmp_meta, aes(x=label, y=R_diff_perc , fill = Model1)) +
    geom_point(
        data = tmp_all,
        mapping = aes(x=label, y=R_diff_perc, colour=Model1),
        position = position_jitterdodge(jitter.width = 0.2, dodge.width = 0.7),
        alpha = 0.3
      ) +
    geom_hline(yintercept = 0) +
      geom_errorbar(
        aes(
          ymin = R_diff_perc - R_diff_perc_SE,
          ymax = R_diff_perc + R_diff_perc_SE
        ),
        width = 0,
        position = position_dodge(width = 0.7)
      ) +    
    geom_point(
      stat = "identity",
      position = position_dodge(0.7),
      size = 3,
      shape = 23
    ) +
    scale_y_continuous(labels = percent_format()) +
    labs(y = "Relative Difference (SE)", fill = NULL, colour = NULL, x = NULL) +
    facet_grid(. ~ Target_clean) +
    theme_half_open() +
    background_grid(major = 'y', minor = 'y') + 
    panel_border() + 
    theme(
        axis.text.x = element_text(angle = 45, vjust = 1, hjust = 1),  # Rotate x-axis labels
        legend.position = 'none'
    ) +
    geom_text_repel(
      data = tmp_all[
        tmp_all$R_diff_perc < -0.05,
      ],
      aes(label = pheno),  # label outlying points with the trait name
      position = position_dodge(width = 0.7),
      size = 3,
      min.segment.length = 0,
      segment.color = NA,
      show.legend = FALSE
    )
dev.off()

Descriptive statistics

Create a table showing descriptive statistics for the MVP summary statistics, including LDSC SNP-heritability estimates and AVENGEME results.


LDSC

Show code
conda activate ldsc

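# Munge each MVP (AFR) GWAS into LDSC's .sumstats.gz format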
for pheno in $(cat /users/k1806347/oliverpainfel/Analyses/crosspop/sensitivity_mvp/trait_subset.txt); do
  mkdir -p /users/k1806347/oliverpainfel/Analyses/crosspop/sensitivity_mvp/ldsc/sumstats

  sbatch --mem 10G -n 1 -p neurohack_cpu --wrap="/users/k1806347/oliverpainfel/Software/ldsc/munge_sumstats.py \
   --sumstats /users/k1806347/oliverpainfel/Data/ukb/GenoPred/output/reference/gwas_sumstat/${pheno}_MVP_AFR/${pheno}_MVP_AFR-cleaned.gz \
   --out /users/k1806347/oliverpainfel/Data/ukb/GenoPred/output/reference/gwas_sumstat/${pheno}_MVP_AFR/${pheno}_MVP_AFR"
done

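# Estimate SNP-heritability for each trait using UKB AFR LD scores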
for pheno in $(cat /users/k1806347/oliverpainfel/Analyses/crosspop/sensitivity_mvp/trait_subset.txt); do
  mkdir -p /users/k1806347/oliverpainfel/Analyses/crosspop/sensitivity_mvp/ldsc/results/${pheno}/MVP_AFR

  sbatch --mem 10G -n 1 -p neurohack_cpu --wrap="/users/k1806347/oliverpainfel/Software/ldsc/ldsc.py \
   --h2 /users/k1806347/oliverpainfel/Data/ukb/GenoPred/output/reference/gwas_sumstat/${pheno}_MVP_AFR/${pheno}_MVP_AFR.sumstats.gz \
   --ref-ld /users/k1806347/oliverpainfel/Software/MyGit/GenoPred/pipeline/resources/data/ld_scores/UKBB.AFR.rsid \
   --w-ld /users/k1806347/oliverpainfel/Software/MyGit/GenoPred/pipeline/resources/data/ld_scores/UKBB.AFR.rsid \
   --out /users/k1806347/oliverpainfel/Analyses/crosspop/sensitivity_mvp/ldsc/results/${pheno}/MVP_AFR/res"
done
library(data.table)
library(ggplot2)
library(cowplot)

# Read in phenotypes
pheno_intersect <- read.table('/users/k1806347/oliverpainfel/Analyses/crosspop/sensitivity_mvp/trait_subset.txt', header=F)$V1

# Plot the heritability estimates
h2_res <- NULL

for(pheno in pheno_intersect){
  log <-
    readLines(
      paste0(
        '/users/k1806347/oliverpainfel/Analyses/crosspop/sensitivity_mvp/ldsc/results/',
        pheno,
        '/',
        'MVP_AFR',
        '/res.log'
      )
    )
  
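  # The ldsc.py log reports lines such as 'Total Observed scale h2: <est> (<SE>)',
  # 'Intercept: <est> (<SE>)' and 'Lambda GC: <value>'; parse the estimates below.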
  h2 <- log[grepl('Total Observed scale h2:', log)]
  h2_est <- as.numeric(gsub(' .*','', gsub('Total Observed scale h2: ', '', h2)))
  h2_se <- as.numeric(gsub("\\)",'', gsub(".* \\(", '', h2)))
  int <- log[grepl('Intercept:', log)]
  int_est <- as.numeric(gsub(' .*','', gsub('Intercept: ', '', int)))
  int_se <- as.numeric(gsub("\\)",'', gsub(".* \\(", '', int)))
  lambda <- log[grepl('Lambda GC:', log)]
  lambda <- as.numeric(gsub('.* ','', lambda))
  
  h2_res <- rbind(
    h2_res,
    data.table(
      Population = 'AFR',
      Phenotype = pheno,
      h2_est = h2_est,
      h2_se = h2_se,
      int_est = int_est,
      int_se = int_se,
      lambda = lambda
    )
  )
}


write.csv(h2_res, '/users/k1806347/oliverpainfel/Analyses/crosspop/sensitivity_mvp/ldsc/results.csv', row.names = F, quote = F)

AVENGEME

Show code

Create predictor list

setwd('~/oliverpainfel/Software/MyGit/GenoPred/pipeline/')
source('../functions/misc.R')
source_all('../functions')
library(data.table)

# Get some key variables from config
config<-'/users/k1806347/oliverpainfel/Data/ukb/GenoPred/configs/crosspop/config_mvp.yaml'
pgs_methods <- read_param(config = config, param = 'pgs_methods', return_obj = F)
outdir <- read_param(config = config, param = 'outdir', return_obj = F)

# Get a list of score files
scores <- list_score_files(config)

# Read in phenotypes
pheno_intersect <- fread('/users/k1806347/oliverpainfel/Analyses/crosspop/sensitivity_mvp/trait_subset.txt', header=F)$V1

# Create files for EAS and AFR targets
pop <- c('AFR')
for(trait_i in pheno_intersect){
  # Make a group containing both GWAS for each single source method
  # Make a group for each multisource method
  scores_i <- scores[grepl(paste0('^', trait_i, '_'), scores$name),]
  scores_i$group <- scores_i$method
  
  for(pop_i in pop){
    # Subset GWAS based on EUR and/or targ_pop_i
    samp_i <- 'MVP_AFR'

    dir.create(
      paste0(
        '/users/k1806347/oliverpainfel/Analyses/crosspop/sensitivity_mvp/targ_',
        pop_i,
        '.disc_',
        pop_i,
        '/',
        trait_i
      ),
      recursive = T
    )
    
    scores_i_j <- scores_i[grepl(samp_i, scores_i$name, ignore.case = T),]
    scores_i_j <- scores_i_j[!grepl('UKB', scores_i_j$name),]
    scores_i_j <- scores_i_j[scores_i_j$method == 'ptclump',]
    scores_i_j$predictor <- paste0(
      outdir,
      '/ukb/pgs/TRANS/',
      scores_i_j$method,
      '/',
      scores_i_j$name,
      '/ukb-',
      scores_i_j$name,
      '-TRANS.profiles'
    )
    
    predictors_i <- scores_i_j[, c('predictor', 'group'), with=F]
    
    write.table(
      predictors_i,
      paste0(
        '/users/k1806347/oliverpainfel/Analyses/crosspop/sensitivity_mvp/targ_',
        pop_i,
        '.disc_',
        pop_i,
        '/',
        trait_i,
        '/predictor_list.ptclump.txt'
      ),
      col.names = T,
      row.names = F,
      quote = F
    )
  }
}

Run model_builder

cd /users/k1806347/oliverpainfel/Software/MyGit/GenoPred/pipeline
conda activate model_builder

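# Test the association of each pT+clump score with the phenotype in the target sample
# using GenoPred's model_builder (the per-score association results are read in below)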
for pop in $(echo AFR); do
  if [ "$pop" == "EUR" ]; then
      pop2="EUR_test"
  else
      pop2=$pop
  fi
  
  for pheno in $(cat /users/k1806347/oliverpainfel/Analyses/crosspop/sensitivity_mvp/trait_subset.txt); do
    sbatch --mem 5G -n 5 -p neurohack_cpu --wrap="Rscript ../Scripts/model_builder/model_builder.R \
      --outcome /users/k1806347/oliverpainfel/Data/ukb/phenotypes/prscsx/${pheno}.unrel.${pop2}.row_number.txt \
      --predictors /users/k1806347/oliverpainfel/Analyses/crosspop/sensitivity_mvp/targ_${pop}.disc_${pop}/${pheno}/predictor_list.ptclump.txt \
      --out /users/k1806347/oliverpainfel/Analyses/crosspop/sensitivity_mvp/targ_${pop}.disc_${pop}/${pheno}/res.ptclump \
      --n_core 5 \
      --all_model F \
      --assoc T"
  done
done

Estimate and plot SNP-heritability and polygenicity from the pT+clump association results

setwd('/users/k1806347/oliverpainfel/Software/MyGit/GenoPred/pipeline/')

library(data.table)
library(ggplot2)
library(cowplot)
library(avengeme)

source('../functions/misc.R')
source_all('../functions')

# Get some key variables from config
config<-'/users/k1806347/oliverpainfel/Data/ukb/GenoPred/configs/crosspop/config_mvp.yaml'
outdir <- read_param(config = config, param = 'outdir', return_obj = F)
gwas_list <- read_param(config = config, param = 'gwas_list', return_obj = T)

# Read in phenotypes
pheno_intersect <- read.table('/users/k1806347/oliverpainfel/Analyses/crosspop/sensitivity_mvp/trait_subset.txt', header=F)$V1

pop = c('AFR')

mod_res_all <- NULL
for(pop_i in pop){
  for(pheno_i in pheno_intersect){
    gwas_i<-gwas_list$name[gwas_list$population == pop_i & grepl(paste0('^', pheno_i, '_'),  gwas_list$name)]
      
    res_i <-
      fread(
        paste0(
          '/users/k1806347/oliverpainfel/Analyses/crosspop/sensitivity_mvp/targ_',
          pop_i,
          '.disc_',
          pop_i,
          '/',
          pheno_i,
          '/res.ptclump.assoc.txt'
        )
      )
    
    res_i$Z <- res_i$BETA / res_i$SE
    
    res_i$pT <- as.numeric(gsub('e.','e-', gsub('.*UKB\\.0\\.|.*BBJ\\.0\\.|.*UGR\\.0\\.|.*MVP.AFR\\.0\\.', '', res_i$Predictor)))

    nsnp_log <-
      read.table(
        paste0(
          outdir,
          '/reference/pgs_score_files/ptclump/',
          gwas_i,
          '/ref-',
          gwas_i,
          '.NSNP_per_pT'
        ),
        header = T
      )
    
    nsnp<-nsnp_log$NSNP[nrow(nsnp_log)]
    
    disc_N <-
      median(
        fread(
          paste0(
            outdir,
            '/reference/gwas_sumstat/',
            gwas_i,
            '/',
            gwas_i,
            '-cleaned.gz'
          ), nrows = 10000
        )$N
      )
    
    targ_N <- res_i$N[1]
    
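    # AVENGEME's estimatePolygenicModel infers the SNP-based heritability (vg) and the
    # proportion of null SNPs (pi0) from the pT+clump score association results across
    # p-value thresholds, given the number of SNPs and the discovery/target sample sizes.
    # fixvg2pi02 = T assumes the same vg and pi0 in the discovery and target samples.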
    mod_res <- estimatePolygenicModel(
      p = res_i$Z,
      nsnp = nsnp,
      n = c(disc_N, targ_N),
      pupper = c(0, res_i$pT),
      fixvg2pi02 = T,
      alpha = 0.05
    )
    
    mod_res_all <- rbind(
      mod_res_all,
      data.frame(
        Phenotype = pheno_i,
        Population = pop_i,
        GWAS = gwas_i,
        nsnp = nsnp,
        max_r2 = max(res_i$Obs_R2),
        n_disc = disc_N,
        n_targ = targ_N,
        vg_est = mod_res$vg[1],
        vg_lowCI = mod_res$vg[2],
        vg_highCI = mod_res$vg[3],
        pi0_est = mod_res$pi0[1],
        pi0_lowCI = mod_res$pi0[2],
        pi0_highCI = mod_res$pi0[3]
      )
    )
  }
}

dir.create('/users/k1806347/oliverpainfel/Analyses/crosspop/sensitivity_mvp/avengeme')
write.csv(mod_res_all, '/users/k1806347/oliverpainfel/Analyses/crosspop/sensitivity_mvp/avengeme/results.csv', row.names = F, quote = F)

mod_res_all<-fread('/users/k1806347/oliverpainfel/Analyses/crosspop/sensitivity_mvp/avengeme/results.csv')

png('~/oliverpainfel/Analyses/crosspop/sensitivity_mvp/plots/avengeme_h2.png', res = 100, width = 900, height = 500, units = 'px')
ggplot(mod_res_all, aes(x = Phenotype, y = vg_est, fill = Population)) +
  geom_bar(stat="identity", position=position_dodge(preserve = "single"), width = 0.7) +
  geom_errorbar(aes(ymin=vg_lowCI, ymax=vg_highCI), width=.2, position=position_dodge(width = 0.7, preserve = "single")) +
  labs(y="SNP-based Heritability (95%CI)", fill = NULL) +
  theme_half_open() +
  theme(axis.text.x = element_text(angle = 45, hjust = 1),
        legend.position = "top",
        legend.key.spacing.x = unit(1, "cm"),
        legend.justification = "center") +
  background_grid(major = 'y', minor = 'y')
dev.off()

png('~/oliverpainfel/Analyses/crosspop/sensitivity_mvp/plots/avengeme_polygenicity.png', res = 100, width = 900, height = 500, units = 'px')
ggplot(mod_res_all, aes(x = Phenotype, y = 1 - pi0_est, fill = Population)) +
  geom_bar(stat="identity", position=position_dodge(preserve = "single"), width = 0.7) +
  geom_errorbar(aes(ymin=1 - pi0_lowCI, ymax=1 - pi0_highCI), width=.2, position=position_dodge(width = 0.7, preserve = "single")) +
  labs(y="Proportion non-zero\neffects (95%CI)", fill = NULL) +
  theme_half_open() +
  coord_cartesian(ylim = c(0, 0.15)) + 
  theme(axis.text.x = element_text(angle = 45, hjust = 1),
        legend.position = "top",
        legend.key.spacing.x = unit(1, "cm"),
        legend.justification = "center") +
  background_grid(major = 'y', minor = 'y')
dev.off()

summary(mod_res_all$max_r2)
summary(mod_res_all$max_r2[mod_res_all$Population == 'EUR'])
summary(mod_res_all$max_r2[mod_res_all$Population == 'EAS'])
summary(mod_res_all$max_r2[mod_res_all$Population == 'AFR'])

Make table

Make a table showing GWAS information for the manuscript.

Show code
library(data.table)

#####
# Trait names, labels, and URLs
#####

mvp <- fread('~/oliverpainfel/Data/GWAS_sumstats/MVP/AFR/info.txt')
mvp <- mvp[, c('efoTraits','labels','url'), with=F]
names(mvp) <- c('trait', 'labels','url')
mvp$sample <- 'MVP'
mvp$population <- 'AFR'

#####
# Sample size, SNP-h2 and polygenicity
#####

# Read in the AVENGEME and LDSC results
avengeme <- fread('/users/k1806347/oliverpainfel/Analyses/crosspop/sensitivity_mvp/avengeme/results.csv')
ldsc <- fread('/users/k1806347/oliverpainfel/Analyses/crosspop/sensitivity_mvp/ldsc/results.csv')
both <- merge(avengeme, ldsc, by = c('Population','Phenotype'))

# Format for descriptives table
both$h2_avengeme<- paste0(
  round(both$vg_est,2), 
  " (95%CI = ", 
  round(both$vg_lowCI, 2), 
  " - " , 
  round(both$vg_highCI, 2), ")")

both$pi0_avengeme <- paste0(
  round(both$pi0_est,2), 
  " (95%CI = ", 
  round(both$pi0_lowCI, 2), 
  " - " , 
  round(both$pi0_highCI, 2), ")")

both$h2_ldsc <- paste0(
  round(both$h2_est,2), 
  " (SE = ", 
  round(both$h2_se, 2), 
  ")")

both$int_ldsc <- paste0(
  round(both$int_est,2), 
  " (SE = ", 
  round(both$int_se, 2), 
  ")")

both<-both[, c('Population','Phenotype','n_disc','n_targ','h2_avengeme','pi0_avengeme','h2_ldsc','int_ldsc','lambda'), with = F]
names(both)[1:2]<-c('population','labels')

info_all <- merge(mvp, both, by = c('labels','population'))
info_all$n_disc<-round(info_all$n_disc, 0)
info_all$n_targ<-round(info_all$n_targ, 0)

info_all<-info_all[, c('labels','trait','population','sample','n_disc','n_targ','h2_avengeme','pi0_avengeme','h2_ldsc','int_ldsc','lambda','url'), with=F]
names(info_all) <- c('Trait Label', 'Trait Description', 'Ancestry', 'GWAS Sample', 'GWAS N', 'Target N',"SNP-h2 (AVENGEME)","pi0 (AVENGEME)","SNP-h2 (LDSC)","Intercept (LDSC)",'Lambda', 'URL')

write.csv(info_all, '/users/k1806347/oliverpainfel/Analyses/crosspop/sensitivity_mvp/gwas_descriptives.csv', row.names=F)

# Estimate the median, mean, and SD of GWAS and target sample sizes within each GWAS sample for the selected traits
info_all_selected<-info_all
n_dat <- NULL
for(i in unique(info_all_selected$`GWAS Sample`)){
  n_dat <-rbind(
    n_dat,
    data.table(
      sample = i,
      gwas_n_median = round(median(info_all_selected$`GWAS N`[info_all_selected$`GWAS Sample` == i])),
      gwas_n_mean = round(mean(info_all_selected$`GWAS N`[info_all_selected$`GWAS Sample` == i])),
      gwas_n_sd = round(sd(info_all_selected$`GWAS N`[info_all_selected$`GWAS Sample` == i])),
      target_n_median = round(median(info_all_selected$`Target N`[info_all_selected$`GWAS Sample` == i])),
      target_n_mean = round(mean(info_all_selected$`Target N`[info_all_selected$`GWAS Sample` == i])),
      target_n_sd = round(sd(info_all_selected$`Target N`[info_all_selected$`GWAS Sample` == i]))
    )
  )
}
Show descriptives table
| Trait Label | Trait Description | Ancestry | GWAS Sample | GWAS N | Target N | SNP-h2 (AVENGEME) | pi0 (AVENGEME) | SNP-h2 (LDSC) | Intercept (LDSC) | Lambda | URL |
|---|---|---|---|---|---|---|---|---|---|---|---|
| BMI | body mass index | AFR | MVP | 118993 | 6646 | 0.12 (95%CI = 0.11 - 0.14) | 0.96 (95%CI = 0.95 - 0.97) | 0.37 (SE = 0.03) | 1.32 (SE = 0.02) | 1.6715 | http://ftp.ebi.ac.uk/pub/databases/gwas/summary_statistics/GCST90475001-GCST90476000/GCST90475155/GCST90475155.tsv.gz |
| BWT | body weight | AFR | MVP | 119279 | 6659 | 0.14 (95%CI = 0.13 - 0.16) | 0.96 (95%CI = 0.95 - 0.97) | 0.4 (SE = 0.03) | 1.36 (SE = 0.03) | 1.7334 | http://ftp.ebi.ac.uk/pub/databases/gwas/summary_statistics/GCST90476001-GCST90477000/GCST90476462/GCST90476462.tsv.gz |
| HB | hemoglobin measurement | AFR | MVP | 114985 | 6375 | 0.06 (95%CI = 0.05 - 0.07) | 0.99 (95%CI = 0.99 - 0.99) | 0.19 (SE = 0.02) | 1.2 (SE = 0.02) | 1.3685 | http://ftp.ebi.ac.uk/pub/databases/gwas/summary_statistics/GCST90475001-GCST90476000/GCST90475375/GCST90475375.tsv.gz |
| HDL | high density lipoprotein cholesterol measurement | AFR | MVP | 113085 | 5863 | 0.11 (95%CI = 0.1 - 0.12) | 0.99 (95%CI = 0.99 - 0.99) | 0.32 (SE = 0.04) | 1.3 (SE = 0.02) | 1.5071 | http://ftp.ebi.ac.uk/pub/databases/gwas/summary_statistics/GCST90475001-GCST90476000/GCST90475351/GCST90475351.tsv.gz |
| HT | body height | AFR | MVP | 119012 | 6658 | 0.14 (95%CI = 0.12 - 0.15) | 0.96 (95%CI = 0.95 - 0.97) | 0.93 (SE = 0.08) | 1.69 (SE = 0.06) | 2.2871 | http://ftp.ebi.ac.uk/pub/databases/gwas/summary_statistics/GCST90475001-GCST90476000/GCST90475361/GCST90475361.tsv.gz |
| MCHC | mean corpuscular hemoglobin concentration | AFR | MVP | 114851 | 6375 | 0.09 (95%CI = 0.08 - 0.1) | 0.99 (95%CI = 0.99 - 0.99) | 0.27 (SE = 0.1) | 1.32 (SE = 0.03) | 1.4034 | http://ftp.ebi.ac.uk/pub/databases/gwas/summary_statistics/GCST90475001-GCST90476000/GCST90475457/GCST90475457.tsv.gz |
| NEU | neutrophil count | AFR | MVP | 73258 | 6353 | 0.36 (95%CI = 0.34 - 0.38) | 0.97 (95%CI = 0.96 - 0.97) | 2.32 (SE = 1.19) | 1.43 (SE = 0.07) | 1.4459 | http://ftp.ebi.ac.uk/pub/databases/gwas/summary_statistics/GCST90475001-GCST90476000/GCST90475528/GCST90475528.tsv.gz |
| PLT | platelet count | AFR | MVP | 114731 | 6375 | 0.1 (95%CI = 0.09 - 0.11) | 0.99 (95%CI = 0.99 - 0.99) | 0.38 (SE = 0.03) | 1.38 (SE = 0.02) | 1.6639 | http://ftp.ebi.ac.uk/pub/databases/gwas/summary_statistics/GCST90476001-GCST90477000/GCST90476298/GCST90476298.tsv.gz |
| SBP | systolic blood pressure | AFR | MVP | 119331 | 6658 | 0.06 (95%CI = 0.05 - 0.07) | 0.99 (95%CI = 0.98 - 0.99) | 0.19 (SE = 0.02) | 1.2 (SE = 0.01) | 1.3964 | http://ftp.ebi.ac.uk/pub/databases/gwas/summary_statistics/GCST90476001-GCST90477000/GCST90476402/GCST90476402.tsv.gz |
| TC | total cholesterol measurement | AFR | MVP | 112265 | 6324 | 0.1 (95%CI = 0.09 - 0.12) | 0.99 (95%CI = 0.99 - 0.99) | 0.2 (SE = 0.04) | 1.25 (SE = 0.02) | 1.3615 | http://ftp.ebi.ac.uk/pub/databases/gwas/summary_statistics/GCST90476001-GCST90477000/GCST90476423/GCST90476423.tsv.gz |

LEOPARD+QuickPRS

Here we will compare the LEOPARD-estimated weights for population-specific PGS to the weights estimated using observed data in the UKB target sample.
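
As a minimal sketch of the comparison (using hypothetical coefficient values only; the real values are read from the LEOPARD .weights.rds files and the model_builder final models below), the observed weight of each population-specific PGS is its non-negative regression coefficient rescaled to sum to one:

# Minimal sketch with hypothetical coefficients (illustration only)
beta <- c(EUR = 0.12, AFR = 0.08)  # hypothetical model_builder coefficients for the two PGS
beta[beta < 0] <- 0                # truncate negative weights at zero, as in the code below
obs_weight <- beta / sum(beta)     # observed weights: EUR = 0.6, AFR = 0.4
# These are compared against the LEOPARD weights, which are plotted on the same 0-1 scale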

Show code
setwd('/users/k1806347/oliverpainfel/Software/MyGit/GenoPred/pipeline/')

library(data.table)
library(ggplot2)
library(cowplot)

source('../functions/misc.R')
source_all('../functions')

# Read in list of outcomes 
selected_traits<-fread('/users/k1806347/oliverpainfel/Analyses/crosspop/trait_subset.txt', header=F)$V1

# Get some key variables from config
config<-'/users/k1806347/oliverpainfel/Data/ukb/GenoPred/configs/crosspop/config_mvp.yaml'
outdir <- read_param(config = config, param = 'outdir', return_obj = F)

# Get a list of score files
scores <- list_score_files(config)

###
# Read in weights estimated by LEOPARD (QuickPRS)
###

leopard_weights<-NULL
scores_quickprs <- scores$name[scores$method == 'quickprs_multi']
for(i in selected_traits){
  scores_i <- scores_quickprs[grepl(paste0('^', i,'_'), scores_quickprs)]
  for(j in scores_i){
      weights_file <- readRDS(paste0(outdir, '/reference/pgs_score_files/leopard/', j, '/ref-', j, '.weights.rds'))
      weights_file <- data.frame(weights_file)
      
      weights <-
        data.table(
          Target = do.call(c, lapply(names(weights_file), function(x) rep(x, 2))),
          Discovery = names(weights_file),
          Weight = do.call(c, lapply(weights_file, function(x) x)),
          Trait = i,
          Method = 'LEOPARD'
        )
      
      leopard_weights <- rbind(leopard_weights, weights)
  }
}

#####
# Read in the PGS weights estimated using UKB data
#####
# Read in the final model coefficients for multi-source methods

obs_weights<-NULL
for(method_i in unique(scores$method)[!(unique(scores$method) %in% pgs_group_methods)]){
  scores_method<-scores$name[scores$method == method_i]
  method_i <- gsub('_multi','', method_i)

  for(i in selected_traits){
    for(j in c('AFR')){
      if(j == 'EUR'){
        pops <- c('EAS','AFR')
      } else {
        pops <- j
      }
      
      for(k in pops){
        model <- fread(paste0('~/oliverpainfel/Analyses/crosspop/sensitivity_mvp/targ_', j, '.disc_EUR_', k, '/', i, '/final_models/', method_i, '.pseudo.multi.final_model.txt'))
        model<-model[-1,]
        
        # Set negative weights to zero (as LEOPARD does) and the remaining weights to one; they are rescaled to sum to 1 below
        if(any(model$V2 < 0)){
          model$V2[model$V2 < 0] <- 0
          model$V2[model$V2 > 0] <- 1
        }
        
        names(model) <- c('x', 'BETA')
        model$Discovery[grepl('UKB', model$x)]<-'EUR'
        model$Discovery[grepl('MVP', model$x)]<-'AFR'
        model$Target <- j
        model$Weight <- model$BETA/sum(model$BETA)
        model$Trait <- i
        model$Method <- method_i
        model<-model[,c('Target','Discovery','Weight','Method','Trait'), with=F]
        obs_weights<-rbind(obs_weights, model)
      }
    }
  }
}

###
# Estimate weights if using the inverse variance weighting
###
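# The sampling variance of a GWAS effect estimate is roughly proportional to 1/N, so
# weighting each population's PGS by N / (N_EUR + N_AFR) approximates inverse-variance weighting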

# Read in GWAS descriptives
gwas_desc<-fread('/users/k1806347/oliverpainfel/Analyses/crosspop/gwas_descriptives.csv')
gwas_desc <- gwas_desc[, c('Trait Label','Ancestry','GWAS N'), with=F]
names(gwas_desc)<-c('trait','ancestry','n')
gwas_desc<-gwas_desc[gwas_desc$trait %in% selected_traits,]

gwas_desc <- gwas_desc[gwas_desc$ancestry == 'EUR',]

gwas_desc_mvp <- fread('/users/k1806347/oliverpainfel/Analyses/crosspop/sensitivity_mvp/gwas_descriptives.csv')
gwas_desc_mvp <- gwas_desc_mvp[, c('Trait Label','Ancestry','GWAS N'), with=F]
names(gwas_desc_mvp)<-c('trait','ancestry','n')
gwas_desc_mvp<-gwas_desc_mvp[gwas_desc_mvp$trait %in% selected_traits,]

gwas_desc<-rbind(gwas_desc, gwas_desc_mvp)

library(dplyr)
library(tidyr)

# Reshape GWAS table to wide format
wide_gwas <- gwas_desc %>%
  pivot_wider(names_from = ancestry, values_from = n, values_fill = 0)

# Create inverse-variance (sample-size) weight rows for each target/discovery pair
make_weights_long <- wide_gwas %>%
  rowwise() %>%
  do({
    trait <- .$trait
    eur <- .$EUR
    afr <- .$AFR
    
    tibble(
      Trait = trait,
      Method = "inverse_var",
      Target = c("AFR", "AFR", "EUR", "EUR"),
      Discovery = c("EUR", "AFR", "EUR", "AFR"),
      Weight = c(
        eur / (eur + afr), afr / (eur + afr),  # AFR target
        eur / (eur + afr), afr / (eur + afr)  # EUR target (vs AFR)
      )
    )
  }) %>%
  bind_rows()

###
# Combine and compare
###

both <- do.call(rbind, list(obs_weights, leopard_weights, make_weights_long))

# Remove ptclump as it does not have a SumStatTune model
both <- both[both$Method != 'ptclump',]

both<-merge(both, pgs_method_labels, by.x = 'Method', by.y = 'method', all.x=T, sort = F)
both$label[is.na(both$label)] <- both$Method[is.na(both$label)]
both$label <- factor(both$label, levels=unique(both$label))

# Plot non-EUR target first
tmp <- both[both$Target != 'EUR',]
tmp <- tmp[tmp$Discovery != 'EUR',]

# Set LEOPARD to black fill
default_colors <- hue_pal()(10)
names(default_colors) <- levels(tmp$label)
default_colors["LEOPARD"] <- "black"

# Plot the estimated and observed weights
png('~/oliverpainfel/Analyses/crosspop/sensitivity_mvp/plots/leopard_weights.png', units = 'px', res = 300, width = 2500, height = 1500)
ggplot(tmp, aes(x = Trait, y = Weight, fill = label)) +
  geom_bar(width= 0.7, position=position_dodge(0.7), stat="identity", colour = 'black', size = 0.1) +
  scale_fill_manual(values = default_colors) +
  facet_grid(Target ~ .) +
  theme_half_open() +
  labs(title = 'Weight of target ancestry-matched PGS', fill = NULL) +
  background_grid(major = 'y', minor = 'y') + 
  panel_border() +
  theme(axis.text.x = element_text(angle = 45, hjust = 1)) +
  ylim(c(0,1))
dev.off()

###
# Check calibration of LEOPARD compared to QuickPRS observed weights
###

tmp <- both[both$Target != 'EUR',]
tmp$Target<-NULL
tmp_wide <- reshape(tmp, 
                     idvar = c("Trait", "Discovery"), 
                     timevar = "label", 
                     direction = "wide")

names(tmp_wide) <- gsub('Weight.', '', names(tmp_wide))
tmp_wide<-tmp_wide[, !(grepl('Method', names(tmp_wide))), with = F]

tmp_wide_afr <- tmp_wide[tmp_wide$Discovery == 'AFR',]

# Calculate metrics
rmse_afr <- sqrt(mean((tmp_wide_afr$QuickPRS - tmp_wide_afr$LEOPARD)^2))
me_afr <- mean(tmp_wide_afr$QuickPRS - tmp_wide_afr$LEOPARD)
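# RMSE summarises the typical magnitude of miscalibration; ME (mean of QuickPRS - LEOPARD)
# indicates systematic bias, with positive values meaning LEOPARD underestimates the observed weights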

# Create annotation data.frame
metrics_df <- data.frame(
  Discovery = c("AFR"),
  x = c(0.5),         # Adjust positions as needed
  y = c(-0.05),
  label = c(
    paste0("RMSE = ", round(rmse_afr, 2), "\nME = ", round(me_afr, 2))
  )
)

png('~/oliverpainfel/Analyses/crosspop/sensitivity_mvp/plots/leopard_weights_calibration.png', units = 'px', width = 2000, height = 2000, res = 300)
ggplot(tmp_wide[tmp_wide$Discovery != 'EUR',], aes(x = LEOPARD, y = QuickPRS)) +
  geom_abline(slope = 1, intercept = 0, linetype = "dashed", colour = "grey40") +  # Perfect calibration
  geom_smooth(method = "lm", se = TRUE, colour = "blue") +  # Regression line
  geom_point(alpha = 0.7) +
  geom_text(data = metrics_df, aes(x = x, y = y, label = label), inherit.aes = FALSE, hjust = 0, size = 3.5) +
  labs(
    x = "LEOPARD weight",
    y = "Observed weight"
  ) +
  theme_half_open() +
  panel_border() + 
  theme(
    axis.text.x = element_text(angle = 90, vjust = 0.5, hjust=1)
  ) +
  coord_fixed()    
dev.off()

###
# Check calibration of inverse_var compared to QuickPRS observed weights
###

tmp <- both[both$Target != 'EUR',]
tmp$Target<-NULL
tmp_wide <- reshape(tmp, 
                     idvar = c("Trait", "Discovery"), 
                     timevar = "label", 
                     direction = "wide")

names(tmp_wide) <- gsub('Weight.', '', names(tmp_wide))
tmp_wide<-tmp_wide[, !(grepl('Method', names(tmp_wide))), with = F]

tmp_wide_afr <- tmp_wide[tmp_wide$Discovery == 'AFR',]

# Calculate metrics
rmse_afr <- sqrt(mean((tmp_wide_afr$QuickPRS - tmp_wide_afr$inverse_var)^2))
me_afr <- mean(tmp_wide_afr$QuickPRS - tmp_wide_afr$inverse_var)


# Create annotation data.frame
metrics_df <- data.frame(
  Discovery = c("AFR"),
  x = c(0.5),         # Adjust positions as needed
  y = c(-0.05),
  label = c(
    paste0("RMSE = ", round(rmse_afr, 2), "\nME = ", round(me_afr, 2))
  )
)

png('~/oliverpainfel/Analyses/crosspop/sensitivity_mvp/plots/inverse_var_weights_calibration.png', units = 'px', width = 2000, height = 2000, res = 300)
ggplot(tmp_wide[tmp_wide$Discovery != 'EUR',], aes(x = inverse_var, y = QuickPRS)) +
  geom_abline(slope = 1, intercept = 0, linetype = "dashed", colour = "grey40") +  # Perfect calibration
  geom_smooth(method = "lm", se = TRUE, colour = "blue") +  # Regression line
  geom_point(alpha = 0.7) +
  geom_text(data = metrics_df, aes(x = x, y = y, label = label), inherit.aes = FALSE, hjust = 1.5, size = 3.5) +
  labs(
    x = "inverse_var weight",
    y = "Observed weight"
  ) +
  theme_half_open() +
  panel_border() + 
  theme(
    axis.text.x = element_text(angle = 90, vjust = 0.5, hjust = 1)
  ) +
  coord_fixed()    
dev.off()

###
# Check calibration of observed weights across all methods
###
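# For each pair of methods, compute the RMSE between their observed weights
# across traits, plot the pairwise RMSE matrix, and then summarise each
# method by its average RMSE against all other methods.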

tmp <- both[both$Target != 'EUR',]
tmp <- tmp[!(tmp$label %in% c('LEOPARD','inverse_var')),]
tmp$Target<-NULL
tmp_wide <- reshape(tmp, 
                     idvar = c("Trait", "Discovery"), 
                     timevar = "label", 
                     direction = "wide")

names(tmp_wide) <- gsub('Weight.', '', names(tmp_wide))
tmp_wide<-tmp_wide[, !(grepl('Method', names(tmp_wide))), with = F]

tmp_wide <- tmp_wide[tmp_wide$Discovery %in% c('AFR'),]

metrics <- NULL
for(i in c('AFR')){
  for(j in unique(tmp$label)){
    for(k in unique(tmp$label)){
      tmp_wide_i <- tmp_wide[tmp_wide$Discovery == i,]
      rmse <- sqrt(mean((tmp_wide_i[[j]] - tmp_wide_i[[k]])^2))
      me <- mean(tmp_wide_i[[j]] - tmp_wide_i[[k]])
      
      metrics <- rbind(
        metrics,
        data.frame(
          Population = i,
          Method1 = j,
          Method2 = k,
          rmse = rmse,
          me = me
        )
      )
    }
  }
}

png('~/oliverpainfel/Analyses/crosspop/sensitivity_mvp/plots/observed_weights_calibration.png', units = 'px', width = 2000, height = 1650, res = 300)
ggplot(metrics, aes(x = Method1, y = Method2, fill = rmse)) +
  geom_tile(color = "white") +  # Tile plot with white borders
  geom_text(aes(label = round(rmse, 2)), color = "black") +  # Add correlation values
  scale_fill_gradient2(mid = "white", high = "red", midpoint = 0) +  # Color scale
  theme_half_open() +
  panel_border() + 
  theme(
    axis.text.x = element_text(angle = 45, hjust = 1),
    axis.title = element_blank()
  ) +
  labs(fill = "RMSE")
dev.off()

# Calculate average RMSE for each method against all other methods
metrics_unique <- metrics[metrics$Method1 != metrics$Method2, ]
metrics_unique$Comparison <- NA
for (i in 1:nrow(metrics_unique)) {
  metrics_unique$Comparison[i] <-
    paste0(sort(c(
      metrics_unique$Method1[i], metrics_unique$Method2[i]
    )), collapse = ' vs. ')
}
metrics_unique <- metrics_unique[!duplicated(paste0(metrics_unique$Population, metrics_unique$Comparison)),]

mean_rmse <- NULL
for(i in unique(tmp$label)){
  for(j in c('AFR')){
    metrics_unique_tmp <- metrics_unique[metrics_unique$Method1 == i | metrics_unique$Method2 == i,]
    metrics_unique_tmp <- metrics_unique_tmp[metrics_unique_tmp$Population == j,]
    mean_rmse <- rbind(
      mean_rmse, 
      data.frame(
        Method = i,
        Population = j,
        avg_rmse = mean(metrics_unique_tmp$rmse)
      )
    )
  }
}

png('~/oliverpainfel/Analyses/crosspop/sensitivity_mvp/plots/avg_observed_weight_rmse.png', units = 'px', width = 1500, height = 800, res = 300)
ggplot(mean_rmse, aes(x = Method, y = avg_rmse, fill = Method)) +
  geom_bar(width= 0.7, position=position_dodge(0.7), stat="identity", size = 0.1) +
  geom_text(aes(label = round(avg_rmse, 3)),  # Label each bar with its value
            vjust = 1.5,                      # Place labels just inside the top of the bar
            size = 3) +
  scale_fill_manual(values = default_colors) +
  theme_half_open() +
  background_grid(major = 'y', minor = 'y') + 
  panel_border() +
  labs(y = 'Average RMSE') +
  theme(axis.text.x = element_text(angle = 45, hjust = 1),
        legend.position="none")
dev.off()

###
# Check calibration of estimated (LEOPARD and inverse_var) weights compared to observed QuickPRS weights
###

tmp <- both[both$Target != 'EUR',]
tmp <- tmp[(tmp$label %in% c('LEOPARD','inverse_var','QuickPRS')),]
tmp$Target<-NULL
tmp_wide <- reshape(tmp, 
                     idvar = c("Trait", "Discovery"), 
                     timevar = "label", 
                     direction = "wide")

names(tmp_wide) <- gsub('Weight.', '', names(tmp_wide))
tmp_wide<-tmp_wide[, !(grepl('Method', names(tmp_wide))), with = F]

tmp_wide <- tmp_wide[tmp_wide$Discovery %in% c('EAS','AFR'),]

metrics <- NULL
for(i in c('AFR')){
  for(j in unique(tmp$label)){
    for(k in unique(tmp$label)){
      tmp_wide_i <- tmp_wide[tmp_wide$Discovery == i,]
      rmse <- sqrt(mean((tmp_wide_i[[j]] - tmp_wide_i[[k]])^2))
      me <- mean(tmp_wide_i[[j]] - tmp_wide_i[[k]])
      
      metrics <- rbind(
        metrics,
        data.frame(
          Population = i,
          Method1 = j,
          Method2 = k,
          rmse = rmse,
          me = me
        )
      )
    }
  }
}

# Plot the rmse for LEOPARD and inverse_var predicting observed QuickPRS weight
metrics <- metrics[metrics$Method1 == 'QuickPRS',]
metrics <- metrics[metrics$Method2 != 'QuickPRS',]

png('~/oliverpainfel/Analyses/crosspop/sensitivity_mvp/plots/inverse_var_comp_rmse.png', units = 'px', width = 800, height = 1500, res = 300)
ggplot(metrics, aes(x = Method2, y = rmse, fill = Method2)) +
  geom_bar(width= 0.7, position=position_dodge(0.7), stat="identity", size = 0.1) +
  geom_text(aes(label = round(rmse, 3)),  # Label each bar with its value
            vjust = 1.5,                  # Place labels just inside the top of the bar
            size = 3) +
  theme_half_open() +
  background_grid(major = 'y', minor = 'y') + 
  panel_border() +
  labs(y = 'RMSE relative to QuickPRS', x = 'Method') +
  theme(axis.text.x = element_text(angle = 45, hjust = 1),
        legend.position="none")
dev.off()

Check genetic distances

We will use two approaches to compare the ancestry within the GWAS and LD reference samples:

1. Project reference genetic principal components into the GWAS samples.
2. Estimate ancestral proportions within the GWAS and LD reference samples using bigsnpr.


Compare genetic principal components

This involves using the allele frequencies within the GWAS summary statistics to estimate the mean of reference genetic principal components in the GWAS samples.
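As a rough illustration of this projection (with hypothetical object names; the full code under “Show code” below does the same per population using data.table), the mean PC position implied by a set of allele frequencies is simply the dot product of those frequencies with the per-SNP PC weights:

# Minimal sketch: `pc_weights` is a matrix of SNP-weights (one column per PC)
# and `a1_freq` a vector of A1 allele frequencies aligned to the same SNPs.
# Both objects are hypothetical and for illustration only.
project_freqs <- function(pc_weights, a1_freq) {
  colSums(as.matrix(pc_weights) * a1_freq, na.rm = TRUE)
}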

Show code
# Read in reference PC SNP-weights
pc_score_file <- fread('~/oliverpainfel/Data/ukb/GenoPred/output/reference/pc_score_files/TRANS/ref-TRANS-pcs.eigenvec.var.gz')

# Read in allele frequencies for each reference population
ref_pop_freq <- list()
for(i in c('EUR','EAS','AFR','AMR','MID','CSA')){
  freq_i <- NULL
  for(j in 1:22){
    freq_i <- rbind(freq_i, fread(paste0('~/oliverpainfel/Software/MyGit/GenoPred/pipeline/resources/data/ref/freq_files/', i,'/ref.', i, '.chr', j, '.afreq')))
  }
  ref_pop_freq[[i]] <- freq_i
}

# Calculate PCs for reference populations
ref_pc_all <- NULL
for(i in c('EUR','EAS','AFR','AMR','MID','CSA')){
  # Merge on SNP ID
  merged <- merge(
    pc_score_file, 
    ref_pop_freq[[i]], 
    by.x = "SNP", 
    by.y = "ID"
  )
  
  # Derive the A1 allele frequency from the ALT frequency (if ALT != A1, A1_FREQ = 1 - ALT_FREQS)
  merged[, A1_FREQ := ifelse(ALT == A1, ALT_FREQS, 1 - ALT_FREQS)]
  
  # Calculate PCs using dot product: sum(allele_freq * SNP_weight) for each PC
  ref_PC <- merged[, lapply(.SD, function(w) sum(w * A1_FREQ, na.rm = TRUE)), 
                   .SDcols = patterns("^PC")]
  
  ref_pc_all <- rbind(ref_pc_all, data.frame(group = i,
                                             ref_PC))
}

# Read in GWAS allele frequencies for BMI
gwas_list_main <- fread('/users/k1806347/oliverpainfel/Data/ukb/GenoPred/configs/crosspop/gwas_list.txt')
gwas_list_main <- gwas_list_main[grepl('BMI', gwas_list_main$name),]

gwas_list_mvp <- fread('/users/k1806347/oliverpainfel/Data/ukb/GenoPred/configs/crosspop/gwas_list_mvp.txt')
gwas_list_mvp <- gwas_list_mvp[grepl('BMI', gwas_list_mvp$name),]

gwas_list<-rbind(gwas_list_main, gwas_list_mvp)

outdir <- '/users/k1806347/oliverpainfel/Data/ukb/GenoPred/output'
gwas_freq <- list()
for(i in 1:nrow(gwas_list)){
  ss_i <- fread(paste0(
            outdir,
            '/reference/gwas_sumstat/',
            gwas_list$name[i],
            '/',
            gwas_list$name[i],
            '-cleaned.gz'
          ))
  
  gwas_freq[[gwas_list$name[i]]] <- ss_i[, c('SNP','A1','A2','FREQ'), with = F]
}

gwas_pc_all <- NULL
for(i in names(gwas_freq)){
  # Merge on SNP ID
  merged <- merge(
    pc_score_file, 
    gwas_freq[[i]], 
    by = "SNP"
  )
  
  # Align GWAS frequencies to the score-file A1 allele (flip if the A1 alleles differ)
  merged[, FREQ := ifelse(A1.x == A1.y, FREQ, 1 - FREQ)]
  
  # Calculate PCs using dot product: sum(allele_freq * SNP_weight) for each PC
  ref_PC <- merged[, lapply(.SD, function(w) sum(w * FREQ, na.rm = TRUE)), 
                   .SDcols = patterns("^PC")]
  
  gwas_pc_all <- rbind(gwas_pc_all, data.frame(group = i,
                                             ref_PC))
}

ref_pc_all$type <- 'Reference'
gwas_pc_all$type <- 'GWAS'

pc_all <- rbind(ref_pc_all, gwas_pc_all)

ggplot(pc_all, aes(x=PC1, y = PC2, colour = group, shape = type)) +
  geom_point(size = 5)

# There is a big shift in the PCs between the GWAS and reference data, probably due to missing variants. Restrict to variants present in all GWAS.

extract <- Reduce(intersect, list(
  pc_score_file$SNP,
  gwas_freq$BMI_UKB$SNP,
  gwas_freq$BMI_BBJ$SNP,
  gwas_freq$BMI_UGR$SNP,
  gwas_freq$BMI_MVP_AFR$SNP
))

pc_score_file_overlapping <- pc_score_file[pc_score_file$SNP %in% extract,]

# Calculate PCs for reference populations
ref_pc_all <- NULL
for(i in c('EUR','EAS','AFR','AMR','MID','CSA')){
  # Merge on SNP ID
  merged <- merge(
    pc_score_file_overlapping, 
    ref_pop_freq[[i]], 
    by.x = "SNP", 
    by.y = "ID"
  )
  
  # Derive the A1 allele frequency from the ALT frequency (if ALT != A1, A1_FREQ = 1 - ALT_FREQS)
  merged[, A1_FREQ := ifelse(ALT == A1, ALT_FREQS, 1 - ALT_FREQS)]
  
  # Calculate PCs using dot product: sum(allele_freq * SNP_weight) for each PC
  ref_PC <- merged[, lapply(.SD, function(w) sum(w * A1_FREQ, na.rm = TRUE)), 
                   .SDcols = patterns("^PC")]
  
  ref_pc_all <- rbind(ref_pc_all, data.frame(group = i,
                                             ref_PC))
}

gwas_pc_all <- NULL
for(i in names(gwas_freq)){
  # Merge on SNP ID
  merged <- merge(
    pc_score_file_overlapping, 
    gwas_freq[[i]], 
    by = "SNP"
  )
  
  # Align GWAS frequencies to the score-file A1 allele (flip if the A1 alleles differ)
  merged[, FREQ := ifelse(A1.x == A1.y, FREQ, 1 - FREQ)]
  
  # Calculate PCs using dot product: sum(allele_freq * SNP_weight) for each PC
  ref_PC <- merged[, lapply(.SD, function(w) sum(w * FREQ, na.rm = TRUE)), 
                   .SDcols = patterns("^PC")]
  
  gwas_pc_all <- rbind(gwas_pc_all, data.frame(group = i,
                                             ref_PC))
}

ref_pc_all$Population <- ref_pc_all$group
ref_pc_all$GWAS <- NA

gwas_pc_all$GWAS <- gwas_pc_all$group
gwas_pc_all$GWAS[gwas_pc_all$GWAS == 'BMI_UKB'] <- "UKB (EUR)"
gwas_pc_all$GWAS[gwas_pc_all$GWAS == 'BMI_BBJ'] <- "BBJ (EAS)"
gwas_pc_all$GWAS[gwas_pc_all$GWAS == 'BMI_UGR'] <- "UGR (AFR)"
gwas_pc_all$GWAS[gwas_pc_all$GWAS == 'BMI_MVP_AFR'] <- "MVP (AFR)"
gwas_pc_all$Population <- NA

ref_pc_all$type <- 'Reference'
gwas_pc_all$type <- 'GWAS'

pc_all <- rbind(ref_pc_all, gwas_pc_all)

png('~/oliverpainfel/Analyses/crosspop/plots/genetic_distance.png', units = 'px', width = 2000, height = 1500, res = 300)

ggplot(pc_all, aes(x=PC1, y = PC2)) +
  geom_point(data=pc_all[pc_all$type == 'Reference',], aes(x=PC1, y = PC2, fill = Population), size = 5, shape = 21) +
  geom_point(data=pc_all[pc_all$type == 'GWAS',], aes(x=PC1, y = PC2, colour = GWAS), size = 4, shape = 3, stroke = 1.5) +
  theme_half_open() +
  background_grid()

dev.off()

# Perfect. UKB matches EUR, BBJ matches EAS, and UGR matches AFR, while MVP sits close to AFR.

Estimate ancestry proportions

This involves using Florian Privé’s bigsnpr package and UKB reference data to calculate principal components and then estimate ancestral proportions.
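To convey the underlying idea of snp_ancestry_summary, the input allele frequencies are modelled as a mixture of the reference-group frequencies, with non-negative proportions that sum to one. The sketch below is an illustration only (the bigsnpr implementation works in the projected PC space and applies the shrinkage correction used in the full code below); X and y are hypothetical inputs.

# Illustration only: estimate mixture proportions by constrained least squares.
# X: matrix of (projected) reference-group frequencies, one column per group
# y: vector of (projected) GWAS allele frequencies
library(quadprog)

estimate_mixture <- function(X, y) {
  K <- ncol(X)
  Dmat <- crossprod(X)           # X'X
  dvec <- drop(crossprod(X, y))  # X'y
  # Constraints: proportions sum to 1 (equality), then each proportion >= 0
  Amat <- cbind(rep(1, K), diag(K))
  bvec <- c(1, rep(0, K))
  sol <- solve.QP(Dmat, dvec, Amat, bvec, meq = 1)
  setNames(sol$solution, colnames(X))
}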

Show code
library(bigsnpr)
library(dplyr)

DIR <- "~/oliverpainfel/Data/bigsnpr"  # you can replace by e.g. "data" or "tmp-data"

# /!\ This downloads 850 Mb (each)
all_freq <- bigreadr::fread2(
  runonce::download_file("https://figshare.com/ndownloader/files/31620968",
                         dir = DIR, fname = "ref_freqs.csv.gz"))
projection <- bigreadr::fread2(
  runonce::download_file("https://figshare.com/ndownloader/files/31620953",
                         dir = DIR, fname = "projection.csv.gz"))

# Read in allele frequencies for each reference population
ref_rds <- NULL
for(i in 1:22){
  ref_rds <- rbind(ref_rds, 
                   readRDS(paste0('~/oliverpainfel/Software/MyGit/GenoPred/pipeline/resources/data/ref/ref.chr', i, '.rds')))
}

ref_pop_freq <- list()
for(i in c('EUR','EAS','AFR','AMR','MID','CSA')){
  ref_pop_freq[[i]] <- ref_rds[, c('CHR','SNP','BP_GRCh37','A1','A2',paste0('REF.FRQ.',i)), with=F]
  names(ref_pop_freq[[i]]) <-c('chr','rsid','pos','a1','a0','freq')
}

# Read in GWAS allele frequencies for BMI
gwas_list_main <- fread('/users/k1806347/oliverpainfel/Data/ukb/GenoPred/configs/crosspop/gwas_list.txt')
gwas_list_main <- gwas_list_main[grepl('BMI', gwas_list_main$name),]

gwas_list_mvp <- fread('/users/k1806347/oliverpainfel/Data/ukb/GenoPred/configs/crosspop/gwas_list_mvp.txt')
gwas_list_mvp <- gwas_list_mvp[grepl('BMI', gwas_list_mvp$name),]
gwas_list_mvp <- gwas_list_mvp[gwas_list_mvp$population == 'AFR',]

gwas_list<-rbind(gwas_list_main, gwas_list_mvp)

outdir <- '/users/k1806347/oliverpainfel/Data/ukb/GenoPred/output'
gwas_freq <- list()
for(i in 1:nrow(gwas_list)){
  ss_i <- fread(paste0(
            outdir,
            '/reference/gwas_sumstat/',
            gwas_list$name[i],
            '/',
            gwas_list$name[i],
            '-cleaned.gz'
          ))
  
  gwas_freq[[gwas_list$name[i]]] <- ss_i[, c('CHR','BP','SNP','A2','A1','FREQ'), with = F]
  names(gwas_freq[[gwas_list$name[i]]]) <- c("chr", "pos", 'rsid', "a0", "a1", "freq")
}

# Harmonise the data
gwas_freq_matched <- list()
for(i in 1:nrow(gwas_list)){
  gwas_freq_matched[[gwas_list$name[i]]] <- snp_match(
    mutate(gwas_freq[[gwas_list$name[i]]], chr = as.integer(chr), beta = 1),
    all_freq[1:5]
  ) %>%
    mutate(freq = ifelse(beta < 0, 1 - freq, freq))
}

ref_pop_freq_matched <- list()
for(i in c('EUR','EAS','AFR','AMR','MID','CSA')){
  ref_pop_freq_matched[[i]] <- snp_match(
    mutate(ref_pop_freq[[i]], chr = as.integer(chr), beta = 1),
    all_freq[1:5]
  ) %>%
    mutate(freq = ifelse(beta < 0, 1 - freq, freq))
}

# Infer ancestry with shrinkage (https://privefl.github.io/bigsnpr/articles/ancestry.html)
correction <- c(1, 1, 1, 1.008, 1.021, 1.034, 1.052, 1.074, 1.099,
                1.123, 1.15, 1.195, 1.256, 1.321, 1.382, 1.443)

gwas_ancestry <- list()
for(i in 1:nrow(gwas_list)){
  res <- snp_ancestry_summary(
    freq = gwas_freq_matched[[gwas_list$name[i]]]$freq,
    info_freq_ref = all_freq[gwas_freq_matched[[gwas_list$name[i]]]$`_NUM_ID_`, -(1:5)],
    projection = projection[gwas_freq_matched[[gwas_list$name[i]]]$`_NUM_ID_`, -(1:5)],
    correction = correction
  )
  
  # Group similar populations
  group <- colnames(all_freq)[-(1:5)]
  group[group %in% c("Scandinavia", "United Kingdom", "Ireland")]   <- "Europe (North West)"
  group[group %in% c("Europe (South East)", "Europe (North East)")] <- "Europe (East)"
  grp_fct <- factor(group, unique(group))
  
  gwas_ancestry[[gwas_list$name[i]]] <- tapply(res, grp_fct, sum)
}

ref_ancestry <- list()
for(i in c('EUR','EAS','AFR','AMR','MID','CSA')){
  res <- snp_ancestry_summary(
    freq = ref_pop_freq_matched[[i]]$freq,
    info_freq_ref = all_freq[ref_pop_freq_matched[[i]]$`_NUM_ID_`, -(1:5)],
    projection = projection[ref_pop_freq_matched[[i]]$`_NUM_ID_`, -(1:5)],
    correction = correction
  )
  
  # Group similar populations
  group <- colnames(all_freq)[-(1:5)]
  group[group %in% c("Scandinavia", "United Kingdom", "Ireland")]   <- "Europe (North West)"
  group[group %in% c("Europe (South East)", "Europe (North East)")] <- "Europe (East)"
  grp_fct <- factor(group, unique(group))
  
  ref_ancestry[[i]] <- tapply(res, grp_fct, sum)
}

# Plot the results
gwas_ancestry_table <- do.call(cbind, gwas_ancestry)
colnames(gwas_ancestry_table) <- c("UKB (EUR)", "BBJ (EAS)", "UGR (AFR)", "MVP (AFR)")
gwas_ancestry_table_melt <- melt(gwas_ancestry_table)

gwas_ancestry_table_melt<-gwas_ancestry_table_melt[gwas_ancestry_table_melt$value >= 0.02,]

library(scales)  # for percent_format()

png('~/oliverpainfel/Analyses/crosspop/plots/gwas_ancestry.png', units = 'px', width = 2000, height = 1500, res = 300)
ggplot(gwas_ancestry_table_melt, aes(x = Var2, y = value, fill = Var1)) +
  geom_bar(stat = "identity", colour = "black", size = 0.2) +
  geom_text(data = subset(gwas_ancestry_table_melt, value >= 0.02),
            aes(label = scales::percent(value, accuracy = 1)),
            position = position_stack(vjust = 0.5),
            size = 3) +
  labs(x = "GWAS", y = "Proportion", fill = "Population",
       title = "Ancestry proportions in BMI GWAS") +
  theme_half_open() +
  theme(axis.text.x = element_text(angle = -45, hjust = 1))
dev.off()

write.csv(gwas_ancestry_table, '~/oliverpainfel/Analyses/crosspop/plots/gwas_ancestry.csv')

ref_ancestry_table <- do.call(cbind, ref_ancestry)
ref_ancestry_table_melt <- melt(ref_ancestry_table)

ref_ancestry_table_melt<-ref_ancestry_table_melt[ref_ancestry_table_melt$value >= 0.02,]

png('~/oliverpainfel/Analyses/crosspop/plots/ref_ancestry.png', units = 'px', width = 2000, height = 1500, res = 300)
ggplot(ref_ancestry_table_melt, aes(x = Var2, y = value, fill = Var1)) +
  geom_bar(stat = "identity", colour = 'black', size = 0.2) +
  geom_text(data = subset(ref_ancestry_table_melt, value >= 0.02),
            aes(label = scales::percent(value, accuracy = 1)),
            position = position_stack(vjust = 0.5),
            size = 3) +
  labs(x = "Reference Label", y = "Proportion", fill = 'Population', title = 'Ancestry proportions in 1KG+HGDP reference') +
  theme_half_open() +
  theme(axis.text.x = element_text(angle = 45, hjust = 1))
dev.off()

write.csv(ref_ancestry_table, '~/oliverpainfel/Analyses/crosspop/plots/ref_ancestry.csv')

gwas_ancestry_table_melt$Type <- 'GWAS'
ref_ancestry_table_melt$Type <- 'Reference'

both<-rbind(gwas_ancestry_table_melt, ref_ancestry_table_melt)
png('~/oliverpainfel/Analyses/crosspop/plots/ancestry_composition.png', units = 'px', width = 3000, height = 1500, res = 300)

ggplot(both, aes(x = Var2, y = value, fill = Var1)) +
  geom_bar(stat = "identity", colour = 'black', size = 0.2) +
  geom_text(data = subset(both, value >= 0.02),
            aes(label = scales::percent(value, accuracy = 1)),
            position = position_stack(vjust = 0.5),
            size = 3) +
  labs(x = NULL, y = "Proportion", fill = 'Population') +
  facet_grid(. ~ Type, scales = 'free_x') +
  theme_half_open() +
  theme(axis.text.x = element_text(angle = 45, hjust = 1)) +
  panel_border()
dev.off()

BridgePRS

The code below is incomplete due to current compatibility issues between BridgePRS and GenoPred. Work to incorporate BridgePRS into GenoPred is ongoing.

Show code
######
# gwas_list
######

library(data.table)

# Subset original gwas_list to include selected traits
gwas_list<-fread('/users/k1806347/oliverpainfel/Data/ukb/GenoPred/configs/crosspop/gwas_list_all.txt')
pheno<-gsub('_.*','', gwas_list$name)
selected_traits<-fread('/users/k1806347/oliverpainfel/Analyses/crosspop/trait_subset.txt', header=F)$V1
gwas_list<-gwas_list[pheno %in% selected_traits,]
gwas_list$label<-paste0('"', gwas_list$label, '"')

gwas_list<-gwas_list[grepl('BMI', gwas_list$name),]

write.table(
  gwas_list, '/users/k1806347/oliverpainfel/Data/ukb/GenoPred/configs/crosspop/gwas_list_bridge.txt', 
  col.names = T, 
  row.names = F, 
  quote = F)

######
# gwas_groups
######

gwas_groups_eas<-data.frame(
  name=paste0(selected_traits, '_UKB_BBJ'),
  gwas=sapply(selected_traits, function(x) paste0(x,'_UKB,',x,'_BBJ')),
  label=paste0('"', selected_traits, " (UKB+BBJ)", '"')
)

gwas_groups_afr<-data.frame(
  name=paste0(selected_traits, '_UKB_UGR'),
  gwas=sapply(selected_traits, function(x) paste0(x,'_UKB,',x,'_UGR')),
  label=paste0('"', selected_traits, " (UKB+UGR)", '"')
)

gwas_groups<-rbind(gwas_groups_eas, gwas_groups_afr)

gwas_groups<-gwas_groups[grepl('BMI', gwas_groups$name),]

write.table(gwas_groups, '/users/k1806347/oliverpainfel/Data/ukb/GenoPred/configs/crosspop/gwas_groups_bridge.txt', col.names = T, row.names = F, quote = F)

######
# config
######

config<-c(
  "outdir: /users/k1806347/oliverpainfel/Data/ukb/GenoPred/output",
  "config_file: /users/k1806347/oliverpainfel/Data/ukb/GenoPred/configs/crosspop/config_bridgeprs.yaml",
  "gwas_list: /users/k1806347/oliverpainfel/Data/ukb/GenoPred/configs/crosspop/gwas_list_bridge.txt",
  "target_list: /users/k1806347/oliverpainfel/Data/ukb/GenoPred/configs/basic/target_list.txt",
  "gwas_groups: /users/k1806347/oliverpainfel/Data/ukb/GenoPred/configs/crosspop/gwas_groups_bridge.txt",
  "pgs_methods: ['bridgeprs']",
  "cores_prep_pgs: 10", 
  "cores_target_pgs: 50"
)

write.table(config, '/users/k1806347/oliverpainfel/Data/ukb/GenoPred/configs/crosspop/config_bridgeprs.yaml', col.names = F, row.names = F, quote = F)

Run pipeline

snakemake \
  --profile slurm \
  --use-conda \
  --configfile=/users/k1806347/oliverpainfel/Data/ukb/GenoPred/configs/crosspop/config_bridgeprs.yaml \
  target_pgs  -n
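
Note that the -n flag tells snakemake to perform a dry run, only listing the jobs it would execute; remove it to actually run the pipeline once the BridgePRS compatibility issues are resolved.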