Run null simulations

Goal

Run null simulations to show trans-pco and the other two methods, i.e. minP and PC1, are well-calibrated.

Run three association tests on simulated null z-scores

First, load required packages.

rm(list = ls())
library(mvtnorm)
source("/home/liliw1/Trans/plot/theme_my_pub.R")

Give input files and parameters. To make it simple (and fast), here I simulate only \(10^3\) simulations (in the paper, \(10^7\) tests were simulated).

n_sim <- 10^3

# I/O & paras -----
dir_pco <- 'simulation/script_lambda0.1/'
file_Sigma <- '/project2/xuanyao/llw/simulation_lambda0.1/new_Sigma/Sigma-new_DGN_module29_K101.rds'

Read files and source pco test scripts.

# read files -----
# source pco test
source(paste0(dir_pco, "ModifiedPCOMerged.R"))
source(paste0(dir_pco, "liu.R"))
source(paste0(dir_pco, "liumod.R"))
source(paste0(dir_pco, "davies.R"))
dyn.load(paste0(dir_pco, "qfc.so"))
source(paste0(dir_pco, "ModifiedSigmaOEstimate.R"))

Sigma <- as.matrix(readRDS(file_Sigma))
K <- dim(Sigma)[1]

Simulate null z-scores using the given \(\Sigma\) of a gene module from real dataset.

# simulate null z-scores -----
set.seed(123)

p_null_all  <- list()
z_null <- rmvnorm(n_sim, rep(0, K), Sigma)

Run pco test and calculate p-values,

# prepare eigenvalues and eigenvectors as input for three methods -----
SigmaO <- ModifiedSigmaOEstimate(Sigma)
eigen_res <- eigen(Sigma)
eigen_lamb <- eigen_res$values
eigen_vec <- eigen_res$vectors

# PCO
p_null_all$'p.null.PCO' <- ModifiedPCOMerged(
  Z.mat = z_null, Sigma = Sigma, SigmaO = SigmaO
) |> as.numeric()

## PCMinP done. 
## PCFisher done. 
## PCLC done. 
## WI done. 
## Wald done. 
## VC done. 
## PCO done.

cat("PCO done. \n\n")

## PCO done.

Run pc1 test and calculate p-values,

# PC1
PC1 <- z_null %*% eigen_vec[, 1]
p_null_all$'p.null.PC1' <- 2*pnorm(-abs(PC1/sqrt(eigen_lamb[1])))|> as.numeric()

cat("PC1 done. \n\n")

## PC1 done.

Run minp test and calculate p-values,

# univariate minp
p_null_all$'p.null.minp' <- apply(z_null, 1, function(x) min(1-pchisq(x^2, 1))*length(x) )

cat("minp done. \n\n")

## minp done.

Finally, take a look at the calculated p-values on null z’s.

str(p_null_all)

## List of 3
##  $ p.null.PCO : num [1:1000] 0.867 0.561 0.287 0.312 0.724 ...
##  $ p.null.PC1 : num [1:1000] 0.349 0.633 0.771 0.162 0.679 ...
##  $ p.null.minp: num [1:1000] 1.313 0.871 1.747 0.457 1.101 ...

QQ-plot of null p-values by three association tests

First, load required packages.

rm(list = ls())
library(tidyverse)

## ── Attaching packages ─────────────────────────────────────── tidyverse 1.3.1 ──

## ✔ ggplot2 3.3.6     ✔ purrr   0.3.4
## ✔ tibble  3.1.7     ✔ dplyr   1.0.7
## ✔ tidyr   1.2.0     ✔ stringr 1.4.0
## ✔ readr   2.1.2     ✔ forcats 0.5.1

## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()

source("/home/liliw1/Trans/plot/theme_my_pub.R")

Files of simulated null p-values,

# I/O & paras -----
file_dat_null <- "/project2/xuanyao/llw/simulation_lambda0.1/new_Sigma/simulation.null.lambda0.1.K101.rds"
ci_level <- 0.95

# read files -----
p_null_all <- readRDS(file_dat_null)



# organize data -----
## reset obs p-values that are 0 to a fixed value, here I use the min non-zero p/10 -----
input <- as_tibble(p_null_all)
input[input == 0] <- min(input[input != 0])/10

## number of samples -----
n <- nrow(input)

## expected null p -----
expected <- seq(1, n) / (n+1)
lexp <- -log10(expected)

## order statistic of null p -----
ci_l <- -log10( qbeta(p = (1 - ci_level) / 2, shape1 = 1:n, shape2 = n:1) )
ci_r <- -log10( qbeta(p = (1 + ci_level) / 2, shape1 = 1:n, shape2 = n:1) )

## obs -----
observed <- apply(input, 2, sort) %>% as.data.frame()
lobs <- -log10(observed)


## take only a subset of null p's, to save image space -----
ind_sub <- c(
  1:sum(lexp > 4),
  seq(from = sum(lexp > 4), to = sum(lexp > 2), length.out = 2000) %>% ceiling(),
  seq(from = sum(lexp > 2), to = n, length.out = 2000) %>% ceiling()
)


# data for plt -----
df_plt <- cbind(data.frame(x = lexp, ci_l = ci_l, ci_r = ci_r), lobs) %>%
  slice(ind_sub) %>%
  pivot_longer(-c(x, ci_l, ci_r), names_to = "Type", values_to = "y")

## set group order in plt -----
group_order <- c("p.null.PCO", "p.null.PC1", "p.null.minp")
group_label <- c("Trans-PCO", "PC1", "MinP")

group_order <- if(is.null(group_order)) unique(df_plt$Type) else group_order
group_label <- if(is.null(group_label)) group_order else group_label
df_plt$Type <- factor(df_plt$Type, levels = group_order, labels = group_label)


# QQ-plot of null p-values by three association tests -----
base_plt <- ggplot(df_plt, aes(x = x, y = y, group = Type)) +
  geom_ribbon(aes(ymin = ci_l, ymax = ci_r), fill = "#e5e5e5", color = "#e5e5e5") +
  geom_abline(slope = 1, intercept = 0, color = "#595959", size = 0.7) +
  geom_point(aes(color = Type), size = 0.5) +
  labs(x = bquote(Expected -log[10]~italic((P))),
       y = bquote(Observed -log[10]~italic((P))),
       color = NULL)
base_plt +
  scale_color_manual(
    values = c("#85192d", "#0028a1", "#e89c31"),
    guide = guide_legend(override.aes = list(size = 2))
  ) +
  theme_my_pub() +
  theme(
    panel.grid.major.x = element_line(linetype = "dotted"),
    panel.grid.major.y = element_line(linetype = "dotted"),
    
    legend.background = element_blank(),
    legend.position = "right",
    
    axis.title = element_text(size = 14), 
    axis.text = element_text(colour = "black", size = 12)
  )

Session info

sessionInfo()

## R version 4.1.2 (2021-11-01)
## Platform: x86_64-conda-linux-gnu (64-bit)
## Running under: Ubuntu 20.04.3 LTS
## 
## Matrix products: default
## BLAS/LAPACK: /scratch/midway2/liliw1/conda_env/rstudio-server/lib/libopenblasp-r0.3.18.so
## 
## locale:
##  [1] LC_CTYPE=en_US.UTF-8       LC_NUMERIC=C              
##  [3] LC_TIME=en_US.UTF-8        LC_COLLATE=en_US.UTF-8    
##  [5] LC_MONETARY=en_US.UTF-8    LC_MESSAGES=en_US.UTF-8   
##  [7] LC_PAPER=en_US.UTF-8       LC_NAME=C                 
##  [9] LC_ADDRESS=C               LC_TELEPHONE=C            
## [11] LC_MEASUREMENT=en_US.UTF-8 LC_IDENTIFICATION=C       
## 
## attached base packages:
## [1] stats     graphics  grDevices utils     datasets  methods   base     
## 
## other attached packages:
##  [1] forcats_0.5.1   stringr_1.4.0   dplyr_1.0.7     purrr_0.3.4    
##  [5] readr_2.1.2     tidyr_1.2.0     tibble_3.1.7    ggplot2_3.3.6  
##  [9] tidyverse_1.3.1 mvtnorm_1.1-3  
## 
## loaded via a namespace (and not attached):
##  [1] Rcpp_1.0.8.3     lubridate_1.8.0  assertthat_0.2.1 digest_0.6.29   
##  [5] utf8_1.2.2       R6_2.5.1         cellranger_1.1.0 backports_1.4.1 
##  [9] reprex_2.0.1     evaluate_0.14    httr_1.4.2       highr_0.9       
## [13] pillar_1.7.0     rlang_1.0.3      readxl_1.3.1     rstudioapi_0.13 
## [17] jquerylib_0.1.4  rmarkdown_2.11   labeling_0.4.2   munsell_0.5.0   
## [21] broom_0.7.12     compiler_4.1.2   modelr_0.1.8     xfun_0.29       
## [25] pkgconfig_2.0.3  htmltools_0.5.2  tidyselect_1.1.1 fansi_1.0.3     
## [29] crayon_1.5.1     tzdb_0.2.0       dbplyr_2.1.1     withr_2.5.0     
## [33] grid_4.1.2       jsonlite_1.7.3   gtable_0.3.0     lifecycle_1.0.1 
## [37] DBI_1.1.3        magrittr_2.0.3   scales_1.2.0     cli_3.3.0       
## [41] stringi_1.7.6    farver_2.1.1     fs_1.5.2         xml2_1.3.3      
## [45] bslib_0.3.1      ellipsis_0.3.2   generics_0.1.2   vctrs_0.4.1     
## [49] tools_4.1.2      glue_1.6.2       hms_1.1.1        fastmap_1.1.0   
## [53] yaml_2.2.2       colorspace_2.0-3 rvest_1.0.2      knitr_1.37      
## [57] haven_2.4.3      sass_0.4.0