Goal

Annotate cis genes of the identified trans signals to help understand the mechanisms of trans regulation.

Find cis genes of trans signals

I looked at what genes are nearest or near (<1Mb) to each trans signals.

# I/O & paras -----
file_qtl <- '/project2/xuanyao/llw/DGN_no_filter_on_mappability/FDR/signals.chr.module.perm10.fdr10.txt'
file_gene_meta <- '/project2/xuanyao/data/mappability/gencode.v19.annotation.table.txt'
dis_cis <- 1e+6


# read files -----
qtl <- fread(file_qtl, header = FALSE, col.names = c("signal", "p", "q"))
gene_meta <- fread(file_gene_meta, header = TRUE)


# organize data -----
## extract signals' module, chr, pos -----
qtl <- qtl %>%
  separate("signal",
           into = c("module", "chr", "pos"),
           sep = ":", remove = FALSE, convert = TRUE) %>%
  unite(col = "SNP",
        c("chr", "pos"),
        sep = ":", remove = FALSE)

## extract protein_coding, lincRNA, auto-chr genes -----
gene_meta <- gene_meta %>%
  filter(Class %in% c("protein_coding", "lincRNA") & Chromosome %in% paste0("chr",1:22)) %>%
  separate("Chromosome", c(NA, "chr"), sep = "chr", remove = FALSE)


# nearest and cis genes of each signal, with distance -----
cis_gene_meta <- sapply(1:nrow(qtl), function(x){
  tmp_qtl = qtl[x, ]
  tmp_cis_gene_meta = gene_meta %>%
    filter(chr %in% tmp_qtl$chr) %>%
    mutate("dis" = abs(tmp_qtl$pos - Start)) %>%
    filter(dis < dis_cis/2) %>%
    arrange(dis)
  c(
    tmp_cis_gene_meta$GeneSymbol[1],
    tmp_cis_gene_meta$dis[1],
    paste(tmp_cis_gene_meta$GeneSymbol, collapse = ";"),
    paste(tmp_cis_gene_meta$dis, collapse = ";")
  )
})
cis_gene_meta <- as.data.table(t(cis_gene_meta))
colnames(cis_gene_meta) <- c("nearest_gene", "nearest_dis", "near_genes", "near_dis")


## add cis gene info to signals -----
qtl <- bind_cols(qtl, cis_gene_meta)

Let’s look at one example.

qtl <- fread('/project2/xuanyao/llw/DGN_no_filter_on_mappability/postanalysis/signal_cis_genes.txt')

Columns are,

colnames(qtl)
##  [1] "signal"       "module"       "SNP"          "chr"          "pos"         
##  [6] "p"            "q"            "nearest_gene" "nearest_dis"  "near_genes"  
## [11] "near_dis"

The last four columns give the nearest and near genes, and the distance between the trans-eQTL on each row and corresponding genes. For example,

knitr::kable(qtl[1, ])
signal module SNP chr pos p q nearest_gene nearest_dis near_genes near_dis
module1:4:6696460 module1 4:6696460 4 6696460 0 0 S100P 1664 S100P;AC093323.1;RP11-539L10.2;MRFAP1L1;AC093323.3;BLOC1S4;RP11-539L10.3;MRFAP1;KIAA0232;MAN2B2;TBC1D14;CCDC96;TADA2B;GRPEL1;PPP2R2C;RP11-367J11.3;WFS1;RP11-1406H17.1;RP11-586D19.2;RP11-586D19.1;SORCS2 1664;2678;7285;12968;21282;21382;24008;54642;86642;119558;214509;346119;347166;364173;374155;399838;424884;455292;492405;494000;497805

The trans-eQTL “4:6696460” (with target module 1) is nearest to gene S100P. The last column also gives all genes within 1Mb of this signal.

Annotate transcription factors and interesting genes near trans signals

I wanted to look at what transcription factors or master trans regulator genes are near the identified trans-eQTLs. And then annotate them on a gene manhattan plot.

# I/O & paras -----
file_signal_cis_genes <- '/project2/xuanyao/llw/DGN_no_filter_on_mappability/postanalysis/signal_cis_genes.txt'
file_chr_pos <- '/scratch/midway2/liliw1/sig_module_chr/chromosome_location.rds'


# read files -----
signal_cis_genes <- fread(file_signal_cis_genes, header = TRUE)
chr_pos <- readRDS(file_chr_pos)


# one p for one snp across modules -----
snp_cis_genes <- signal_cis_genes %>%
  group_by(SNP, chr, pos, nearest_gene, near_genes, nearest_dis, near_dis) %>%
  summarise('p' = min(p),
            'n_module' = n()) %>%
  mutate('-logp' = -log10(p)) %>%
  ungroup()

snp_cis_genes <- separate_rows(
  snp_cis_genes,
  near_genes, near_dis,
  sep = ";", convert = TRUE
)

# change 0 p -----
snp_cis_genes[is.infinite(snp_cis_genes$`-logp`), "-logp"] <-
  (max(snp_cis_genes$`-logp`[!is.infinite(snp_cis_genes$`-logp`)]) + 1) %>% ceiling()


# genes of interest -----
gene_of_interest <- c("BAZ2B", "NFKBIA", "PLAGL1", "NFE2", "IKZF1", "KLF1", "KLF14", "NFKB1", "NFKBIA", "ZNF229", "BAZ2B", #TF
                      "ARHGEF3", "SENP7")


# nearest & near gene of interest col -----
snp_cis_genes <- mutate(
  snp_cis_genes,
  "if_gene_of_interest_nearest" = nearest_gene %in% !!gene_of_interest,
  "if_gene_of_interest_near" = near_genes %in% !!gene_of_interest,
)

plt_dat <- snp_cis_genes %>%
  left_join(chr_pos, by = c("chr" = "CHR")) %>%
  mutate(def_pos = pos + tot)


# same annotation, pick the smallest p ----
annot_gene <- filter(plt_dat, if_gene_of_interest_near) %>%
  mutate("label_gene_of_interest" = near_genes) %>%
  group_by(label_gene_of_interest) %>%
  summarise(`-logp` = max(`-logp`),
            chr = chr, pos = pos, def_pos = def_pos) %>%
  ungroup() %>%
  distinct(label_gene_of_interest, .keep_all = TRUE)


annot_snp <- filter(plt_dat, if_gene_of_interest_near) %>%
  mutate("label_gene_of_interest" = near_genes)

These are the genes of interest to be annotated,

knitr::kable(annot_gene)
label_gene_of_interest -logp chr pos def_pos
ARHGEF3 17.000000 3 56848953 549073711
BAZ2B 7.974534 2 160427158 409601840
IKZF1 17.000000 7 50258234 1282856609
KLF14 7.640718 7 130536491 1363134866
NFE2 8.204420 12 54685880 2004114576
NFKB1 8.915563 4 103444474 793508162
NFKBIA 17.000000 14 35372518 2233699109
PLAGL1 17.000000 6 144278562 1205967141
SENP7 17.000000 3 100831857 593056615
ZNF229 17.000000 19 44889660 2702138865

Plot these genes on the manhattan plot,

# Manhattan plot of cis TF's -----
base_plt <- ggplot(plt_dat, aes(x = def_pos, y = `-logp`)) +
  geom_rect(
    aes(xmin = tot, xmax = xmax,
        ymin = -Inf, ymax = Inf,
        fill = factor(chr))
  ) +
  geom_point(aes(color = factor(chr)), alpha = 0.5, size = 1, shape = 16) +
  geom_point(data = annot_snp,
             color = "#e2bebe", alpha = 0.5, size = 1) +
  geom_point(data = annot_gene,
             color = "#b20000", fill = "#b20000", shape=23, size = 3) +
  geom_text_repel(data = annot_gene,
                  aes(label = label_gene_of_interest),
                  segment.colour="black",
                  size = 4,
                  min.segment.length = 0,
                  max.overlaps = 5,
                  nudge_x = -0.5,
                  nudge_y = 30,
                  box.padding = 1,
                  segment.curvature = -0.1,
                  segment.ncp = 5,
                  segment.angle = 20,
                  direction = "y",
                  hjust = "left",
                  segment.linetype = 6,
                  arrow = arrow(length = unit(0.015, "npc"))
  ) +
  labs(x = "Chromosome", y = quote(-Log[10](P)))

base_plt +
  scale_x_continuous(
    limits = c(0, max(chr_pos$center)*2 - max(chr_pos$tot)),
    label = chr_pos$CHR,
    breaks = chr_pos$center,
    expand = c(0, 0)
  ) +
  scale_y_continuous(expand = c(0, 0), limits = c(6.5, 21) ) +
  scale_color_manual(values = rep(c("#7e7e7e", "#bfbfbf"), 22), guide = "none") +
  scale_fill_manual(values = rep(c("#efefef", "#ffffff"), 22), guide = "none") +
  theme_my_pub()

Session info

sessionInfo()
## R version 4.1.2 (2021-11-01)
## Platform: x86_64-conda-linux-gnu (64-bit)
## Running under: Ubuntu 20.04.3 LTS
## 
## Matrix products: default
## BLAS/LAPACK: /scratch/midway2/liliw1/conda_env/rstudio-server/lib/libopenblasp-r0.3.18.so
## 
## locale:
##  [1] LC_CTYPE=en_US.UTF-8       LC_NUMERIC=C              
##  [3] LC_TIME=en_US.UTF-8        LC_COLLATE=en_US.UTF-8    
##  [5] LC_MONETARY=en_US.UTF-8    LC_MESSAGES=en_US.UTF-8   
##  [7] LC_PAPER=en_US.UTF-8       LC_NAME=C                 
##  [9] LC_ADDRESS=C               LC_TELEPHONE=C            
## [11] LC_MEASUREMENT=en_US.UTF-8 LC_IDENTIFICATION=C       
## 
## attached base packages:
## [1] stats     graphics  grDevices utils     datasets  methods   base     
## 
## other attached packages:
##  [1] ggrepel_0.9.1     forcats_0.5.1     stringr_1.4.0     dplyr_1.0.7      
##  [5] purrr_0.3.4       readr_2.1.2       tidyr_1.2.0       tibble_3.1.7     
##  [9] ggplot2_3.3.6     tidyverse_1.3.1   data.table_1.14.2
## 
## loaded via a namespace (and not attached):
##  [1] tidyselect_1.1.1 xfun_0.29        bslib_0.3.1      haven_2.4.3     
##  [5] colorspace_2.0-3 vctrs_0.4.1      generics_0.1.2   htmltools_0.5.2 
##  [9] yaml_2.2.2       utf8_1.2.2       rlang_1.0.3      jquerylib_0.1.4 
## [13] pillar_1.7.0     withr_2.5.0      glue_1.6.2       DBI_1.1.3       
## [17] dbplyr_2.1.1     modelr_0.1.8     readxl_1.3.1     lifecycle_1.0.1 
## [21] cellranger_1.1.0 munsell_0.5.0    gtable_0.3.0     rvest_1.0.2     
## [25] evaluate_0.14    labeling_0.4.2   knitr_1.37       tzdb_0.2.0      
## [29] fastmap_1.1.0    fansi_1.0.3      highr_0.9        Rcpp_1.0.8.3    
## [33] broom_0.7.12     backports_1.4.1  scales_1.2.0     jsonlite_1.7.3  
## [37] farver_2.1.1     fs_1.5.2         hms_1.1.1        digest_0.6.29   
## [41] stringi_1.7.6    grid_4.1.2       cli_3.3.0        tools_4.1.2     
## [45] magrittr_2.0.3   sass_0.4.0       crayon_1.5.1     pkgconfig_2.0.3 
## [49] ellipsis_0.3.2   xml2_1.3.3       reprex_2.0.1     lubridate_1.8.0 
## [53] assertthat_0.2.1 rmarkdown_2.11   httr_1.4.2       rstudioapi_0.13 
## [57] R6_2.5.1         compiler_4.1.2