BCPLS Marginal Analysis

Motivation

Marginal correlation analysis of proteogenomic breast cancer data is a computationally efficient method that carries the advantage of being applied genome-wide, while the graphical model spaceMap is not yet able to model genome-wide features. However, marginal analysis lacks the same modeling precision of graphical models and is prone to overfitting. The differences in feature resolution and number of features typically input to these two strategies make them difficult to compare. The following presentation compares the marginal analysis approach with spaceMap on a highly-variable subset of proteins and large copy number aberations intervals (CNA) of several megabases—expliclity using the same feature input. This comparison avoids comparing a marginal analysis with gene-level CNA input (as was done in the original BCPLS publication) with a spaceMap analysis with large genomic CNA. The outcome of this analysis clearly highlights the advantages spaceMap has over a marginal analysis when the feature input is the same.

Data import

Import the protein and the CNA data that was used to fit spaceMap to the breast cancer data associated with the BCPLS study.

library(spacemap)
data("bcpls")
attach(bcpls)
#useful for reporting enriched hubs/modules later. 
library(AnnotationDbi)
library(GO.db)
process_alias <- AnnotationDbi::Term(names(go2eg))
#read protein abundance and CNA data 
library(Biobase)
protset <- readRDS(file = "data/prot-expression-set.rds")
y <- t(exprs(protset))
colnames(y) <- yinfo$id
cnaset <- readRDS(file = "data/cna-expression-set.rds")
x <- t(exprs(cnaset))
colnames(x) <- xinfo$id

Pearson Correlation Analysis

We will compute the P-values from the Pearson correlation statistic for each CNA–protein and protein-protein pair with these function.

xyPvalues <- function(x, y, method = c("pearson", "spearman"), rdsfile) { 
  method = match.arg(arg = method, choices = c("pearson", "spearman"))
  require(foreach)
  pvals <- foreach(p = seq_len(ncol(x)), .combine = 'rbind') %:% 
    foreach(q = seq_len(ncol(y)), .combine = 'c') %dopar% { 
      cor.test(x = x[,p], y = y[,q], method = method)$p.value
    }
  colnames(pvals) <- colnames(y)
  rownames(pvals) <- colnames(x)
  saveRDS(object = pvals, file = rdsfile)
  pvals
}

yyPvalues <- function(y, method = c("pearson", "spearman"), rdsfile) { 
  method = match.arg(arg = method, choices = c("pearson", "spearman"))
  require(foreach)

  #generate lower triangle indices to iterate over
  rows <- nrow(y)  
  z <- sequence(rows)
  indices <- cbind(  row = unlist(lapply(2:rows, function(x) x:rows), use.names = FALSE),
                     col = rep(z[-length(z)], times = rev(tail(z, -1))-1))
  
  pvals <- foreach(j = seq_len(nrow(indices)), .combine = 'c') %dopar% {
    cor.test(x = y[,indices[j,"row"]], y = y[,indices[j,"col"]], method = method)$p.value
  }
  edge_list <- data.frame(from = colnames(y)[indices[,"row"]], to = colnames(y)[indices[,"col"]], pvalue = pvals)
  saveRDS(object = edge_list, file = rdsfile)
  edge_list
}

Compute the P-values for each protein-CNA pair and save them to file because this takes 20-30 minutes with 7 cores (pre-computed).

pxyfile <- "~/scratch-data/neta-bcpls/marginal/xy_pearson_protein_bcpls.rds"
stopifnot(!dir.exists(basename(pxyfile)))
library(doParallel)
cl <- makeCluster(detectCores() - 1)
registerDoParallel(cl)
pxypval <- xyPvalues(x = x, y = y, method = "pearson", rdsfile = pxyfile)

Compute the P-values for each protein–protein pair.

pyyfile <- "~/scratch-data/neta-bcpls/marginal/yy_pearson_protein_bcpls.rds"
stopifnot(!dir.exists(basename(pyyfile)))
pyypval <- yyPvalues(y = y, method = "pearson", rdsfile = pyyfile)

Read in the pre-computed P-values from file.

pxypval <- readRDS("~/scratch-data/neta-bcpls/marginal/xy_pearson_protein_bcpls.rds")
pyypval <- readRDS("~/scratch-data/neta-bcpls/marginal/yy_pearson_protein_bcpls.rds")

Form the adjacency matrices of the marginal (Pearson correlation) network. Apply the Benjamini-Hochberg correction to every (CNA–protein, protein–protein) interaction and then keep only those interactions that pass an FDR < 0.05. This method produces what we refer to hereafter as the “marginal network” from the significant protein-CNA interactions based on Pearson correlations (FDR < 0.05). Note that varying the FDR produces relatively the same number of \(y-y\) edges, while the number of \(x-y\) edges increaes dramatically with increasing FDR suggesting many of the \(x-y\) edges are not very stable.

adjMatCor <- function(xp, dyp, x, y, thresh = 0.1) { 
  
  #partition and index x--y vs y--y
  xp <- as.numeric(xp)
  yp <- as.matrix(dyp$pvalue)
  nxp <- length(xp)
  nyp <- length(yp)
  xi <- 1:nxp
  yi <- (nxp + 1):(nxp +  nyp)
  
  #global adjustment
  gp <- p.adjust(p = c(xp, yp), method = "BH")
  gyp <- gp[yi]
  gxp <- gp[xi]
  #make x--y adjacency matrix
  mgxp <- matrix(data = gxp, nrow = ncol(x), ncol = ncol(y))
  adjxy <- (mgxp <= thresh) + 0
  rownames(adjxy) <- colnames(x)
  colnames(adjxy) <- colnames(y)
  
  #make y--y adjacency matrix
  dyp$fdr <- gyp
  make_yy_adj_matrix <- function(el) { 
    ay <- matrix(data = 0L, nrow = ncol(y), ncol = ncol(y))
    rownames(ay) <- colnames(y)
    colnames(ay) <- colnames(y)
    for(j in seq_len(nrow(el))) { 
      if (el[j,"fdr"] <= thresh) { 
        ay[el[j,"from"], el[j,"to"]] <- 1L
        ay[el[j,"to"], el[j,"from"]] <- 1L
      }
    }
    ay
  }
  adjyy <- make_yy_adj_matrix(el = dyp) 
  
  #return list of adjacency matrices
  list(yy = adjyy, xy = adjxy)
}
marg <- adjMatCor(xp = pxypval, dyp = pyypval, x = x, y = y, thresh = 0.05)
#check to make feature labels are correct.
stopifnot(all(colnames(x) == xinfo$id), all(colnames(y) == yinfo$id))

Compare marginal vs spaceMap networks

Now we will compare the marginal network based on the Pearson correlation and spaceMap-based CNA–protein networks for breast cancer.

Form the marginal network.

imarg <- adj2igraph(yy = marg$yy, xy = marg$xy, yinfo = yinfo, xinfo = xinfo)

Import spaceMap’s Boot.Vote network.

pgraphml <- "/Users/bioinformatics/repos/neta-bcpls/neta/spacemap-prot-boot-vote.graphml"
stopifnot(file.exists(pgraphml))
library(igraph)
ig <- read_graph(file = pgraphml, format = "graphml")

Subset the marginal network and spaceMap Boot.Vote network into their \(y-y\) and \(x-y\) subnetworks to aid the comparisons. Intersect the two networks to see what is in common.

imargxy <- igraph::subgraph.edges(imarg, eids = E(imarg)[levels %in% "x-y"])
igxy <- igraph::subgraph.edges(ig, eids = E(ig)[levels %in% "x-y"])
imargyy <- igraph::subgraph.edges(imarg, eids = E(imarg)[levels %in% "y-y"])
igyy <- igraph::subgraph.edges(ig, eids = E(ig)[levels %in% "y-y"])
# what is common
iixy <- igraph::intersection(igxy, imargxy)
iiyy <- igraph::intersection(igyy, imargyy)

There are 2103 CNA–protein edges in the marginal network compared to 585 in spaceMap’s network. The two networks have 163 CNA–protein edges in common. There are 535 protein–protein edges in the marginal network compared to 954 in spaceMap’s network. The two networks have 10 proteins–protein edges in common. Next we compare the CNA hub degree distributions between the two networks.

#plot CNA hub degree distributions. 
library(ggplot2)
marg_cna_deg <- degree(imargxy)[V(imargxy)[levels %in% "x"]]
ig_cna_deg <- degree(igxy)[V(igxy)[levels %in% "x"]]
degcna <- data.frame(degree = c(marg_cna_deg, ig_cna_deg),
                     network = rep(c("marginal", "spaceMap"), 
                                   times = c(length(marg_cna_deg),
                                             length(ig_cna_deg))))
ggplot(data = degcna, aes(x = degree, fill = network)) + 
  geom_histogram(binwidth = 3) + facet_grid(network ~ ., scale = "free") + 
  xlab("CNA hub degree") + 
  theme_bw()

There are 528 CNA hubs in the marginal network with 47.9 percent having only one degree. There are 15 CNA hubs in the spaceMap network with 6.7 percent having only one degree. Next, we identify the specific CNA hubs for both networks.

marg_xhubs <- as_ids(V(imargxy)[igraph::degree(imargxy) > 0 & V(imargxy)$levels %in% "x"])
ig_xhubs <- as_ids(V(igxy)[igraph::degree(igxy) > 0 & V(igxy)$levels %in% "x"])

The 15 common CNA hubs between marginal and spaceMap networks are consistent with the original BCPLS publication, which reported “5q (loss of heterozygosity (LOH) in basal; gain in luminal B), 10p (gain in basal), 12 (gain in basal), 16q (luminal A deletion), 17q (luminal B amplification), and 22q (LOH in luminal and basal)”. Notably, only 22q is absent from the spaceMap analysis and only a few small-degree CNA hubs (mean degree of 1.19) from 22q are represented by the marginal analysis. The evidence leaves open the possiblity that the the variance filter applied to the original data may have omitted proteins that are perturbed by 22q; or that the CNA hub-wise FDR filter applied in the original BCPLS study , rather than a global FDR correction, allowed the BCPLS authros to report more CNA hub regions.

Common CNA hub	spaceMap degree	marginal degree
5p12-5q11.1	25	7
5q34	129	50
8q21.2-22.1	1	2
10p15.1-15.3	86	27
11q13.5-14.1	11	4
12q21.1	4	5
15q13.1-15.1	36	3
16p12.1-12.3	21	4
16q22.1	16	10
16q22.1-22.2	72	27
17p11.2	7	4
17p11.2	5	3
17q12	52	8
17q21.32	99	26
17q23.1-23.2	21	4

There are no CNA hubs unique to spaceMap.

#as_data_frame(x = ig, what = "vertices")[setdiff(ig_xhubs, marg_xhubs),]
ucna_smap <- xinfo$alias[match(setdiff(ig_xhubs, marg_xhubs), xinfo$id)]
ucna_smap

## character(0)

The CNA hubs unique to the marginal network have generally much smaller degree than the spaceMap network and are represented across 19 chromosomes. In the following table, we list chromosome with the frequency of reported CNA hubs unique to the marginal analysis. The mean degree of the CNA hubs in the marginal network is 3.9829545, while for spaceMap it is 39. The marginal network clearly reports more CNA perturbing the genome, but the effect is more diminished.

ucna_marg <- xinfo$alias[match(setdiff(marg_xhubs, ig_xhubs), xinfo$id)]
ucna_marg_df <- data.frame(cytoband = ucna_marg, id = setdiff(marg_xhubs, ig_xhubs))
ucna_marg_tab <- table(sapply(strsplit(ucna_marg, split = "p|q"), function(x) x[1]))
ucna_marg_tab2 <- as.data.frame(ucna_marg_tab)
ucna_marg_tab2 <- ucna_marg_tab2[with(ucna_marg_tab2, order(Freq, decreasing = T)),]
colnames(ucna_marg_tab2) <- c("Chromosome", "# of CNA hubs Unique to Marginal Analysis")
mean_chr_degree <- function(chr) { mean(degree(imargxy)[as.character(ucna_marg_df$id[grep(chr, ucna_marg_df$cytoband)])]) }
ucna_marg_tab2$`Mean Degree` <- round(sapply(ucna_marg_tab2$Chromosome, mean_chr_degree),2)
knitr::kable(ucna_marg_tab2, row.names = F)

Chromosome	# of CNA hubs Unique to Marginal Analysis	Mean Degree
17	113	2.50
11	69	2.97
12	60	2.27
5	34	9.19
14	29	4.00
8	24	1.28
15	22	2.49
2	22	3.17
16	21	4.74
6	16	3.14
10	15	3.53
9	15	2.08
20	13	1.15
18	11	1.27
3	11	4.81
4	10	3.95
19	9	1.56
1	8	3.10
22	6	2.55
7	3	2.47
21	2	2.49

Marginal Hub Analysis

Rank the protein and CNA nodes, resptively.

imarg <- rankHub(ig = imarg, level = "y")
imarg <- rankHub(ig = imarg, level = "x")

Next label \(x-y\) edges as being regulated in cis or in trans.

library(GenomicRanges)
imarg <- cisTrans(ig = imarg, level = "x-y")

We report the top 50 (degree-ranked) CNA hubs for the marginal network. The marginal analysis identifies much less than cis edges than one would expect, which is similar to the spaceMap network. Further the marginal network reports a large number of redundant CNA hubs compared to the spaceMap network, where redundant is defined to be CNA hubs of very close genomic distance and share very similar edges. For example there are at least 5 redundant 17q12 hubs, 2 redundant edges for 17q25, and 3 redundant edges for 6q21.

xhubs <- reportHubs(imarg, top = 50, level = "x")

hub	# cis/ # trans	Potential # cis	cis genes
5q34 (160-170 Mb)	0 / 50	0	–
5q35.3 (180-180 Mb)	0 / 47	6	–
5q33.3-34 (160-160 Mb)	0 / 43	1	–
5q35.3 (180-180 Mb)	0 / 42	6	–
5q14.3 (88-89 Mb)	1 / 38	2	CETN3
5q14.3 (89-91 Mb)	1 / 37	2	CETN3
5q35.2-35.3 (180-180 Mb)	0 / 37	8	–
5q34 (170-170 Mb)	0 / 36	0	–
5q33.1-33.3 (150-160 Mb)	0 / 35	8	–
5q35.2 (170-180 Mb)	0 / 35	8	–
5q11.2-14.3 (59-88 Mb)	2 / 32	15	MCCC2, CETN3
5q23.2-23.3 (130-130 Mb)	0 / 34	4	–
5q14.3-23.2 (91-130 Mb)	2 / 31	12	NUDT12, CETN3
5q34-35.1 (170-170 Mb)	0 / 33	2	–
5q23.2 (130-130 Mb)	0 / 31	3	–
5q23.3-33.1 (130-150 Mb)	0 / 31	16	–
10p15.1-15.3 (0.42-4 Mb)	0 / 27	10	–
16q22.1-22.2 (68-71 Mb)	2 / 25	13	GLG1, GLG1
5q34 (170-170 Mb)	0 / 26	0	–
17q21.32 (46-46 Mb)	0 / 26	14	–
5q11.2 (52-53 Mb)	0 / 21	3	–
5q11.2 (58-59 Mb)	0 / 21	6	–
5q35.2 (170-170 Mb)	0 / 20	7	–
5q11.2 (53-56 Mb)	0 / 19	7	–
5q11.2 (57-58 Mb)	0 / 19	6	–
5q11.2 (58-58 Mb)	0 / 19	6	–
16q21-22.1 (64-68 Mb)	0 / 18	8	–
5q11.2 (56-57 Mb)	0 / 17	7	–
5q35.2 (170-170 Mb)	0 / 17	7	–
10p14-15.1 (4-12 Mb)	1 / 16	13	GATA3
5q35.1 (170-170 Mb)	0 / 16	2	–
5q35.2 (170-170 Mb)	0 / 16	7	–
17q21.32 (45-46 Mb)	0 / 16	15	–
5q11.2 (56-56 Mb)	0 / 15	7	–
16q23.1-24.3 (78-89 Mb)	2 / 13	11	GLG1, GLG1
5q11.2 (57-57 Mb)	0 / 14	6	–
5q33.3 (160-160 Mb)	0 / 13	1	–
5q35.1 (170-170 Mb)	0 / 12	2	–
12q13.2-13.3 (56-57 Mb)	4 / 7	28	KRT8, KRT18, SUOX, ZNF385A
12q13.3 (57-57 Mb)	4 / 7	19	KRT8, KRT18, SUOX, ZNF385A
17q21.32 (46-46 Mb)	4 / 7	14	MAPT, MAPT, MAPT, MAPT
5q35.1-35.2 (170-170 Mb)	0 / 10	4	–
11p15.2 (13-14 Mb)	0 / 10	1	–
16q22.1 (68-68 Mb)	0 / 10	8	–
5q35.2 (170-170 Mb)	0 / 9	7	–
16q22.2 (71-72 Mb)	2 / 7	10	GLG1, GLG1
12q14.1 (59-59 Mb)	1 / 7	6	SUOX
17q12 (38-38 Mb)	4 / 4	34	ERBB2, GRB7, PNMT, MIEN1
5p12-5q11.1 (45-50 Mb)	0 / 7	4	–
5q11.1-11.2 (50-52 Mb)	0 / 7	3	–

The marginal network identified the 5q arm as having high degree and having also perturbed proteins with common functional roles according to the GO-neighbor percentage. This result conforms well to the spaceMap and original BCPLS analysis, which highlight the 5q arm loss in the basal molecular subtype.

hgp <- xHubEnrich(ig = imarg, go2eg = go2eg)
hgp2 <- hgp[hgp$degree > 10,]
kable(hgp2[order(hgp2$neighbor_percentage, decreasing = T),], row.names = FALSE)

hub	degree	neighbor_percentage
10p15.1-15.3	27	85.18519
5q33.3	13	84.61538
5q35.2	17	82.35294
5q34	36	80.55556
5q33.1-33.3	35	80.00000
5q34-35.1	33	78.78788
5q35.2	35	77.14286
5q34	26	76.92308
10p14-15.1	17	76.47059
5q35.1	16	75.00000
5q35.1	12	75.00000
5q35.2	16	75.00000
5q35.3	47	74.46809
5q23.3-33.1	31	74.19355
16q23.1-24.3	15	73.33333
16q21-22.1	18	72.22222
5q14.3	39	71.79487
5q35.2	20	70.00000
5q33.3-34	43	69.76744
17q21.32	26	69.23077
5q14.3	38	68.42105
5q11.2-14.3	34	67.64706
5q35.2-35.3	37	67.56757
16q22.1-22.2	27	66.66667
5q35.3	42	66.66667
5q11.2	21	66.66667
5q11.2	21	66.66667
5q14.3-23.2	33	66.66667
5q23.2-23.3	34	64.70588
5q11.2	14	64.28571
5q34	50	64.00000
17q21.32	11	63.63636
5q11.2	19	63.15789
5q11.2	19	63.15789
5q11.2	19	63.15789
5q11.2	17	58.82353
5q23.2	31	58.06452
17q21.32	16	56.25000
5q11.2	15	46.66667
12q13.2-13.3	11	45.45455
12q13.3	11	45.45455

Marginal Module Analysis

Use the edge-betweenness algorithm to cluster.

library(igraph)
mods <- cluster_edge_betweenness(imarg)
mod_tab <- table(mods$membership)

There are 15 modules of size 15 or greater.

The marginal network GO-enrichment is very limited—finding 9 GO-terms enriched—especially so when compared to spaceMap, which found 45 GO-enriched terms. Both methods identify common GO-enrichment of proteins involved in transcriptional regulation that are in trans-association with 5q as well as ion transmembrane transport associated with 17q21.32.

outmod <- modEnrich(ig = imarg, mods = mods, levels = "y", 
                    go2eg = go2eg, 
                    prefix = "MARG", process_alias = process_alias)
knitr::kable(outmod$etab, row.names = FALSE)

Module (size)	GO Category	GO Obs./ Total	X-hubs (hits)
MARG1 (81)	axon guidance	11/58
MARG11 (17)	Notch signaling pathway	5/15	14q12 (6); 14q21.2 (6)
	cell migration	5/25
	cell adhesion	5/105
MARG14 (16)	brain development	5/23	17q21.32 (8); 17q21.32 (5)
MARG2 (188)	transcription from RNA polymerase II promoter	8/20	5q34 (45); 5q35.3 (44)
	positive regulation of transcription from RNA polymerase II promoter	11/52
MARG4 (47)	ion transmembrane transport	6/21	17q21.32 (20); 17q21.32 (14)
	transmembrane transport	7/45

Export for visualization.

filename <- "~/repos/neta-bcpls/neta/marginal-prot-boot-vote.graphml"
#only keep vertices that have edges for visualization
vis <- delete_vertices(graph = outmod$ig, v = V(outmod$ig)[igraph::degree(outmod$ig) == 0])
igraph::write_graph(graph = vis, file = filename, format = "graphml")

Summary

In comparing the marginal network to spaceMap’s Boot.Vote network, we see the advantages of spaceMap. The commonality of CNA hubs between the two analyses inspires confidence in the novel spaceMap method. Further, spaceMap reports less redundant and more prominent CNA hubs through the use of the group penalty term; this has the effect of potentially narrowing down the region of potential cancer drivers better than a marginal analysis and take this as evidence of spaceMap’s more direct modeling approach. In the \(y-y\) edge space, there is much more GO-enrichment of the spaceMap method, showing it is more useful for extracting biological insight from high-dimensional proteogenomic data than marginal network analysis.