Annotation of BCPLS

Annotation of gene features is very application specific. In this document, we illustrate how we mapped biological processes from the Gene Ontology (GO) onto protein and RNA expressions. A Gene Ontology (GO) universe is defined to be all the biological processess specifically found among the protein and mRNA features used as input to spaceMap. This document shows how the GO universes were constructed for downstrem GO-enrichment analysis of network modules for the networks prot-net and RNA-net in the spaceMap publication.

Previous Annotations

In processing the mRNA and protein expression data from step 1, we already have formed gene coordinate annotations. These are stored in the ExpresionSet objects. The existing protein annotations are accessed as follows:

suppressPackageStartupMessages(library(Biobase))
protset <- readRDS(file = "data/prot-expression-set.rds")
protinfo <- pData(featureData(protset))

head(protinfo)

id	alias	chr	start	end	strand	description
NP_112598	EPPK1	chr8	143857318	143878464	-	epiplakin
NP_001714	DST	chr6	56457986	56954628	-	bullous pemphigoid antigen 1 isoform 1e precursor
NP_002465	MYH11	chr16	1361145	15857030	-	myosin-11 isoform SM1A
NP_001035202	MYH11	chr16	1361145	15857030	-	myosin-11 isoform SM2B
NP_001070654	MYH14	chr19	50203627	50310544	+	myosin-14 isoform 1
NP_079005	MYH14	chr19	50203627	50310544	+	myosin-14 isoform 2

In similary fashion, we can access the mRNA and CNA annotations. Now we focus on establishing a map from GO terms to features, which will be stored in list format because the mapping is not 1-to-1.

Protein GO universe

Load the protein features and map their identiers from RefSeq accessions to entrez gene accessions. We need the entrez gene accessions to map to the Gene Ontology.

protset <- readRDS(file = "data/prot-expression-set.rds")
suppressPackageStartupMessages(library(mygene))
p2r77map <- queryMany(qterms = featureNames(protset), scopes = "refseq",
                      fields = c("entrezgene"))
p2r77map2 <- data.frame(refseq = p2r77map@listData$query, entrezgene = p2r77map@listData$entrezgene)
saveRDS(p2r77map2, file = "data/prot-refseq-to-entrezgene-map.rds")

Get the entrez genes with GO mappings.

suppressMessages(library(org.Hs.eg.db))

## Warning: package 'AnnotationDbi' was built under R version 3.4.1

## Warning: package 'IRanges' was built under R version 3.4.1

## Warning: package 'S4Vectors' was built under R version 3.4.1

x <- org.Hs.egGO
hs_feature_list <- function(db, eg) { 
  mk <- mappedkeys(db)
  xx <- as.list(db[eg[eg %in% mk]])
}
# Get the entrez gene identifiers that are mapped to a GO ID
mapped_genes <- mappedkeys(x)
entrezgene.qe.go <- as.character(p2r77map2$entrezgene[ p2r77map2$entrezgene %in% mapped_genes])
leg2go <-  hs_feature_list(db = org.Hs.egGO, eg = entrezgene.qe.go)

Map the entrez genes to GO biological process terms with some valid evidence (i.e. not “ND”).

suppressPackageStartupMessages(library(foreach))
leg2go_filtered <- foreach(got = leg2go, eg = names(leg2go)) %do% { 
  bp_domain <- sapply(got, function(x) x[["Ontology"]]) == "BP"
  #bp_domain <- TRUE
  not_nd_domain <- sapply(got, function(x) x[["Evidence"]]) != "ND"
  bp_got <- got[bp_domain & not_nd_domain]
  sapply(bp_got, function(x) x[["GOID"]]) 
}
names(leg2go_filtered) <- names(leg2go)
leg2go_filtered <- leg2go_filtered[sapply(leg2go_filtered, length) != 0]

Reverse the mapping so that the key is a GO biological process and the values are a character vector of RefSeq accessions. Keep those keys that have at least 15 but no more than 300 RefSeq accessions.

suppressMessages(library(topGO))
go2eg <- inverseList(leg2go_filtered)
#switch to refseq id because of graph id and to include all isoforms
go2eg <- lapply(go2eg, function(x) as.character(p2r77map2$refseq[ p2r77map2$entrezgene %in% x ]))
#remove duplicate entrez genes
go2eg <- lapply(go2eg, function(x) x[!duplicated(x)])
#saveRDS(go2eg, file = "~/repos/neta-bccptac/data/no-len-trim-prot-go-bp-to-entrez-gene-list.rds")
gosize <- sapply(go2eg, length)
go2eg <- go2eg[ gosize >= 15 & gosize <= 300]
#assure each go term has a list of non-duplicate ids
stopifnot(all(sapply(go2eg, anyDuplicated) == 0))
eg2go <- inverseList(go2eg)

Save the protein GO universe.

saveRDS(go2eg, file = "data/prot-go-bp-to-entrez-gene-list.rds")
saveRDS(eg2go, file = "data/prot-go-entrez-gene-to-bp-list.rds")

Clear the workspace for the RNA analysis.

rm(list = ls())

RNA GO universe

The identifier is already an entrez accession for the mRNA features. Obtain the entrez genes with their GO mappings.

rnaset <- readRDS(file = "data/rna-expression-set.rds")
rna_entrez_ids <- featureNames(rnaset)
library(IRanges)
library(org.Hs.eg.db)
hs_feature_list <- function(db, eg) { 
  mk <- mappedkeys(db)
  xx <- as.list(db[eg[eg %in% mk]])
}
leg2go <- hs_feature_list(db = org.Hs.egGO, eg = rna_entrez_ids)

Map the entrez accessions to GO biological process terms with some valid evidence (not “ND”).

library(foreach)
leg2go_filtered <- foreach(got = leg2go, eg = names(leg2go)) %do% { 
  bp_domain <- sapply(got, function(x) x[["Ontology"]]) == "BP"
  #bp_domain <- TRUE
  not_nd_domain <- sapply(got, function(x) x[["Evidence"]]) != "ND"
  bp_got <- got[bp_domain & not_nd_domain]
  sapply(bp_got, function(x) x[["GOID"]]) 
}
names(leg2go_filtered) <- names(leg2go)
leg2go_filtered <- leg2go_filtered[sapply(leg2go_filtered, length) != 0]

Reverse the mapping so that the key is a GO biological process and the values are a character vector of entrez gene accessions. Keep those keys that have at least 15 but no more than 300 RefSeq accessions.

suppressPackageStartupMessages(library(topGO))
go2eg <- inverseList(leg2go_filtered)
#remove duplicate entrez genes
go2eg <- lapply(go2eg, function(x) x[!duplicated(x)])
saveRDS(go2eg, file = "~/repos/neta-bccptac/data/no-len-trim-rna-go-bp-to-entrez-gene-list.rds")
gosize <- sapply(go2eg, length)
go2eg <- go2eg[ gosize >= 15 & gosize <= 300]
#number of go terms satisfying conditions
length(go2eg)

## [1] 110

#assure each go term has a list of non-duplicate ids
stopifnot(all(sapply(go2eg, anyDuplicated) == 0))
eg2go <- inverseList(go2eg)

Save the RNA GO universe.

saveRDS(go2eg, file = "data/rna-go-bp-to-entrez-gene-list.rds")
saveRDS(eg2go, file = "data/rna-go-entrez-gene-to-bp-list.rds")

Next Step in Analysis

Please see the Network Analysis article for the next step in the analysis.

Session Info

sessionInfo()

## R version 3.4.0 (2017-04-21)
## Platform: x86_64-apple-darwin15.6.0 (64-bit)
## Running under: macOS Sierra 10.12.6
## 
## Matrix products: default
## BLAS: /Library/Frameworks/R.framework/Versions/3.4/Resources/lib/libRblas.0.dylib
## LAPACK: /Library/Frameworks/R.framework/Versions/3.4/Resources/lib/libRlapack.dylib
## 
## locale:
## [1] en_US.UTF-8/en_US.UTF-8/en_US.UTF-8/C/en_US.UTF-8/en_US.UTF-8
## 
## attached base packages:
## [1] stats4    parallel  stats     graphics  grDevices utils     datasets 
## [8] methods   base     
## 
## other attached packages:
##  [1] topGO_2.28.0         SparseM_1.77         GO.db_3.4.1         
##  [4] graph_1.54.0         foreach_1.4.3        org.Hs.eg.db_3.4.1  
##  [7] AnnotationDbi_1.38.2 IRanges_2.10.3       S4Vectors_0.14.4    
## [10] Biobase_2.36.2       BiocGenerics_0.22.0  knitr_1.17          
## 
## loaded via a namespace (and not attached):
##  [1] Rcpp_0.12.12       magrittr_1.5       bit_1.1-12        
##  [4] lattice_0.20-35    rlang_0.1.2        stringr_1.2.0     
##  [7] blob_1.1.0         tools_3.4.0        grid_3.4.0        
## [10] DBI_0.7            matrixStats_0.52.2 iterators_1.0.8   
## [13] htmltools_0.3.6    yaml_2.1.14        bit64_0.9-7       
## [16] rprojroot_1.2      digest_0.6.12      tibble_1.3.4      
## [19] codetools_0.2-15   memoise_1.1.0      evaluate_0.10.1   
## [22] RSQLite_2.0        rmarkdown_1.6      stringi_1.1.5     
## [25] compiler_3.4.0     backports_1.1.0    pkgconfig_2.0.1