Classification of protein tail-anchoring locations.
This code was use for the analysis in the paper “Predicting the targeting of tail-anchored proteins to subcellular compartments in mammalian cells” by Costello et al. (2017). Journal of Cell Science 2017 130: 1675-1687; doi: 10.1242/jcs.200204
https://jcs.biologists.org/content/130/9/1675.short
Code by Doug McNeall based on data by Joe Costello.
The code currently uses a support vector machine (SVM) for classification of tail-anchoring location, depending on TMD hydrophobicity and tail charge.
Code is here: https://github.com/dougmcneall/protein/blob/master/docs/protein.Rmd
set.seed(42)
library(RColorBrewer)
library(e1071)
library(MASS)
paired <- brewer.pal(6,'Paired')
blues <- brewer.pal(9,'Blues')
mitoblue <- '#A2D9F7'
pogreen <- '#A8D4AF'
ersalmon <- '#F6B096'
catpat <- paired
catpat[1] <- ersalmon
catpat[2] <- mitoblue
catpat[6] <- pogreen
catpat[3] <- paired[6]
catpat[4] <- paired[2]
catpat[5] <- paired[4]
truncpat <- c(ersalmon, mitoblue, pogreen)
protein <- read.csv('../data/protein.csv')
# Or, we could just load up the truncated file (added 10th December 2015)
# This breaks the colour palettes at the moment.
protein_trunc <- read.csv('../data/protein_trunc.csv')
# factors are alphabetical, so as.numeric gives
# ER = 1
# MIT0 = 2
# MITO/ER = 3
# MITO/PO = 4
# MITO/PO/ER = 5
# PO = 6
# Training data
train <- subset(protein, select = c('tmd_gravy', 'tail_charge'))
cl <- protein$location
# Data truncated to just the main 3 classes
#protein_trunc <- subset(protein, location=='ER' | location=='MITO' | location=='PO')
train_trunc <- subset(protein_trunc, select = c('tmd_gravy', 'tail_charge'))
cl_trunc <- protein_trunc$location
upper <- apply(train_trunc, 2, max)
lower <- apply(train_trunc, 2, min)
tmd_gravy_seq <- seq(from=lower[1], to=upper[1], length.out = 50)
tail_charge_seq <- seq(from=lower[2], to=upper[2], length.out = 50)
# A grid of test points that covers the entire data space
test <- as.matrix(expand.grid(tmd_gravy_seq, tail_charge_seq))
colnames(test) <- colnames(train_trunc)
# skip the last row, as there is no tail charge
target<- read.table('../data/protein_target.txt', sep = '\t', nrows = 244, header = TRUE)
colnames(target) <- colnames(train_trunc)
mutants <- read.csv('../data/mutants.csv')
# A little aside to get the factors in the data sets to be the same. rbind()
# changes the as.numeric factors to match.
combined <- rbind(protein, mutants)
mutants.ix <- seq(from = (nrow(protein)+1), to = nrow(protein)+nrow(mutants), by = 1)
mutants_refactor <- combined[mutants.ix, ]
# "Shared location" data
protein_shared <- subset(protein, location=='MITO/ER' | location=='MITO/PO' | location=='MITO/PO/ER')
# some new test sites
test.sites <- data.frame(cbind(c(1.54, 1.43, 2.34), c(3.9, 4.6, -0.1)))
colnames(test.sites) <- c('tmd_gravy', 'tail_charge')
#1) TMD GRAVY = 1.54 Charge = 3.9
#2) TMD GRAVY = 1.43 Charge = 4.6
#3) TMD GRAVY = 2.34 Charge = -0.1
par(las = 1, cex = 1.3)
plot(protein$tmd_gravy, protein$tail_charge, bg = catpat[protein$location],
pch = 21, col = 'black',
xlab = 'tmd_gravy', ylab = 'tail_charge')
points(mutants_refactor$tmd_gravy, mutants_refactor$tail_charge, col = 'black', bg = catpat[mutants_refactor$location], pch = 24)
legend('topleft', legend = unique(protein$location), pt.bg = catpat[unique(protein$location)], pch = 21, text.col = catpat[unique(protein$location)], cex = 0.8, bty = 'n' )
legend('top', legend = c( 'triangles indicate mutants'), pch = c(24), cex = 0.8, bty = 'n')
#dev.print(pdf, file='protein_and_mutants.pdf', width = 6, height = 6)
# Data truncated to just the 3 main classes
#dev.new()
par(las = 1, cex = 1.3)
plot(protein_trunc$tmd_gravy, protein_trunc$tail_charge, col = 'black', bg = truncpat[protein_trunc$location], pch = 21,
xlab = "tmd_gravy", ylab = "tail_charge")
legend('topleft', legend = unique(protein_trunc$location), col = 'black', pt.bg = truncpat[unique(protein_trunc$location)], pch = 21, text.col = truncpat[unique(protein_trunc$location)] )
#dev.print(pdf, file='protein_trunc.pdf', width = 6, height = 6)
Train a non-probabilistic support vector machine (SVM) on the truncated data, and visualise the decision boundaries.
# Non-probabilistic SVM fit
svm.fit <- svm(location~., data = protein_trunc, probability = FALSE)
svm.pred <- predict(svm.fit, newdata=test, probability = FALSE)
# misclassification rate for the svm is ~18%
svm.mcr = 1 - (sum(predict(svm.fit)==protein_trunc$location)/ length(protein_trunc$location))
par(las = 1, cex = 1.3)
plot(test, col = truncpat[svm.pred], pch = 20, cex = 0.8, main = 'SVM classifier prediction')
points(protein_trunc$tmd_gravy, protein_trunc$tail_charge, col = 'black', bg = truncpat[protein_trunc$location], pch = 21)
legend('topleft', legend = unique(protein_trunc$location),
pt.bg = truncpat[unique(protein_trunc$location)],
pch = 21,
col = 'black',
text.col=truncpat[unique(protein_trunc$location)],
bg = 'white')
#dev.print(pdf, file = 'svm_prediction.pdf', width = 6, height = 6)
Train a probabilistic support vector machine (SVM) on the truncated data, and visualise the probability contours.
# Why does this predict some non-seen classes?
svm.fit.prob <- svm(location~., data = protein_trunc, probability = TRUE)
svm.pred.prob <- predict(svm.fit.prob, newdata=test, probability = TRUE)
# How many does the svm fit correctly?
fit.hit <- sum(svm.fit.prob$fitted == protein_trunc$location)
fit.miss <- length(protein_trunc$location) - fit.hit
prob.mcr <- fit.miss / length(protein_trunc$location)
svm.prob.df <- attr(svm.pred.prob,'prob')
mitomat <- matrix(svm.prob.df[,'MITO'], nrow = length(tmd_gravy_seq))
pomat <- matrix(svm.prob.df[,'PO'], nrow = length(tmd_gravy_seq))
ermat <- matrix(svm.prob.df[,'ER'], nrow = length(tmd_gravy_seq))
protein_trunc.fit <- cbind(protein_trunc,as.character(svm.fit.prob$fitted))
colnames(protein_trunc.fit)[4] <- c('predicted')
#write.matrix(protein_trunc.fit, file = 'protein_trunc_fit.txt')
n <- length(protein_trunc$location)
cv.out <- rep(NA, n)
# A simple looping leave-one-out cross validation
for(i in 1:n){
cv.fit <- svm(location~., data = protein_trunc[-i, ], probability = TRUE)
cv.pred <- predict(cv.fit, newdata=protein_trunc[i, ], probability = TRUE)
cv.out[i] <- as.character(cv.pred)
}
loo.hitrate <- sum(cv.out == as.character(protein_trunc$location)) / length(protein_trunc$location)
labcex = 0.8
pt.cex = 1.3
par(las=1, mar = c(4,4,1,1))
plot(test, col = truncpat[svm.pred], type = 'n', main = '',
xlab = 'TMD GRAVY', ylab = 'Tail charge')#, main = 'SVM classifier class probability')
contour(tmd_gravy_seq, tail_charge_seq, mitomat,
add = TRUE, levels = c(0.5,0.6,0.7,0.8,1), col = mitoblue,labcex = labcex)
contour(tmd_gravy_seq, tail_charge_seq, pomat,
add = TRUE, levels = c(0.5,0.6,0.7,0.8,1), col = pogreen, labcex = labcex)
contour(tmd_gravy_seq, tail_charge_seq, ermat,
add = TRUE,levels = c(0.5,0.6,0.7,0.8), col = ersalmon, labcex = labcex)
points(protein_trunc$tmd_gravy, protein_trunc$tail_charge,
col = 'black',bg = truncpat[protein_trunc$location], pch = 21, cex = pt.cex)
points(test.sites$tmd_gravy, test.sites$tail_charge,
col = 'black', pch = 21, lwd = 1.5, cex = pt.cex)
legend('topleft', legend = c(as.character(unique(protein_trunc$location)), 'test'),
col = 'black', pt.bg = c(truncpat[unique(protein_trunc$location)],'white'), pch = 21, text.col = c(truncpat[unique(protein_trunc$location)], 'black'), bg = 'white', pt.lwd = c(rep(1,3),1.5), pt.cex = pt.cex)
#dev.print(pdf, file='svm_probability_contour.pdf', width = 5, height = 5)
#dev.print(tiff, file='svm_probability_contour.tif', width = 480, height = 480)
svm.protein <- svm(location~., data = protein, probability = TRUE)#!!
protein.hit <- sum(svm.protein$fitted == protein$location)
protein.fit <- cbind(protein,as.character(svm.protein$fitted))
colnames(protein.fit)[4] <- c('predicted')
#write.matrix(protein.fit, file = 'protein_fit.txt')
Predict tail anchoring location at new values of tmd_gravy and tail_charge.
# ---------------------------------------------------------
# Predict the location in the target data
# ---------------------------------------------------------
target.prob <- predict(svm.fit.prob, newdata = target, probability = TRUE )
par(las=1)
plot(target, type = 'n', main = 'SVM classifier target predictions')
points(target, col = truncpat[target.prob], pch = 21)
contour(tmd_gravy_seq, tail_charge_seq, mitomat, add = TRUE, levels = c(0.5,0.6,0.7,0.8,1), col = mitoblue)
contour(tmd_gravy_seq, tail_charge_seq, pomat,add = TRUE, levels = c(0.5,0.6,0.7,0.8,1), col = pogreen)
contour(tmd_gravy_seq, tail_charge_seq, ermat,add = TRUE, levels = c(0.5,0.6,0.7,0.8), col = ersalmon)
points(protein_trunc$tmd_gravy, protein_trunc$tail_charge, col = 'black',bg = truncpat[protein_trunc$location], pch = 21)
legend('topright', legend = unique(protein_trunc$location), col = 'black', pt.bg = truncpat[unique(protein_trunc$location)], pch = 21, text.col = truncpat[unique(protein_trunc$location)], bg = 'white')
#dev.print(pdf, file = 'protein_target.pdf', width = 6, height = 6)
#write.matrix(cbind(target,attr(target.prob, 'probabilities'),target.prob), file = 'target_predictions.txt')
# A diagram which has the "mutants" data on top of the
# probability contours for the three unique classes.
#dev.new(width = 6, height = 6)
par(las=1)
plot(target, type = 'n', main = '', xlim = c(0.95,2.8), ylim = c(-3.2,5.6))
contour(tmd_gravy_seq, tail_charge_seq, mitomat, add = TRUE, levels = c(0.5,0.6,0.7,0.8,1), col = catpat[2])
contour(tmd_gravy_seq, tail_charge_seq, pomat,add = TRUE, levels = c(0.5,0.6,0.7,0.8,1), col = catpat[6])
contour(tmd_gravy_seq, tail_charge_seq, ermat,add = TRUE, levels = c(0.5,0.6,0.7,0.8), col = catpat[1])
points(protein_trunc$tmd_gravy, protein_trunc$tail_charge, col = catpat[protein_trunc$location], pch = 21)
points(mutants_refactor$tmd_gravy, mutants_refactor$tail_charge, col = 'black', bg = catpat[mutants_refactor$location], pch = 24)
text(mutants_refactor$tmd_gravy, mutants_refactor$tail_charge, col = catpat[mutants_refactor$location], labels = letters[1:length(mutants_refactor$location)], pos = 2)
legend('bottomright', legend = unique(protein$location), text.col = catpat[unique(protein$location)],cex = 0.8, text.font = 2, bty = 'n')
par(xpd = TRUE)
#legend('topright', legend = unique(protein$location), text.col = catpat[unique(protein$location)])
legend('top', legend = 'denotes mutant', text.col = 'black', pch = 24, pt.bg = c( 'grey'), inset = c(0, -0.1), bty = 'n')
#dev.print(pdf, file='mutants_foreground.pdf', width = 6, height = 6)
# A diagram which has the "shared location" data on top of the
# probability contours for the three unique classes.
#dev.new(width = 6, height = 6)
par(las=1)
plot(target, type = 'n', main = '', xlim = c(0.95,2.8), ylim = c(-3.2,5.6))
contour(tmd_gravy_seq, tail_charge_seq, mitomat, add = TRUE, levels = c(0.5,0.6,0.7,0.8,1), col = catpat[2])
contour(tmd_gravy_seq, tail_charge_seq, pomat,add = TRUE, levels = c(0.5,0.6,0.7,0.8,1), col = catpat[6])
contour(tmd_gravy_seq, tail_charge_seq, ermat,add = TRUE, levels = c(0.5,0.6,0.7,0.8), col = catpat[1])
points(protein_trunc$tmd_gravy, protein_trunc$tail_charge, col = truncpat[protein_trunc$location], pch = 21)
points(protein_shared$tmd_gravy, protein_shared$tail_charge, col = 'black', bg = catpat[protein_shared$location], pch = 22)
text(protein_shared$tmd_gravy, protein_shared$tail_charge, col = catpat[protein_shared$location], labels = letters[1:length(protein_shared$location)], pos = 2)
legend('bottomright', legend = unique(protein$location), text.col = catpat[unique(protein$location)],cex = 0.8, text.font = 2, bty = 'n')
par(xpd = TRUE)
#legend('topright', legend = unique(protein$location), text.col = catpat[unique(protein$location)])
legend('top', legend = 'denotes shared location', text.col = 'black', pch = 22, pt.bg = c( 'grey'), inset = c(0, -0.1), bty = 'n')
#dev.print(pdf, file='shared_foreground.pdf', width = 6, height = 6)
#stop()
filled.contour(tmd_gravy_seq, tail_charge_seq, mitomat,
levels=seq(from=0, to=1, by=0.2) ,
xlab='tmd_gravy', ylab='tail_charge', main='SVM MITO probability', col=blues)
#dev.new()
filled.contour(tmd_gravy_seq, tail_charge_seq, pomat,
levels = seq(from = 0, to = 1, by = 0.2) ,
xlab='tmd_gravy', ylab='tail_charge', main='SVM PO probability', col=blues)
#dev.new()
filled.contour(tmd_gravy_seq, tail_charge_seq, ermat,
levels = seq(from = 0, to = 1, by = 0.2) ,
xlab='tmd_gravy', ylab='tail_charge', main='SVM ER probability', col=blues)
new_data <- read.csv('../data/new_data.csv')
ecunc <- new_data[ , 2:3]
colnames(ecunc) <- colnames(protein)[1:2]
ix <- is.finite( ecunc[, 'tmd_gravy'] )
ecunc_clean <- ecunc[ix, ]
target.prob.ecunc <- predict(svm.fit.prob, newdata=ecunc_clean, probability = TRUE)
svm.prob.df.ecunc <- attr(target.prob.ecunc,'prob')
par(las=1)
# set up the axes
plot(ecunc_clean, type = 'n', main = 'SVM classifier target predictions', xlim = c(0.6,4), ylim = c(-4, 7))
# "highest probability" classification of the new data
points(ecunc_clean, col = truncpat[target.prob.ecunc], pch = 21, cex = 2)
# probability contours from the original model
contour(tmd_gravy_seq, tail_charge_seq, mitomat, add = TRUE, levels = c(0.5,0.6,0.7,0.8,1), col = mitoblue)
contour(tmd_gravy_seq, tail_charge_seq, pomat,add = TRUE, levels = c(0.5,0.6,0.7,0.8,1), col = pogreen)
contour(tmd_gravy_seq, tail_charge_seq, ermat,add = TRUE, levels = c(0.5,0.6,0.7,0.8), col = ersalmon)
# plot the original protein_trunc data
points(protein_trunc$tmd_gravy, protein_trunc$tail_charge, col = 'black',bg = truncpat[protein_trunc$location], pch = 21,
cex = 2)
legend('topright', legend = unique(protein_trunc$location), col = 'black', pt.bg = truncpat[unique(protein_trunc$location)], pch = 21, text.col = truncpat[unique(protein_trunc$location)], bg = 'white')