################ Potential coinfection analyses ################
#### 1. Data Sources and Preparing Data
#### 2. Sources explaining variation in the level of potential coinfection
#### 3. Plotting factors affecting the proportion of tested phage infecting hosts
#### 4. Summary Statistics on Potential Coinfection
#Load libraries
library(ggplot2)
library(XLConnect)
library(reshape2)
library(gridExtra)
library(dplyr)
#Simple function to calculate standar error of the mean
source("sem.R")
#### 1. Data Sources and Preparing Data ####
#Analyzing data from Flores et al. 2011 Reference:
#Flores, C. O., Meyer, J. R., Valverde, S., Farr, L., & Weitz, J. S. (2011). Statistical structure of host-phage interactions.
#PNAS, 108(28), E288-E297. doi:10.1073/pnas.1101595108/
#Import phage-host infection matrices
excel <- loadWorkbook(filename = "data/unprocessed/Flores_et_al_2011_data.xls")
#Now want to offload these sheets into R workable format
sheets <- readWorksheet(excel, sheet = getSheets(excel), header = FALSE)
#getSheets(excel) gives you a vector with the names of the sheets in the excel file
#Now sheets is a list with 38 data frames, each corresponding to a sheet in the file, i.e.
# the infection matrix for each study. But these are data frames, not matrices, so need to change
#lapply will apply a function to each element of a list, let's try with as.matrix()
flores2011 <- lapply(sheets, as.matrix)
is.matrix(sheets$'Zinno 2010')
#[1] FALSE
is.matrix(flores2011$'Zinno 2010')
#[1] TRUE #Looks good.
#Get the sum of all Rows from all the studies.
#In this data set H=rows (one for each bacterial host) and P=columns (one for each phage)
# I will primarily be interested in analyzing the rows, because these will indicate the
# number of phages that can potentially interact within a cell, i.e. coinfect
all_row_sums <- lapply(flores2011, rowSums)
#Now make a data frame to manipulate and plot data
phages_infecting <- data.frame(all_row_sums) #Doesn't work, see below
#Data unequal lengths, maybe this isn't best way to plot, let's work on
# summary statistics in list form and then take those summary stats into a data frame
#First the name of the studies
studies <- names(lapply(all_row_sums, max))
#Also going to use a numeric id for each study, to link with other metadata elsewhere in Flores et al. 2011
id <- 1:length(studies)
#reorganize the list to make a 'tidy' data frame
phages_infecting <- melt(all_row_sums)
#Adjust the names
names(phages_infecting) <- c("phages_infecting", "studies")
#Creating a Summary Data Frame
#number of phages tested?
phage_tested <- as.numeric(lapply(flores2011, ncol))
#number of bacteria tested?
bacteria_tested <- as.numeric(lapply(flores2011, nrow))
#Summary Stats of the number of phage infecting a host, H in Flores et al 2011
min <- as.numeric(lapply(all_row_sums, min))
mean <- as.numeric(lapply(all_row_sums, mean))
max <- as.numeric(lapply(all_row_sums, max))
sd <- as.numeric(lapply(all_row_sums, sd))
sem <- as.numeric(lapply(all_row_sums, sem))
#this is the summary DF for number of coinfecting phages
phages <- data.frame(id, studies, mean, sd, sem, max, min, bacteria_tested, phage_tested)
#Add Numeric ID by doing an inner join with the studies as the ID. This might be interesting.
phages_infecting_id <- inner_join(phages_infecting, subset(phages, select = c(studies, id, bacteria_tested, phage_tested)), by = "studies")
#Actually worked well
names(phages_infecting_id) <- c("phages_infecting", "studies", "ID", "bacteria_tested", "phage_tested")
#Import the metadata
#Import the metadata
#Study metadata is taken from pages 20-21 of Flores et al. 2011 Supplementary PDF
# These pages were OCR'd and manually curated into a CSV 'data/unprocessed/study_metadata.csv'
# A processed version of that file 'data/flores_et_al_2011_metadata_updated.csv'
# was manually curated to consolidate categories and remove blanks.
metadata <- read.csv("data/flores_et_al_2011_metadata_updated.csv")
View(metadata)
#Now another join to get the metadata added
phages_infecting_complete <- inner_join(phages_infecting_id, metadata, by = "ID")
#Since different numbers of phages were tested for each study need to use proportion of tested phage infecting
#to graph and maybe for analysis (more on that below)
phages_infecting_complete <- mutate(phages_infecting_complete, prop_infecting = phages_infecting/phage_tested)
#Now adding metadata to conform to JGI-GOLD Database guidelines and match the Culture Coinfection analyses
#Need to add just ECOSYSTEM, because ENERGY_SOURCE is listed here as Bacterialtrophy already
#Copying values of Isolation_Habitat_alt just to make new row
phages_infecting_complete <- mutate(phages_infecting_complete, ECOSYSTEM = Isolation_Habitat_alt)
#Clear factors so I can add new data
phages_infecting_complete$ECOSYSTEM <- as.character(phages_infecting_complete$ECOSYSTEM)
#Add metadata by study. This was hand curated based on the Flores et al. metadata and the original
# publications. When in conflict, I categorized according to the original reference. Note that these
# categories refer to the *host bacteria* which were not always isolated from the same place as the phages
phages_infecting_complete[phages_infecting_complete$studies == "Abe 2007", "ECOSYSTEM"] <- "Host-associated"
phages_infecting_complete[phages_infecting_complete$studies == "Barrangou 2002", "ECOSYSTEM"] <- "Engineered"
phages_infecting_complete[phages_infecting_complete$studies == "Braun-Breton 1981", "ECOSYSTEM"] <- "Engineered"
phages_infecting_complete[phages_infecting_complete$studies == "Campbell 1995", "ECOSYSTEM"] <- "Host-associated"
phages_infecting_complete[phages_infecting_complete$studies == "Capparelli 2010", "ECOSYSTEM"] <- "Host-associated"
phages_infecting_complete[phages_infecting_complete$studies == "Caso 1995", "ECOSYSTEM"] <- "Engineered"
phages_infecting_complete[phages_infecting_complete$studies == "Ceyssens 2009", "ECOSYSTEM"] <- "Host-associated"
phages_infecting_complete[phages_infecting_complete$studies == "Comeau 2005", "ECOSYSTEM"] <- "Environmental"
phages_infecting_complete[phages_infecting_complete$studies == "Comeau 2006", "ECOSYSTEM"] <- "Environmental"
phages_infecting_complete[phages_infecting_complete$studies == "DePaola 1998", "ECOSYSTEM"] <- "Environmental"
phages_infecting_complete[phages_infecting_complete$studies == "Doi 2003", "ECOSYSTEM"] <- "Engineered"
phages_infecting_complete[phages_infecting_complete$studies == "Duplessis 2001", "ECOSYSTEM"] <- "Engineered"
phages_infecting_complete[phages_infecting_complete$studies == "Gamage 2004", "ECOSYSTEM"] <- "Host-associated"
phages_infecting_complete[phages_infecting_complete$studies == "Goodridge 2003", "ECOSYSTEM"] <- "Host-associated"
phages_infecting_complete[phages_infecting_complete$studies == "Hansen 2007", "ECOSYSTEM"] <- "Host-associated"
phages_infecting_complete[phages_infecting_complete$studies == "Holmfeldt 2007", "ECOSYSTEM"] <- "Environmental"
phages_infecting_complete[phages_infecting_complete$studies == "Kankila 1994", "ECOSYSTEM"] <- "Host-associated"
phages_infecting_complete[phages_infecting_complete$studies == "Krylov 2006", "ECOSYSTEM"] <- "Engineered"
phages_infecting_complete[phages_infecting_complete$studies == "Kudva 1999", "ECOSYSTEM"] <- "Host-associated"
phages_infecting_complete[phages_infecting_complete$studies == "Langley 2003", "ECOSYSTEM"] <- "Environmental"
phages_infecting_complete[phages_infecting_complete$studies == "McLaughlin 2008", "ECOSYSTEM"] <- "Engineered"
phages_infecting_complete[phages_infecting_complete$studies == "Meyer unpub", "ECOSYSTEM"] <- "Engineered"
phages_infecting_complete[phages_infecting_complete$studies == "Middelboe 2009", "ECOSYSTEM"] <- "Environmental"
phages_infecting_complete[phages_infecting_complete$studies == "Miklic 2003", "ECOSYSTEM"] <- "Engineered"
phages_infecting_complete[phages_infecting_complete$studies == "Mizoguchi 2003", "ECOSYSTEM"] <- "Engineered"
phages_infecting_complete[phages_infecting_complete$studies == "Pantucek 1998", "ECOSYSTEM"] <- "Host-associated"
phages_infecting_complete[phages_infecting_complete$studies == "Paterson 2010", "ECOSYSTEM"] <- "Engineered"
phages_infecting_complete[phages_infecting_complete$studies == "Poullain 2008", "ECOSYSTEM"] <- "Engineered"
phages_infecting_complete[phages_infecting_complete$studies == "Quiberoni 2003", "ECOSYSTEM"] <- "Engineered"
phages_infecting_complete[phages_infecting_complete$studies == "Rybniker 2006", "ECOSYSTEM"] <- "Host-associated"
phages_infecting_complete[phages_infecting_complete$studies == "Seed 2005", "ECOSYSTEM"] <- "Host-associated"
phages_infecting_complete[phages_infecting_complete$studies == "Stenholm 2009", "ECOSYSTEM"] <- "Environmental"
phages_infecting_complete[phages_infecting_complete$studies == "Sullivan 2003", "ECOSYSTEM"] <- "Environmental"
phages_infecting_complete[phages_infecting_complete$studies == "Suttle 1993", "ECOSYSTEM"] <- "Environmental"
phages_infecting_complete[phages_infecting_complete$studies == "Synott 2009", "ECOSYSTEM"] <- "Engineered"
phages_infecting_complete[phages_infecting_complete$studies == "Wang 2008", "ECOSYSTEM"] <- "Environmental"
phages_infecting_complete[phages_infecting_complete$studies == "Wichels 1998", "ECOSYSTEM"] <- "Environmental"
phages_infecting_complete[phages_infecting_complete$studies == "Zinno 2010", "ECOSYSTEM"] <- "Engineered"
#Recalculate factors
factor(phages_infecting_complete$ECOSYSTEM)
#### 2. Sources explaining variation in the level of potential coinfection ####
#Analyzing the factors responsible for observed variation in potential coinfection: Can't use the
# number of phages infecting directly, because studies tested different numbers of phages
### 2a. One approach is to conduct an ANOVA directly on the proportions (bear with me before you groan,
# I will address limitations of this approach below)
model_all_prop <- aov(prop_infecting ~ Bacterialtrophy + Majority.source + Bacterial.Association + Isolation_Habitat_new + Bacteria_new, data = phages_infecting_complete)
summary(model_all_prop)
# Df Sum Sq Mean Sq F value Pr(>F)
# Bacterialtrophy 1 1.62 1.6173 19.373 1.19e-05 ***
# Majority.source 2 4.63 2.3165 27.749 1.91e-12 ***
# Bacterial.Association 7 8.40 1.2005 14.381 < 2e-16 ***
# Isolation_Habitat_new 22 29.29 1.3315 15.951 < 2e-16 ***
# Bacteria_new 3 1.83 0.6115 7.325 7.38e-05 ***
# Residuals 969 80.89 0.0835
summary.lm(model_all_prop)
#Residual standard error: 0.2889 on 969 degrees of freedom
#Multiple R-squared: 0.3614, Adjusted R-squared: 0.3384
#F-statistic: 15.67 on 35 and 969 DF, p-value: < 2.2e-166
(summary.aov(model_all_prop)[[1]]$'Sum Sq' / sum(summary.aov(model_all_prop)[[1]]$'Sum Sq'))*100
# Trophy #Source #Bact.Assoc #Isolation.Hab #Taxa Residuals
#Percentage explained: 1.276707 3.657382 6.634173 23.125500 1.448197 63.858042
#Reduced model
model_all_prop <- step(model_all_prop)
summary(model_all_prop)
# Df Sum Sq Mean Sq F value Pr(>F)
# Bacterial.Association 7 10.65 1.5214 18.24 < 2e-16 ***
# Isolation_Habitat_new 22 30.85 1.4024 16.81 < 2e-16 ***
# Bacteria_new 5 4.26 0.8518 10.21 1.5e-09 ***
# Residuals 970 80.91 0.0834
#---
# Signif. codes: 0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1
#Residual standard error: 0.2888 on 970 degrees of freedom
#Multiple R-squared: 0.3613, Adjusted R-squared: 0.3389
#F-statistic: 16.14 on 34 and 970 DF, p-value: < 2.2e-16
#Extract Percentage of Variance Explained
(summary.aov(model_all_prop)[[1]]$'Sum Sq' / sum(summary.aov(model_all_prop)[[1]]$'Sum Sq'))*100
# #Bact.Assoc #Isolation.Hab #Taxa #Residuals
#Percentage explained: 8.407191 24.356492 3.362131 63.874187
plot(model_all_prop)
#These diagnostic plots actually look pretty good. Now get all plots in one.
#Figure S1. ANOVA Plot diagnostics
par(mfrow=c(2,2))
plot(model_all_prop)
### 2b. Objections to ANOVA analysis on proportions say arc-sine transform can take care of issues
model_all_prop_arcsin <- lm(asin(prop_infecting) ~ Bacterialtrophy + Majority.source + Bacterial.Association + Isolation_Habitat_new + Bacteria_new, data = phages_infecting_complete)
summary.aov(model_all_prop_arcsin)
# Df Sum Sq Mean Sq F value Pr(>F)
# Bacterialtrophy 1 3.04 3.040 17.98 2.44e-05 ***
# Majority.source 2 16.12 8.060 47.67 < 2e-16 ***
# Bacterial.Association 7 16.42 2.346 13.88 < 2e-16 ***
# Isolation_Habitat_new 22 70.49 3.204 18.95 < 2e-16 ***
# Bacteria_new 3 2.77 0.923 5.46 0.00101 **
# Residuals 969 163.84 0.169
#---
# Signif. codes: 0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1
summary.lm(model_all_prop_arcsin)
#Residual standard error: 0.4112 on 969 degrees of freedom
#Multiple R-squared: 0.3992, Adjusted R-squared: 0.3775
#F-statistic: 18.39 on 35 and 969 DF, p-value: < 2.2e-16
#Extract Percentage of Variance Explained
(summary.aov(model_all_prop_arcsin)[[1]]$'Sum Sq' / sum(summary.aov(model_all_prop_arcsin)[[1]]$'Sum Sq'))*100
#Percentage explained: 1.114948 5.911753 6.023291 25.850664 1.015682 60.083662
#Slightly more variance explained than prop model
plot(model_all_prop)
#These diagnostic plots look a little better than without the transform
### 2c. Now the "right" way to do this is a logistic regression because I have counts and totals
#Make vector of failures (to infect)
failures <- phages_infecting_complete$phage_tested - phages_infecting_complete$phages_infecting
#Model
glm_all <- glm(cbind(phages_infecting, failures) ~ Bacterialtrophy + Majority.source + Bacterial.Association + Isolation_Habitat_new + Bacteria_new, family = binomial, data = phages_infecting_complete)
summary(glm_all)
anova(glm_all, test = "Chisq")
#Analysis of Deviance Table
#Model: binomial, link: logit
#Response: cbind(phages_infecting, failures)
#Terms added sequentially (first to last)
#Df Deviance Resid. Df Resid. Dev Pr(>Chi)
#NULL 1004 6033.6
# Bacterialtrophy 1 69.29 1003 5964.3 < 2.2e-16 ***
# Majority.source 2 45.70 1001 5918.6 1.194e-10 ***
# Bacterial.Association 7 506.57 994 5412.1 < 2.2e-16 ***
# Isolation_Habitat_new 22 1348.66 972 4063.4 < 2.2e-16 ***
# Bacteria_new 3 149.74 969 3913.7 < 2.2e-16 ***
#Model is overdispersed, so using quasibinomial family, per Crawley
# source http://www.math.chs.nihon-u.ac.jp/~tanaka/files/kenkyuu/ProportionData.pdf
glm_all_quasibinaomial <- glm(cbind(phages_infecting, failures) ~ Bacterialtrophy + Majority.source + Bacterial.Association + Isolation_Habitat_new + Bacteria_new, family = quasibinomial, data = phages_infecting_complete)
summary(glm_all_quasibinaomial)
anova(glm_all_quasibinaomial, test = "Chisq")
#Analysis of Deviance Table
#Model: quasibinomial, link: logit
#Response: cbind(phages_infecting, failures)
#Terms added sequentially (first to last)
# Df Deviance Resid. Df Resid. Dev Pr(>Chi)
# NULL 1004 6033.6
# Bacterialtrophy 1 69.29 1003 5964.3 1.174e-05 ***
# Majority.source 2 45.70 1001 5918.6 0.001777 **
# Bacterial.Association 7 506.57 994 5412.1 < 2.2e-16 ***
# Isolation_Habitat_new 22 1348.66 972 4063.4 < 2.2e-16 ***
# Bacteria_new 3 149.74 969 3913.7 5.118e-09 ***
# ---
# Signif. codes: 0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1
#Fit of the model
with(glm_all_quasibinaomial, null.deviance - deviance)
#[1] 2378.563
with(glm_all_quasibinaomial, df.null - df.residual)
#[1] 35
with(glm_all_quasibinaomial, pchisq(null.deviance - deviance, df.null - df.residual, lower.tail = FALSE))
#[1] 0
#### 3. Plotting factors affecting the proportion of tested phage infecting hosts ####
#Isolation Habitat
#Isolation Habitat ("Isolation_Habitat_new" = corrected: inserted missing category see data sheet)
gr1 <- ggplot(data=phages_infecting_complete, aes(x=Isolation_Habitat_new, y=prop_infecting)) +
geom_boxplot() +
geom_point(aes(colour=studies), position=position_jitter(width=0.2, height=0.02), alpha=0.25) +
coord_flip() + theme(legend.position="none") + xlab("Isolation Habitat") + ylab("Prop. of tested phage infecting") #use this one
#Isolation Habitat alternate classification collapsing too many categories
ggplot(data=phages_infecting_complete, aes(x=Isolation_Habitat_alt, y=prop_infecting)) +
geom_boxplot() +
geom_point(aes(colour=studies), position=position_jitter(width=0.2, height=0.02), alpha=0.75) +
coord_flip() + theme(legend.position="none")
#Bacterial Association
gr2 <- ggplot(data=phages_infecting_complete, aes(x=Bacterial.Association, y=prop_infecting)) +
geom_boxplot() +
geom_point(aes(colour=studies), position=position_jitter(width=0.2, height=0.02), alpha=0.25) +
coord_flip() + theme(legend.position="none") + xlab("Bacterial Association") + ylab("Prop. of tested phage infecting")
#Don't use Bacterial_Association_new, previous categories seem adequate
#Type of Study (Majority Source)
gr3 <- ggplot(data=phages_infecting_complete, aes(x=Majority.source, y=prop_infecting)) +
geom_boxplot() +
geom_point(aes(colour=studies), position=position_jitter(width=0.3, height=0.02), alpha=0.25) +
coord_flip() + theme(legend.position="none") + xlab("Study Type Source") + ylab("Prop. of tested phage infecting")
#Bacterial Trophy
gr4 <- ggplot(data=phages_infecting_complete, aes(x=Bacterialtrophy, y=prop_infecting)) +
geom_boxplot() +
geom_point(aes(colour=studies), position=position_jitter(width=0.3, height=0.02), alpha=0.25) +
coord_flip() + theme(legend.position="none") +xlab("Bacterial Trophy") + ylab("Prop. of tested phage infecting")
#Bacteria
ggplot(data=phages_infecting_complete, aes(x=Bacteria, y=prop_infecting)) +
geom_boxplot() +
geom_point(aes(colour=studies), position=position_jitter(width=0.1, height=0.02), alpha=0.75) +
coord_flip() + theme(legend.position="none")
#Bacteria, cleaning up and consolidating categories
gr5 <- ggplot(data=phages_infecting_complete, aes(x=Bacteria_new, y=prop_infecting)) +
geom_boxplot() +
geom_point(aes(colour=studies), position=position_jitter(width=0.1, height=0.02), alpha=0.25) +
coord_flip() + theme(legend.position="none") +
xlab("Bacterial Taxa") + ylab("Prop. of tested phage infecting") #This seems like the most adequate way to describe variability
#Bacteria, using families instead
ggplot(data=phages_infecting_complete, aes(x=Bacteria_family, y=prop_infecting)) +
geom_boxplot() +
geom_point(aes(colour=studies), position=position_jitter(width=0.1, height=0.02), alpha=0.75) +
coord_flip() + theme(legend.position="none")
#All plots full and reduced model Figure S2
#Bring plots together. Requires package GridExtra
grid.arrange(gr1, gr2, gr5, gr3, gr4, nrow = 2)
#Now draw both ANOVA tables for the remaining open panel
full_table <-anova(lm(prop_infecting ~ Bacterialtrophy + Majority.source +
Bacterial.Association + Isolation_Habitat_new + Bacteria_new,
data = phages_infecting_complete))
ftable <- tableGrob(round(full_table, digits=4))
#Now with step-AIC reduced model
reduced_table <-anova(lm(prop_infecting ~ Bacterial.Association + Isolation_Habitat_new +
Bacteria_new, data = phages_infecting_complete))
rtable <- tableGrob(round(reduced_table, digits=4))
grid.arrange(ftable, rtable, nrow=2)
#Only reduced model Plots for Figure 1
grid.arrange(gr1, gr2, gr5, nrow = 2)
reduced_table <-anova(lm(prop_infecting ~ Bacterial.Association + Isolation_Habitat_new + Bacteria_new, data = phages_infecting_complete))
rtable <- tableGrob(round(reduced_table, digits=4))
grid.arrange(rtable)