项目作者: carleton-spacehogs

项目描述 :
A repository as a Supplementary Methods of Moulana et al., 2019
高级语言: Python
项目地址: git://github.com/carleton-spacehogs/pangenome-selection.git
创建时间: 2019-07-16T19:39:09Z
项目社区:https://github.com/carleton-spacehogs/pangenome-selection

开源协议:

下载


Processing Data and Generating Figures for Moulana et al.

Alief Moulana

To perform the analyses, please download or clone this repository, and open pangenome-selection.Rmd. Codes can be run remotely and match the resulting data and figures as shown here and in pangenome-selection.md.

  1. knitr::opts_chunk$set(echo = TRUE)
  2. #load all necessary packages
  3. library(reshape2)
  4. library(ggplot2)
  5. library(tidyverse)
  6. library(tidyr)
  7. library(dplyr)
  8. library(stringr)
  9. library(factoextra)
  10. library(RColorBrewer)
  11. library(tm)
  12. #load all the data needed
  13. pangenome <- as.matrix(read.table('aux_files/similarity_matrix.txt',
  14. quote = "",sep='\t',header=FALSE))
  15. #load the similarity matrix data
  16. cluster<- read.table('aux_files/Sulfurovum_gene_clusters_summary_cleaned.txt',sep='\t',quote="",header=TRUE)
  17. #load the summarized sulfurovum pangenome profile from anvi'o

Taxonomy distribution and pangenomic profile

We performed the common metagenomic pipeline of assembly, mapping,
binning, and annotation (see Methods; refer to citations). We then
analyzed the taxonomic composition in each region using Excel
(Supplementary Figure 1; from Taxonomy_data.xlsx). Because Sulfurovum
was the most abundant genus in both Mid Cayman Rise and Axial genomes,
we used Sulfurovum MAGs in the subsequent pangenome analyses following
“anvi’o” pangenome workflow. Then we captured Figure 1A using command
anvi-display-pan with genomes storage and pangenome databases
generated from the previous steps. Moreover, we obtained the summary of
the pangenome profile (the cluster file
Sulfurovum_gene_clusters_summary_cleaned.txt) by running the command
anvi-summarize for the PAN DB. We also analyzed how the pattern of
total and core genome accumulation as more MAGs considered. We visualize
the result from the chunk below (Supplementary Figure 2).

  1. cluster.basic.count <- unique(data.frame(Group=cluster$gene_cluster_id,
  2. Number=cluster$num_genomes_gene_cluster_has_hits,
  3. Genome=cluster$genome_name))
  4. # create df, remove duplicates
  5. cluster.basic.count$Group <- as.character(cluster.basic.count$Group)
  6. #reshape df
  7. cluster.basic.reshaped <- reshape(cluster.basic.count,v.names="Number",
  8. timevar="Genome",idvar=c("Group"),direction="wide")
  9. cluster.basic.matrix <- as.matrix(cluster.basic.reshaped[,2:23]) #turn into matrix
  10. cluster.basic.matrix[] <- c(TRUE,FALSE)[(is.na(cluster.basic.matrix) )+ 1] #binary
  11. total.each.genome <- apply(cluster.basic.matrix,2,sum) #numb of gene groups per genome
  12. # create a function returning whether or not the sum of a row up until i equals i
  13. count.sum <- function(row,i){
  14. sum(row[1:i]) == i
  15. }
  16. # create a vector that count the number of core genes taking into account i genomes
  17. core.now <- c()
  18. for(j in 1:22){
  19. core.now[j] <- sum(apply(cluster.basic.matrix,1,count.sum,j))
  20. }
  21. # create a function to perform the analysis below
  22. count.sum.2 <- function(numb,x){
  23. var <- sum(apply(as.matrix(cluster.basic.matrix[,1:x-1]),1,sum) ==0 &
  24. cluster.basic.matrix[,x] == 1)
  25. return(numb+var)
  26. }
  27. # create a vector that expands number of genes as more genomes sequenced
  28. total.now <- c()
  29. total.now[1] <- total.each.genome[1]
  30. count <- total.now[1]
  31. for(k in 2:22){
  32. total.now[k] <- count.sum.2(count,k)
  33. count <- total.now[k]
  34. }
  35. # merge the core now and total now
  36. count.groups <- data.frame(Number=c(1:22),Core=core.now,Total=total.now)
  37. ggplot(count.groups,aes(x=Number))+
  38. geom_line(aes(y=Core,colour="Number of Core Genes"))+
  39. geom_line(aes(y=Total/5,colour="Number of Total Genes"))+
  40. scale_y_continuous(sec.axis = sec_axis(~.*5, name = "Number of Total Genes"))+
  41. xlab("Number of MAGs Recovered") +
  42. ylab("Number of Core Genes") +
  43. labs(colour = "Parameter")+
  44. theme_bw()+
  45. theme(axis.text=element_text(size=12),
  46. legend.text=element_text(size=12),
  47. legend.title=element_blank(),
  48. axis.title=element_text(size=14),
  49. legend.position = c(0.3, 0.85))

Moreover, we also analyze the similarity in gene content among MAGs. We
first run similarity_matrix.py which takes the cluster file as an
input and produces the file similarity_matrix.txt. The program
produces a 22-by-22 matrix, where each entry (ij) represents the
proportion of genes in the (i)th MAG that is also contained by the
(j)th MAG. We then visualize the matrix and find the distance among
the genomes in the following chunk.

\<\<\<\<\<\<\< HEAD

  1. # create genome names
  2. twenty_two <- as.character(c(1:22))
  3. genome_numbering <- ifelse(nchar(twenty_two) == 1,
  4. paste(as.character(0),twenty_two,sep=""),
  5. as.character(twenty_two))
  6. genome_names <- paste("Sulfurovum",genome_numbering,sep="_")
  7. # then manipulate the data for the heatmap where each square represents the percentage of genes in genome 1 contained in genome 2
  8. heat_map.data <- data.frame(pangenome)
  9. colnames(heat_map.data) <- genome_names
  10. heat_map.data$genome <- genome_names
  11. # melt the data from a large matrix to pairwise rows
  12. heat_map.data.melt <- melt(heat_map.data, id=c("genome"))
  13. colnames(heat_map.data.melt)<-c("MAG_1", "MAG_2", "Contained")
  14. # cluster the data based on the genome content overlap
  15. a <- hclust(as.dist(1-heat_map.data[, -23]))
  16. a$labels <- genome_numbering
  17. library(factoextra)
  18. dend_plot <- fviz_dend(a)
  19. dend_plot # this gets the dendrogram

  1. order <- a$order
  2. # order the MAGs based on the clustering
  3. heat_map.data.melt$MAG_1 <- factor(heat_map.data.melt$MAG_1, levels=ifelse(order < 10, paste0("Sulfurovum_0", order), paste0("Sulfurovum_", order)))
  4. heat_map.data.melt$MAG_2 <- factor(heat_map.data.melt$MAG_2, levels=ifelse(order < 10, paste0("Sulfurovum_0", order), paste0("Sulfurovum_", order)))
  5. # Fig 1B
  6. # plot the heatmap
  7. ggplot(data = heat_map.data.melt, aes(x = MAG_1, y = MAG_2)) +
  8. geom_tile(aes(fill = Contained))+
  9. scale_fill_gradientn(colours = c("white", "lightblue", "darkblue"), values = c(0,0.5,1))+
  10. theme(axis.text.x = element_text(angle = 90, hjust = 1))

Because our pangenome is created upon metagenomes, we need to be careful
when declaring a gene to be in the core genome. We simulate the
probability that a gene found in (n) genomes to be actually a core
genome, exlcluding genes found in 22 genomes.

  1. # summary file created based on `anvi-summarize` output
  2. summary_data <- read.table('aux_files/sulfurovum_summary.txt',
  3. sep='\t',header=TRUE,quote="",na.strings = "",fill = TRUE )
  4. # function that takes in a number i and outputs the probability that a gene is a missing gene in the ith genome
  5. gene.chance<-function(i){
  6. data.selected <- summary_data[i,] # look at the ith row of data
  7. mean.complete <- 100*data.selected$number_genes/data.selected$completion # approximate total number of genes
  8. sd.complete <- -(mean.complete-data.selected$number_genes)/(qnorm(data.selected$redundancy/100)) # approximate standard deviation
  9. missing.genes <- rnorm(1,mean.complete,sd.complete) # total number of genes supossedly under normal distribution
  10. return(missing.genes/(10263-data.selected$number_genes)) # probability a gene outside this genome is a missing gene in this genome
  11. }
  12. # run the simulation (pretty long)
  13. collection <- data.frame(MAGs=as.factor(c()),
  14. Probability=as.numeric(c()))
  15. N<-100
  16. for (i in 1:21){
  17. for (j in 1:N){
  18. sampled <- as.matrix(sample(1:22, i, replace=F)) # sample i genomes without replacement
  19. missing <- apply(sampled,1,gene.chance) # row by row apply function above
  20. collection <- rbind(collection,c(22-i,prod(missing))) # ML by calculate the product
  21. }
  22. }
  23. colnames(collection) = c("MAGs","Probability")
  24. collection$MAGs <- as.factor(collection$MAGs)
  25. # Supplement figure for probability
  26. ggplot(collection,mapping=aes(x=MAGs,y=Probability))+
  27. scale_y_log10()+
  28. geom_boxplot()+
  29. theme_bw()+
  30. theme(axis.text=element_text(size=12),
  31. legend.text=element_text(size=12),
  32. legend.title=element_blank(),
  33. axis.title=element_text(size=14),
  34. )+
  35. labs(x="Number of MAGs",
  36. y="Probability of Core")+
  37. geom_hline(yintercept=0.05,color="red")

The distinct functions between high- and low-frequency genes

We then analyzed the gene annotations and studied the annotation
distribution across gene frequency. To get the count table for category
count across number of MAGs, which is in the output file
COG_category.txt, run get_functions.py.

  1. # data and manipulation
  2. COG <- na.omit(read.table('aux_files/Sulfurovum_function_list.txt',
  3. sep='\t',header=TRUE))
  4. COG <- arrange(COG,Category)
  5. Categories <- read.table('aux_files/COG_explanation.txt',sep='\t',header=FALSE)
  6. colnames(COG) <- c('Category',as.character(c(1:22)))
  7. COG_melt <- melt(COG, id=c("Category"))
  8. # some color palette functions
  9. colourCount = length(unique(COG$Category))
  10. getPalette = colorRampPalette(brewer.pal(9, "Set1"))
  11. load("permutation_test.RData") #get the permuation test result
  12. colnames(p.df) <- c(1:22)
  13. p.df$categories <- factor(COG$Category, levels=COG$Category[order])
  14. p.df$explanation <- factor(Categories$V2, levels=Categories$V2[order])
  15. p.df.melt <- melt(p.df, id=c("categories","explanation"))
  16. #p.values significy the frequency such that the observed values are greater than or equal to the simulated values
  17. #get distance matrix between categories
  18. n.categories <- 25
  19. n.genomes <- 22
  20. distance.matrix <- matrix(0,ncol=n.categories,nrow=n.categories)
  21. for(i in 1:n.categories){
  22. for(j in 1:n.categories){
  23. current <- 0
  24. for(k in 1:n.genomes){
  25. current <- current + (p.matrix[i,k]-p.matrix[j,k])^2
  26. }
  27. distance.matrix[i,j]<-current
  28. }
  29. } #calculate the sum of squared distance in p-values between categories
  30. colnames(distance.matrix) <- COG$Category
  31. rownames(distance.matrix) <- COG$Category
  32. p.cluster <- hclust(as.dist(distance.matrix)) #hierarchical clustering
  33. library(factoextra)
  34. p.dendrogram <- fviz_dend(p.cluster,show_labels = TRUE)
  35. p.dendrogram #get the dendrogram

  1. category.vector <- as.vector(COG$Category)
  2. explanation.vector <- as.vector(p.df$explanation) #explanation vector
  3. order <- p.cluster$order
  4. p.df.melt$explanation <- factor(p.df$explanation, levels=explanation.vector[order])
  5. COG_melt$Category <- factor(COG_melt$Category, levels=COG_melt$Category[order])
  6. # Fig 2A that shows the trend of COG categories across gene frequency
  7. ggplot(COG_melt,aes(x=variable, y=value, fill=Category))+
  8. geom_bar( stat="identity", position="fill")+
  9. xlab("Number of MAGs") +
  10. ylab("Gene Category Count") +
  11. scale_fill_manual(values = getPalette(colourCount)) +
  12. theme_bw()+
  13. theme(axis.text=element_text(size=12),
  14. legend.text=element_text(size=12),
  15. legend.title=element_blank(),
  16. axis.title=element_text(size=14),
  17. legend.position = "right")

P-value figure as the following.

  1. # for p_value (Supplement)
  2. ggplot(data = p.df.melt, aes(x = variable, y = categories)) +
  3. geom_tile(data=subset(p.df.melt,value < 0.001),aes(fill = (value*10000)-20))+
  4. geom_tile(data=subset(p.df.melt,value > 0.999),aes(fill = ((value-1)*10000)+20))+
  5. geom_tile(data=subset(p.df.melt,value >= 0.001 & value <= 0.999),aes(fill = (value-0.5)/100))+
  6. scale_fill_gradient2(limits=c(-20, 20),guide = "colourbar",
  7. low = "darkred", mid= "white", high = "darkblue",midpoint=0)+
  8. xlab("Number of MAGs (Frequency)") +
  9. guides(fill=guide_legend(title="P(Observed > Expected)"))+
  10. theme(plot.title = element_text(hjust = -0.4),
  11. plot.margin = rep(grid::unit(0.75,"in"),4))+
  12. theme(axis.text.x = element_text(size=12),
  13. axis.text.y=element_text(size=12),
  14. legend.text=element_text(size=12),
  15. legend.position="none",
  16. axis.title=element_text(size=14),
  17. title=element_text(size=14),
  18. axis.title.y=element_blank())

We then analyzed specific categories

  1. # melting and data manipulation
  2. COG_prop <- data.frame(prop.table(as.matrix(COG[,-1]), margin = 2))
  3. COG_prop$Group <- COG$Category
  4. COG_prop$Explanation <- Categories$V2
  5. COG_prop$Group <- factor(COG_prop$Group, levels=COG_prop$Group[order])
  6. COG_prop$Explanation <- factor(COG_prop$Explanation,
  7. levels=COG_prop$Explanation[order])
  8. COG_prop.melt <- melt(COG_prop,id=c("Group","Explanation"))
  9. COG_prop.melt$variable <- rep(1:22,each=25)
  10. # here we make sure that the color matches the color in Fig 2A
  11. Color_select <- data.frame(Group= levels(COG_prop$Group),
  12. Color= getPalette(colourCount))
  13. concerned.group <- c('J','E','M','P')
  14. concerned.color <- as.vector(Color_select[Color_select$Group %in% concerned.group,]$Color)
  15. # Fig 2B
  16. ggplot(COG_prop.melt[COG_prop.melt$Group %in% concerned.group,],aes(x=variable,y=value))+
  17. geom_bar(stat="identity")+
  18. theme_bw()+
  19. xlab("Number of MAGs")+
  20. ylab("Proportion")+
  21. theme(axis.text=element_text(size=12),
  22. legend.position="none",
  23. axis.title=element_text(size=14),
  24. strip.text = element_text(size=10))+
  25. facet_wrap(~Explanation,ncol=2)+
  26. aes(fill=Group)+
  27. scale_fill_manual(values=concerned.color)

We then calculated the enrichment of a gene group in one region vs. the
other by assuming binomial distribution for each gene group. First, run
parse_vent_function.py to calculate the occurrence of each gene
cluster in either of the vent regions, which then gives an output of
Sulfurovum_vent_count_per_cluster.txt. Then, run
compare_mcr_axial.py 12 8 to find the binomial CDF for gene clusters
found in Axial genomes only for cluster that are found in at least all
but one MAGs of either Axial or MCR, which generates
Sulfurovum_Axial_enriched_compressed.txt. We process the data
below.

  1. enrichment<-na.omit(read.table('aux_files/Sulfurovum_Axial_enriched_compressed.txt',
  2. sep='\t',header=TRUE,quote="",na.strings = "")) # load the data
  3. # the jittery plot in Figure 3
  4. ggplot(data = enrichment, mapping = aes(x=Category, y=Enrichment)) +
  5. geom_boxplot(alpha = 0,na.rm = TRUE) +
  6. geom_jitter(alpha = 0.3, aes(colour = Category),na.rm=TRUE,size=3)+
  7. geom_hline(yintercept=0.05, linetype=2, color = "red", size=1)+
  8. geom_hline(yintercept=0.95, linetype=2, color = "blue", size=1)+
  9. theme_bw()+
  10. ylab("Binomial CDF for Proportion in Axial")+
  11. theme(axis.text=element_text(size=12),
  12. axis.title=element_text(size=14),
  13. legend.position = "none")

  1. # now only look at the P category to see the difference from others
  2. enrichment.P <- subset(enrichment,Category == 'P') # extract this into Table 1 (for Top 15 with lowest CDF score)
  3. enrichment.Non <- subset(enrichment,Category != 'P')
  4. t.test(enrichment.P$Enrichment,enrichment.Non$Enrichment,alternative="less")
  1. ##
  2. ## Welch Two Sample t-test
  3. ##
  4. ## data: enrichment.P$Enrichment and enrichment.Non$Enrichment
  5. ## t = -4.4981, df = 45.083, p-value = 2.388e-05
  6. ## alternative hypothesis: true difference in means is less than 0
  7. ## 95 percent confidence interval:
  8. ## -Inf -0.1070381
  9. ## sample estimates:
  10. ## mean of x mean of y
  11. ## 0.2721944 0.4430042

Signatures of non-neutral evolution in the pangenome

In this section, we first calculated the pN/pS ratios of each gene (not
gene cluster) in the pangenome. To do so, we used
anvi-script-calculated-pn-ps-ratio as described in Methods (we ran
get-variability-profile-scv.py and get-pn-ps.sh). Note that, we did
this step for each sample, so we need to create a summary file for all
samples using summarize_pn_pn_sulfurovum.py (have to be run with all
the processed data), giving us
Sulfurovum_pn_ps_summary.txt.

  1. dat<-read.table('aux_files/Sulfurovum_pn_ps_summary.txt',sep='\t',header=TRUE,na.strings="Inf")
  2. dat_counts<-na.omit(dat)
  3. dat_counts<-dat_counts[is.finite(dat_counts$pN_pS),]
  4. dat_counts$num_genomes <- factor(dat_counts$num_genomes)
  5. dat_counts$type <- factor(ifelse(dat_counts$num_genomes == 22, "N=22", ifelse(dat_counts$num_genomes %in% 1:19,"0<N<20",NA))) #arbitrady core and accessory naming
  6. # also look across categories by doing this:
  7. load("dat_category.RData")
  8. # gives Fig 4A
  9. ggplot(data = dat_counts, mapping = aes(x=num_genomes, y=pN_pS)) +
  10. geom_boxplot(alpha = 0.3,na.rm = TRUE,outlier.size = 1,aes(colour=num_genomes)) +
  11. #geom_jitter(alpha = 0.3, aes(colour = num_genomes),na.rm=TRUE,size=1)+
  12. geom_hline(yintercept=1, linetype=2, color = "rosybrown", size=1)+
  13. xlab("Number of MAGs") +
  14. ylab("pN/pS") +
  15. theme_bw()+
  16. theme(axis.text=element_text(size=12),
  17. axis.title=element_text(size=14),
  18. legend.position = "none")

</N<20”,NA)))>

  1. # we then find the Wilcoxon pairwise p-value for each Number of MAGs, then we also find the mean.
  2. allthe.p <- c()
  3. mean.p <- c()
  4. sd.p <- c()
  5. for(i in 1:22){
  6. dat_counts$type01 <- ifelse(dat_counts$num_genomes==i,'yes','no')
  7. mean.p[i] <- mean(dat_counts[dat_counts$num_genomes==i,]$pN_pS)
  8. sd.p[i] <- sd(dat_counts[dat_counts$num_genomes==i,]$pN_pS)
  9. wiwi <- wilcox.test(pN_pS ~ type01, data = dat_counts)
  10. allthe.p[i] <- wiwi$p.value
  11. }
  12. p_value.pN_pS <- data.frame(Number= as.factor(c(1:22)),P= allthe.p, Mean = mean.p,
  13. Sd = sd.p)
  14. # gives the supplementary figure for the p-value
  15. ggplot(data=p_value.pN_pS,aes(x=Number,y=P))+
  16. geom_point(size=3)+
  17. xlab("Number of MAGs") +
  18. ylab("Wilcoxon P-value") +
  19. scale_y_log10()+
  20. theme_bw()+
  21. theme(axis.text=element_text(size=12),
  22. axis.title=element_text(size=14),
  23. legend.position = "none")

  1. # gives the inset in Figure 4A for mean
  2. ggplot(data=p_value.pN_pS,aes(x=Number,y=Mean))+
  3. geom_point(size=3)+
  4. xlab("Number of MAGs") +
  5. ylab("Mean of pN/pS") +
  6. theme_bw()+
  7. theme(axis.text=element_text(size=12),
  8. axis.title=element_text(size=24),
  9. legend.position = "none")

  1. # gives the supplementary for standard deviation
  2. ggplot(data=p_value.pN_pS,aes(x=Number,y=Sd))+
  3. geom_point(size=3)+
  4. xlab("Number of MAGs") +
  5. ylab("Standard Deviation\nof pN/pS") +
  6. theme_bw()+
  7. theme(axis.text=element_text(size=12),
  8. axis.title=element_text(size=14),
  9. legend.position = "none")

  1. # supplementary for genome name
  2. ggplot(data = dat_counts, mapping = aes(x=gsub("Sulfurovum_","",genome_name),
  3. y=pN_pS)) +
  4. geom_boxplot(alpha = 0.3,na.rm = TRUE,outlier.size = 1,aes(colour=genome_name)) +
  5. geom_hline(yintercept=1, linetype=2, color = "rosybrown", size=1)+
  6. xlab("MAG") +
  7. ylab("pN/pS") +
  8. theme_bw()+
  9. theme(axis.text=element_text(size=12),
  10. axis.title=element_text(size=14),
  11. legend.position = "none")

  1. # gives Fig 4B
  2. ggplot(data = na.omit(dat_counts), mapping = aes(x=type, y=pN_pS)) +
  3. geom_boxplot(alpha = 0.3,na.rm = TRUE,outlier.size = 1,aes(colour=type)) +
  4. geom_hline(yintercept=1, linetype=2, color = "rosybrown", size=1)+
  5. xlab("Number of MAGs") +
  6. ylab("pN/pS") +
  7. theme_bw()+
  8. theme(axis.text=element_text(size=12),
  9. axis.title=element_text(size=14),
  10. legend.position = "none")

  1. # now the supplementary figure for data across categories
  2. ggplot(data = dat_category, mapping = aes(x=category, y=pN_pS)) +
  3. geom_boxplot(alpha = 0.3,na.rm = TRUE, aes(colour = category),outlier.size=1) +
  4. #geom_jitter(alpha = 0.3, aes(colour = category),na.rm=TRUE,size=1)+
  5. geom_hline(yintercept=1, linetype=2, color = "rosybrown", size=1)+
  6. xlab("Category") +
  7. ylab("pN/pS") +
  8. theme_bw()+
  9. theme(axis.text=element_text(size=12),
  10. axis.title=element_text(size=14),
  11. legend.position = "none")

Then, we load the data from the selective sweep study. We obtained this
data from running pre_poisson_test.py which utilizes the SNV profile
files generated from anvi’o script. For each sample, we got a Poisson
CDF score file, which we later collected using collect_poisson.py
which outputs Sulfurovum_poisson_sweep.txt.

  1. # load, merge, and manipulate data
  2. data<-read.table('aux_files/Sulfurovum_poisson_sweep.txt',sep='\t',header=TRUE)
  3. data$numb <- factor(data$numb)
  4. data <- na.omit(data)
  5. data$pv_val_norm <- -log10(data$pv_val)
  6. data <- arrange(data,unique_id)
  7. data$pN_pS <- dat$pN_pS
  8. data$order <- seq(1:length(data[,1]))
  9. data$genome_name <- cluster$genome_name
  10. data$cluster_id <- cluster$gene_cluster_id
  11. data$category <- cluster$COG_CATEGORY
  12. data$func <- cluster$COG_FUNCTION
  13. # first, we still process the pN_pS data for the phosphate related genes
  14. phosphate.clusters <- c("GC_00001177",
  15. "GC_00001253",
  16. "GC_00001293",
  17. "GC_00001150",
  18. "GC_00001267") #the interesting phosphate gene clusters
  19. data.P <- na.omit(data[data$cluster_id %in% phosphate.clusters,])
  20. data.P$unique_id <- as.factor(data.P$unique_id)
  21. ggplot(data.P,mapping = aes(x=unique_id, y=pN_pS, fill=func)) +
  22. geom_bar(stat='identity') +
  23. theme_bw()+
  24. xlab("Gene") +
  25. ylab("pN/pS Ratio") +
  26. theme(axis.text=element_text(size=12),
  27. axis.text.x=element_blank(),
  28. legend.text=element_text(size=12),
  29. legend.title=element_blank(),
  30. axis.title=element_text(size=14),
  31. legend.position = c(0.55,0.8))

Then, we performed the actual selective sweep analyses. The data for
total number of SNVs in each MAG is directly extracted from the SNV
profiled files provided by anvi’o.

  1. # first, we consider the SNV density in each MAG
  2. SNV_file <- read.csv("aux_files/normalized_snv.csv")
  3. SNV_file$ID <- gsub('Sulfurovum_', '', as.vector(SNV_file$genome_id))
  4. # Fig 5A
  5. ggplot(SNV_file,mapping=aes(x=ID,y=norm_SNV,fill=vent_field))+
  6. geom_col()+
  7. xlab("Sulfurovum MAG") +
  8. ylab("Normalized SNV Count \n (#SNVs/kbp)") +
  9. theme_bw()+
  10. theme(axis.text=element_text(size=12),
  11. legend.text=element_text(size=12),
  12. axis.title=element_text(size=14),
  13. legend.title=element_blank(),
  14. legend.position=c(0.15,0.68))

  1. # now analyze the gene-specific sweeps
  2. numb.numb <- nlevels(data$numb)
  3. proportion <- rep(0,numb.numb)
  4. # p-values < 1e-10
  5. for (i in 1:numb.numb){
  6. proportion[i] = sum(data[data$pv_val_norm>10,]$numb==i,na.rm=TRUE)/
  7. sum(cluster$num_genomes_gene_cluster_has_hits==i,na.rm=TRUE)
  8. }
  9. # the plot for proportion of genes with p-values < 1e-10 (Supplementary)
  10. ggplot(mapping=aes(x=1:22,y=proportion))+
  11. geom_point(size=3)+
  12. xlab("Number of MAGs") +
  13. ylab("Proportion of Genes \n with P < 1e-10") +
  14. theme_bw()+
  15. theme(axis.text=element_text(size=12),
  16. axis.title=element_text(size=14),
  17. legend.position="none")

  1. # plot for P-values vs. SNV
  2. data$SNV.p <- ifelse(data$SNV==0,0.5,data$SNV)
  3. data_input <- data[data$numb %in% c(1,4,10,15,18,22),]
  4. data_input$label <- paste("No. of MAGs = ",as.character(data_input$numb), sep="")
  5. data_input$label <- as.factor(data_input$label)
  6. data_input$label <- ordered(data_input$label,levels=c(
  7. "No. of MAGs = 1", "No. of MAGs = 4","No. of MAGs = 10",
  8. "No. of MAGs = 15", "No. of MAGs = 18", "No. of MAGs = 22"
  9. )) # change labeling
  10. # Fig 5B
  11. ggplot(data = data_input, mapping = aes(x=SNV.p, y=pv_val)) +
  12. scale_x_log10(limits=c(0.5,2000),breaks=c(1,10,100,1000))+
  13. scale_y_log10()+
  14. geom_point(alpha = 0.2,na.rm=TRUE)+
  15. geom_hline(yintercept=1e-10, linetype=2, color = "rosybrown", size=1)+
  16. xlab("Number of SNVs") +
  17. ylab("P-value") +
  18. theme_bw()+
  19. theme(axis.text=element_text(size=12),
  20. legend.text=element_text(size=12),
  21. axis.title=element_text(size=14),
  22. strip.text = element_text(size=12),
  23. legend.position="none")+
  24. facet_wrap(~label,ncol=3)

  1. # plot for all possible number of MAGs (Supplementary)
  2. data$label <- paste("No. of MAGs = ",as.character(data$numb), sep="")
  3. data$label <- as.factor(data$label)
  4. data$label <- ordered(data$label,levels=paste("No. of MAGs = ",
  5. as.character(c(1:22)),
  6. sep=""))
  7. ggplot(data = data, mapping = aes(x=SNV.p, y=pv_val)) +
  8. scale_x_log10(limits=c(0.5,2000),breaks=c(1,10,100,1000))+
  9. scale_y_log10()+
  10. geom_point(alpha = 0.2,na.rm=TRUE)+
  11. geom_hline(yintercept=1e-10, linetype=2, color = "rosybrown", size=1)+
  12. xlab("Number of SNVs") +
  13. ylab("P-value") +
  14. theme_bw()+
  15. theme(axis.text.x = element_text(size=12),
  16. axis.text.y = element_text(size=10),
  17. axis.title=element_text(size=14),
  18. strip.text = element_text(size=12),
  19. legend.position="none")+
  20. facet_wrap(~label,ncol=4)

  1. # then plot only for #SNV = 0
  2. data.0 <- data[data$SNV==0,]
  3. # Fig 5C
  4. ggplot(data = data.0, mapping = aes(x=numb, y=pv_val)) +
  5. geom_boxplot(alpha = 0.3,na.rm = TRUE,outlier.size = 3,aes(colour=numb)) +
  6. scale_y_log10()+
  7. #geom_boxplot(alpha = 0,na.rm = TRUE) +
  8. #geom_jitter(alpha = 0.3, aes(colour = numb),na.rm=TRUE,size=0.5)+
  9. geom_hline(yintercept=1e-10, linetype=2, color = "rosybrown", size=1)+
  10. xlab("Number of MAGs") +
  11. ylab("P-value") +
  12. theme_bw()+
  13. theme(axis.text=element_text(size=12),
  14. axis.title=element_text(size=14),
  15. legend.position="none")

Finally, we also studied whether there is a relationship between the
number of MAGs a cluster of some is found in and the gene’s contig
coverage. We obtained coverage data by running get_contig_coverage.py
which utilized the contig databases and, conveniently, an anvi’o script
anvi-export-splits-and-coverages. As a result, we have
All_Sulfurovum_self_to_self_coverage.txt which only includes
self-to-self
coverage.

  1. coverage<-read.table('aux_files/All_Sulfurovum_self_to_self_coverage.txt',sep='\t',header=TRUE)
  2. coverage<-dplyr::arrange(coverage,unique_id)
  3. coverage$number <- as.factor(cluster$num_genomes_gene_cluster_has_hits)
  4. ggplot(data=coverage,mapping=aes(x=number,y=coverage)) +
  5. geom_boxplot(alpha = 0.3,na.rm = TRUE,outlier.size = 3,aes(colour=number)) +
  6. scale_y_log10()+
  7. xlab("Number of MAGs") +
  8. ylab("Average Coverage\nof Contig") +
  9. theme_bw()+
  10. theme(axis.text=element_text(size=12),
  11. axis.title=element_text(size=14),
  12. legend.position="none")