Home

Awesome

Using ggtree to Visualize Data on Tree-Like Structure

If you use this work in published research, please cite:

G Yu<sup>*</sup>. Using ggtree to Visualize Data on Tree-Like Structure. Current Protocols in Bioinformatics, 2020, 69(1):e96, https://doi.org/10.1002/cpbi.96.


This repository contains the source code and data needed to reproduce the figures in the above paper.

BASIC PROTOCOL 1

library(treeio)
library(ggtree)
# Newick string with NHX annotations; the S, D and B tags are the fields
# that the label layers below read from the parsed tree.
treetext <- "(((ADH2:0.1[&&NHX:S=human], ADH1:0.11[&&NHX:S=human]):
0.05 [&&NHX:S=primates:D=Y:B=100],ADHY:
0.1[&&NHX:S=nematode],ADHX:0.12 [&&NHX:S=insect]):
0.1[&&NHX:S=metazoa:D=N],(ADH4:0.09[&&NHX:S=yeast],
ADH3:0.13[&&NHX:S=yeast], ADH2:0.12[&&NHX:S=yeast],
ADH1:0.11[&&NHX:S=yeast]):0.1[&&NHX:S=Fungi])[&&NHX:D=N];"

# Parse the annotated tree straight from the in-memory string.
tree <- read.nhx(textConnection(treetext))

# Bare tree; annotation layers are stacked on top of it below.
p <- ggtree(tree)

# One layer per NHX field, kept separate so each can be toggled on its own.
s_layer <- geom_label(aes(x = branch, label = S), fill = "lightgreen")
d_layer <- geom_label(aes(label = D), fill = "steelblue")
b_layer <- geom_text(aes(label = B), hjust = -0.5)

# Tip labels first, then S on the branches, D and B at the nodes; the upper
# x limit is widened to 0.28 while the lower limit is left automatic.
p2 <- p +
  geom_tiplab() +
  s_layer +
  d_layer +
  b_layer +
  xlim(NA, 0.28)

BASIC PROTOCOL 2

# Expression table read from disk; columns are the items being clustered.
expr <- read.table("mskcc.txt")

# Correlation distance (1 - Pearson r) between columns, then Ward clustering.
d <- as.dist(1 - cor(expr, method = "pearson"))
hc <- hclust(d, method = "ward.D")

# Cut the hierarchy into two clusters and list the member labels of each.
clus <- cutree(hc, k = 2)
g <- split(names(clus), clus)

# Draw the hclust object as a dendrogram with rotated tip labels.
p <- ggtree(hc, linetype = "dashed", size = 1) +
  layout_dendrogram() +
  theme_dendrogram() +
  geom_tiplab(angle = 90, hjust = 1, offset = -0.05, show.legend = FALSE)

# Attach the cluster membership as `Cluster` and colour the tree by it.
p <- groupOTU(p, g, group_name = "Cluster") +
  aes(color = Cluster) +
  scale_color_manual(breaks = c(1, 2), values = c("#2874C5", "#EABF00"))

# Survival annotations for the clustered samples.
surv <- read.table("mskcc_surv.txt")

# NOTE(review): rows of `surv` are paired with names(clus) purely by position;
# confirm that both input files list the samples in the same order.
surv.df <- data.frame(
  label = names(clus),
  OS = surv$OS_STATUS,
  PFS = surv$DFS_STATUS
)

# Status -> colour lookups for the two tip-point layers.
os_cols <- c("LIVING" = "#dcddde", "DECEASED" = "#700353")
pfs_cols <- c("DiseaseFree" = "#dcddde", "Recurred/Progressed" = "#700353")

# Two concentric tip points: a large one filled by OS and, after starting a
# fresh fill scale, a smaller one filled by PFS.
g2 <- p %<+% surv.df +
  geom_tippoint(aes(fill = OS), shape = 21, color = "black", size = 6) +
  scale_fill_manual(values = os_cols, na.value = "white") +
  ggnewscale::new_scale_fill() +
  geom_tippoint(aes(fill = PFS), shape = 21, color = "black", size = 3) +
  scale_fill_manual(values = pfs_cols) +
  theme(
    plot.margin = margin(6, 6, 30, 6),
    legend.position = c(0.5, 0.6)
  )

# Status -> colour lookup for the heatmap. The values are *named* so that each
# status keeps its colour regardless of the order of `breaks` or of how the
# installed ggplot2 version pairs unnamed values with levels. The colours are
# the same ones used for the OS/PFS tip points in `g2` (living / disease-free
# in light grey, deceased / recurred in dark purple).
survival_cols <- c(
  "LIVING" = "#dcddde",
  "DECEASED" = "#700353",
  "DiseaseFree" = "#dcddde",
  "Recurred/Progressed" = "#700353"
)

# Heatmap of both survival columns next to the dendrogram. Adding the manual
# fill scale after gheatmap() replaces the scale already present, which is
# what produces the "Scale for 'fill' is already present" message.
g3 <- gheatmap(p, surv[, c("OS_STATUS", "DFS_STATUS")], width = .2,
               colnames_position = "top", offset = .3, hjust = 0) +
  scale_fill_manual(breaks = names(survival_cols),
                    values = survival_cols,
                    na.value = "white", name = "Survival")

## Scale for 'fill' is already present. Adding another scale for 'fill', which
## will replace the existing scale.

BASIC PROTOCOL 3

library(ape)
library(ggplot2)
library(tidytree)
library(treeio)
library(ggtree)

# Example alignment loaded via data().
data(woodmouse)

# Tree estimator re-applied to every bootstrap replicate: pairwise DNA
# distances followed by neighbour joining.
estimate_tree <- function(aln) {
  nj(dist.dna(aln))
}

# Reference tree built from the full alignment.
d <- dist.dna(woodmouse)
tr <- nj(d)

# Bootstrap the reference tree by re-estimating it on resampled alignments.
bp <- boot.phylo(phy = tr, x = woodmouse, FUN = estimate_tree)

## Running bootstraps:       100 / 100
## Calculating bootstrap values... done.

# Pair each internal node number (numbered after the tips) with its bootstrap
# value, then join the table onto the tree by node.
bp2 <- tibble(node = seq_len(Nnode(tr)) + Ntip(tr), bootstrap = bp)
tree <- full_join(tr, bp2, by = "node")

# Fill colour per bootstrap interval. The vector is *named* by interval so the
# pairing does not depend on the order of `breaks` or on the ggplot2 version.
# NOTE(review): this keeps the level-order pairing of the original unnamed
# vector (low = white, high = black); confirm against the published figure.
bp_fill <- c("[0,70]" = "white", "(70,90]" = "grey", "(90,100]" = "black")

g4 <- ggtree(tree, size = 1.5) +
  geom_tiplab(size = 4.5, fontface = "bold") + xlim(0, 0.02) +
  # include.lowest = TRUE keeps a bootstrap value of exactly 0 in the lowest
  # interval instead of turning it into NA; that interval is then labelled
  # "[0,70]" rather than "(0,70]".
  geom_nodepoint(aes(fill = cut(bootstrap, c(0, 70, 90, 100),
                                include.lowest = TRUE)),
                 shape = 21, size = 4) +
  theme_tree(legend.position = c(0.8, 0.2)) +
  # `breaks` only sets the legend order (highest support first); `labels`
  # supplies one plotmath expression per break, in the same order.
  scale_fill_manual(values = bp_fill, guide = "legend",
                    name = "Bootstrap Percentage(BP)",
                    breaks = c("(90,100]", "(70,90]", "[0,70]"),
                    labels = expression(BP>=90, 70 <= BP * " < 90", BP < 70))

BASIC PROTOCOL 4

library(ggimage)
library(ggtree)

# Input tree and per-taxon metadata.
tree <- read.tree("tree_boots.nwk")
info <- read.csv("taxa_info.csv")

# Body mass keyed by the tip label used in the Newick file.
mass <- setNames(info$mass_in_kg, info$Newick_label)

# Ancestral-state estimates for the continuous trait.
fit <- phytools::fastAnc(tree, mass, vars = TRUE, CI = TRUE)

# Observed trait values, indexed by the node number of each labelled taxon.
td <- data.frame(
  node = nodeid(tree, names(mass)),
  trait = mass
)

# Estimated trait values; names(fit$ace) hold the node numbers as strings.
nd <- data.frame(
  node = as.numeric(names(fit$ace)),
  trait = fit$ace
)

# One table holding both the observed and the estimated rows.
d <- rbind(td, nd)

library(treeio)
# Attach the per-node trait table to the tree.
tree2 <- full_join(tree, d, by = "node")

# Colour ramp for the continuous trait.
mass_palette <- c("red", "orange", "green", "cyan", "blue")

# Branches coloured by `trait` with continuous = TRUE; the x axis is widened
# to 4 and tip labels are nudged with hjust.
pg <- ggtree(tree2, aes(color = trait), continuous = TRUE, size = 3) +
  scale_color_gradientn(colours = mass_palette, name = "mass (kg)") +
  geom_tiplab(hjust = -0.2) +
  xlim(0, 4)

# Discrete trait keyed by the tip label used in the Newick file.
trophic_habit <- setNames(info$trophic_habit, info$Newick_label)

# One colour per observed state, named by state. brewer.pal() returns at least
# three colours (with a warning) when asked for fewer, so request max(3, n)
# and keep only the first n; otherwise the names assignment would leave an NA
# name when there are fewer than three states.
states <- sort(unique(trophic_habit))
cols <- RColorBrewer::brewer.pal(max(3L, length(states)),
                                 "Set1")[seq_along(states)]
names(cols) <- states

# Ancestral-state reconstruction for the discrete trait (equal-rates model).
fitER <- ape::ace(trophic_habit, tree, model = "ER", type = "discrete")

# One row per internal node, one column per state; the number of state columns
# is recorded before the `node` column is appended.
ancstats <- as.data.frame(fitER$lik.anc)
n_states <- ncol(ancstats)
ancstats$node <- seq_len(Nnode(tree)) + Ntip(tree)

# A pie per internal node built from all state columns (not a hard-coded
# 1:3), recoloured with the same palette as the tip points.
pies <- nodepie(ancstats, cols = seq_len(n_states))
pies <- lapply(pies, function(g) g + scale_fill_manual(values = cols))

# Add the observed state at the tips and inset the pies on the tree.
pg <- pg %<+% info +
  geom_tippoint(aes(fill = trophic_habit), shape = 21, size = 10,
                color = "white") +
  scale_fill_manual(values = cols) +
  geom_inset(pies, width = .2, height = .2)

# Look up a phylopic uid for each entry in the first column of `info`.
id <- phylopic_uid(info[, 1])

# Tip-label layer that draws the phylopic image referenced by `uid`.
phylopic_layer <- geom_tiplab(
  aes(image = uid),
  geom = "phylopic",
  offset = 0.75,
  size = 0.1
)

# Attach the uid table to the plot, then add the silhouettes.
gg <- (pg %<+% id) + phylopic_layer

BASIC PROTOCOL 5

library(ape)
library(Biostrings)

# Example protein alignment shipped with the ggmsa package.
fasfile <- system.file("extdata/sample.fasta", package = "ggmsa")
x <- readAAStringSet(fasfile)

# Hamming distances divided by the width of the first sequence.
aln_width <- width(x)[1]
hamming <- stringDist(x, method = "hamming")
d <- as.dist(hamming / aln_width)

# BIONJ tree built from the distance matrix.
tree <- bionj(d)

library(ggtree)
library(ggmsa)

# Tree with tip labels.
p <- ggtree(tree, size = 1) + geom_tiplab()

# Alignment positions 160-220 in tidy form.
data <- tidy_msa(fasfile, start = 160, end = 220)

# Facet panel that draws the alignment slice next to the tree.
msa_panel <- geom_facet(
  geom = geom_msa,
  data = data,
  panel = "Multiple Sequence Alignment",
  color = "Chemistry_AA"
)

# Combine tree and alignment; xlim_tree() sets the x limit of the tree panel.
p2 <- p + msa_panel + xlim_tree(3)

# Relative panel widths for the two facets.
pp <- facet_widths(p2, widths = c(0.2, 1))