Link to notebook
Link to github repo.
library(tidyverse)
library(readxl)
library(phyloseq)
library(Biostrings)
#library(phangorn)
library(readr)
library(seqinr)
#library(decontam)
library(ape)
library(vegan)
#library(philr)
library(RColorBrewer)
library(microbiome)
#library(DESeq2)
library(compositions);
#library(cowplot)
library(plotly)
library(htmlwidgets)
library(withr)
library(lubridate)
# Load the sample metadata (sample ID, date, station, habitat and
# water-quality columns); column types are guessed by readr
metadata <- readr::read_csv(file = "sample_data.csv")
[36m──[39m [1m[1mColumn specification[1m[22m [36m─────────────────────────────────────────────────────────────────────────────────────────────────────[39m
cols(
SampleID = [31mcol_character()[39m,
`Year.Trawl#` = [31mcol_character()[39m,
Datecode = [32mcol_double()[39m,
Date = [31mcol_character()[39m,
Month = [32mcol_double()[39m,
Year = [32mcol_double()[39m,
Bayside = [31mcol_character()[39m,
Station = [31mcol_character()[39m,
Habitat = [31mcol_character()[39m,
DO = [32mcol_double()[39m,
Salinity = [32mcol_double()[39m,
Temperature = [32mcol_double()[39m
)
Import the count table and the taxonomy file. I slightly modified otutable.csv in Excel (saved as otutable_mod.csv) to remove the quotes around the sequence names and to put an NA placeholder as the first column name (the cell above the row names).
# Import the ASV count table. read_table2() parses whitespace-separated
# columns; no rows are skipped here. The unnamed first column is auto-named X1.
count_table <- read_table2(file = "results/otutable_mod.csv")
Missing column names filled in: 'X1' [1]
[36m──[39m [1m[1mColumn specification[1m[22m [36m─────────────────────────────────────────────────────────────────────────────────────────────────────[39m
cols(
.default = col_double(),
X1 = [31mcol_character()[39m
)
[36mℹ[39m Use [38;5;235m[48;5;253m[38;5;235m[48;5;253m`spec()`[48;5;253m[38;5;235m[49m[39m for the full column specifications.
# Give the auto-named first column (X1) a meaningful name
names(count_table)[1] <- "SampleID"
# Import the BLAST-derived taxonomic assignment of each ASV
taxonomy <- read_csv("results/tax_sequences_blast_taxonomy.csv")
Missing column names filled in: 'X1' [1]Duplicated column names deduplicated: 'RefSeq_Tax_ID' => 'RefSeq_Tax_ID_1' [18]
[36m──[39m [1m[1mColumn specification[1m[22m [36m─────────────────────────────────────────────────────────────────────────────────────────────────────[39m
cols(
X1 = [32mcol_double()[39m,
ASV_ID = [31mcol_character()[39m,
ref_seq_ID = [31mcol_character()[39m,
PID = [32mcol_double()[39m,
alnmt_len = [32mcol_double()[39m,
mismatch = [32mcol_double()[39m,
eval = [32mcol_double()[39m,
bscore = [32mcol_double()[39m,
RefSeq_Tax_ID = [32mcol_double()[39m,
Ref_Seq_title = [31mcol_character()[39m,
superkingdom = [31mcol_character()[39m,
phylum = [31mcol_character()[39m,
class = [31mcol_character()[39m,
order = [31mcol_character()[39m,
family = [31mcol_character()[39m,
genus = [31mcol_character()[39m,
species = [31mcol_character()[39m,
RefSeq_Tax_ID_1 = [32mcol_double()[39m
)
# Remove the first column, which only holds sequential row numbers
taxonomy <- taxonomy[, -1]
# Filter out sequences with low percent identity (recommended by Sara)
taxonomy <- filter(taxonomy, PID > 92)
# Remove the BLAST metadata and retain only ASV_ID plus the taxonomic ranks
# (necessary for further processing below). Columns are dropped by name so the
# step does not depend on their position in the file; all_of() fails loudly if
# an expected column is missing.
drop.cols <- c(
  "ref_seq_ID", "PID", "alnmt_len", "mismatch", "eval", "bscore",
  "RefSeq_Tax_ID", "Ref_Seq_title", "RefSeq_Tax_ID_1"
)
taxonomy <- select(taxonomy, -all_of(drop.cols))
# Import the common names curated by Sara (7th sheet of the workbook) and join
# them to the taxonomy by ASV_ID
commonnames <- read_excel("Trawls MASTER 2020 _mod_ES.xlsx", sheet = 7)
commonnames
taxonomy <- left_join(taxonomy, commonnames, by = "ASV_ID")
taxonomy
NA
Filtering removed seqs 110 and 332 (Gobiosoma ginsburgi and Belone belone). Note for Sara: should we consider setting this cutoff at 97%, which is more robust and still leaves 334 unique ASVs (rather than 379 with the 92% cutoff used above)?
Preview datasets
# Print each imported table for a quick visual check
# ASV counts per sample
count_table
# ASV taxonomy with common names
taxonomy
# sample metadata
metadata
I want to use the phyloseq package for some plotting/ statistics, which first requires making phyloseq objects out of each of input data tables-
# Build the three phyloseq components (counts, taxonomy, sample metadata).

# Count matrix: every column except the first (the character SampleID column)
# is an ASV count. Dropping that column by position -1 instead of selecting
# 2:392 keeps this correct if the number of ASVs changes.
count_table_matrix <- as.matrix(count_table[, -1])
rownames(count_table_matrix) <- count_table$SampleID # add back in Sample IDs as row names
ASV <- otu_table(count_table_matrix, taxa_are_rows = FALSE)

# Taxonomy matrix: select the rank columns by name rather than by position
tax_ranks <- c(
  "superkingdom", "phylum", "class", "order",
  "family", "genus", "species", "CommonName"
)
taxonomy_matrix <- as.matrix(taxonomy[, tax_ranks])
rownames(taxonomy_matrix) <- taxonomy$ASV_ID
TAX <- tax_table(taxonomy_matrix)

# Select only the metadata rows that have an eDNA sample ID
metadata_edna <- metadata %>% filter(!is.na(SampleID))
META <- sample_data(data.frame(metadata_edna, row.names = metadata_edna$`SampleID`))
First check that the inputs are in compatible formats by checking for ASV names with the phyloseq function, taxa_names
head(taxa_names(TAX))
[1] "Seq_1" "Seq_2" "Seq_3" "Seq_4" "Seq_5" "Seq_6"
head(taxa_names(ASV))
[1] "Seq_1" "Seq_2" "Seq_3" "Seq_4" "Seq_5" "Seq_6"
And check sample names were also detected
# Sample names in ASV are formatted as sample ID, underscore, fastq ID
# (e.g. "_S12"). Strip the fastq suffix so they match the metadata sample names.
sample_names(ASV) <- str_replace_all(
  sample_names(ASV),
  pattern = "_S[:digit:]+",
  replacement = ""
)
head(sample_names(ASV))
[1] "T1PosCon" "T1S10" "T1S11" "T1S1" "T1S2" "T1S3"
head(sample_names(META))
[1] "T1PosCon" "T1S1" "T1S2" "T1S3" "T1S5" "T1S6"
And make the phyloseq object
# Combine counts, taxonomy and sample metadata into one phyloseq object
ps <- phyloseq(ASV, TAX, META)
# Rarefaction curve per sample, evaluated every 50 reads
rarecurve(x = otu_table(ps), step = 50, cex = 0.5)
empty rows removed
# Redraw the rarefaction curves into an EPS file; the device stays open until
# the dev.off() call that follows
setEPS()
postscript(file = "Figures/rarefaction.eps")
rarecurve(x = otu_table(ps), step = 50, cex = 0.5)
empty rows removed
dev.off()
quartz_off_screen
2
Most samples look like they were sampled to completion. Be wary of T3S11, T1S2, and maybe T4S5.
Check some features of the phyloseq object
rank_names(ps)
[1] "superkingdom" "phylum" "class" "order" "family" "genus" "species" "CommonName"
unique(tax_table(ps)[, "superkingdom"])
Taxonomy Table: [2 taxa by 1 taxonomic ranks]:
superkingdom
Seq_1 "Eukaryota"
Seq_377 NA
unique(tax_table(ps)[, "phylum"])
Taxonomy Table: [3 taxa by 1 taxonomic ranks]:
phylum
Seq_1 "Chordata"
Seq_368 "Arthropoda"
Seq_377 NA
unique(tax_table(ps)[, "class"])
Taxonomy Table: [5 taxa by 1 taxonomic ranks]:
class
Seq_1 "Actinopteri"
Seq_63 "Mammalia"
Seq_362 "Chondrichthyes"
Seq_368 "Insecta"
Seq_377 NA
There are some ASVs with NA as their superkingdom, phylum, or class annotation; delete these.
# Keep only ASVs that are annotated at superkingdom, phylum and class level
ps <- subset_taxa(ps, !(is.na(superkingdom) | is.na(phylum) | is.na(class)))
unique(tax_table(ps)[, "superkingdom"])
Taxonomy Table: [1 taxa by 1 taxonomic ranks]:
superkingdom
Seq_1 "Eukaryota"
unique(tax_table(ps)[, "phylum"])
Taxonomy Table: [2 taxa by 1 taxonomic ranks]:
phylum
Seq_1 "Chordata"
Seq_368 "Arthropoda"
unique(tax_table(ps)[, "class"])
Taxonomy Table: [4 taxa by 1 taxonomic ranks]:
class
Seq_1 "Actinopteri"
Seq_63 "Mammalia"
Seq_362 "Chondrichthyes"
Seq_368 "Insecta"
nrow(tax_table(ps)) # number of ASVs left
[1] 378
378 ASVs still remain…
Also check class Mammalia, to see if they are contamination or real:
tax_table(subset_taxa(ps, class == 'Mammalia'))
Taxonomy Table: [8 taxa by 8 taxonomic ranks]:
superkingdom phylum class order family genus species CommonName
Seq_63 "Eukaryota" "Chordata" "Mammalia" "Primates" "Hominidae" "Homo" "Homo sapiens" "Human"
Seq_88 "Eukaryota" "Chordata" "Mammalia" "Artiodactyla" "Suidae" "Sus" "Sus scrofa" "Wild boar"
Seq_157 "Eukaryota" "Chordata" "Mammalia" "Primates" "Hominidae" "Homo" "Homo sapiens" "Human"
Seq_343 "Eukaryota" "Chordata" "Mammalia" "Carnivora" "Felidae" "Felis" "Felis catus" "Cat"
Seq_369 "Eukaryota" "Chordata" "Mammalia" "Artiodactyla" "Bovidae" "Bos" "Bos taurus" "Cattle"
Seq_378 "Eukaryota" "Chordata" "Mammalia" "Primates" "Hominidae" "Homo" "Homo sapiens" "Human"
Seq_383 "Eukaryota" "Chordata" "Mammalia" "Primates" "Hominidae" "Homo" "Homo sapiens" "Human"
Seq_389 "Eukaryota" "Chordata" "Mammalia" "Primates" "Hominidae" "Homo" "Homo sapiens" "Human"
These are human, wild boar, cat (ahem…cat lady), and cattle. All are contamination so delete all Mammalia
# Discard every mammalian ASV (treated as contamination)
ps <- subset_taxa(ps, class != "Mammalia")
unique(tax_table(ps)[, "class"])
Taxonomy Table: [3 taxa by 1 taxonomic ranks]:
class
Seq_1 "Actinopteri"
Seq_362 "Chondrichthyes"
Seq_368 "Insecta"
Next check the “Insecta” entries
tax_table(subset_taxa(ps, class == 'Insecta'))
Taxonomy Table: [2 taxa by 8 taxonomic ranks]:
superkingdom phylum class order family genus species CommonName
Seq_368 "Eukaryota" "Arthropoda" "Insecta" "Hymenoptera" "Formicidae" "Linepithema" "Linepithema humile" "Ant"
Seq_380 "Eukaryota" "Arthropoda" "Insecta" "Hymenoptera" "Formicidae" "Linepithema" "Linepithema humile" "Ant"
The only Insecta is Linepithema humile, which is an ant, so delete these too.
# Discard the insect (ant) ASVs as well
ps <- subset_taxa(ps, class != "Insecta")
unique(tax_table(ps)[, "class"])
Taxonomy Table: [2 taxa by 1 taxonomic ranks]:
class
Seq_1 "Actinopteri"
Seq_362 "Chondrichthyes"
Check overall how many ASVs there are per sample
# Agglomerate the ASVs at the superkingdom level using the phyloseq function
# tax_glom, so the bars show the summed reads per sample
superkingdomGlommed <- tax_glom(ps, "superkingdom")
# Build the bar plot once, display it, and save that same object
seqdepth_barplot <- plot_bar(superkingdomGlommed, x = "Sample")
seqdepth_barplot
ggsave(filename = "Figures/seqdepth.eps", plot = seqdepth_barplot, units = "in", width = 9, height = 6, dpi = 300)
Total sequences reveals certain samples had very low sequencing effort: T1S7, T1S8, T3S11, and, not as bad, T1S2 and T4S5
The rarefaction analysis also showed T1S2 and T4S5 samples were likely not sequenced to completion. Therefore remove these 5 samples from analysis
# Remove the five samples flagged above for low sequencing effort
ps <- subset_samples(ps, !(SampleID %in% c("T1S7", "T1S8", "T3S11", "T1S2", "T4S5")))
ps
phyloseq-class experiment-level object
otu_table() OTU Table: [ 368 taxa and 50 samples ]
sample_data() Sample Data: [ 50 samples by 12 sample variables ]
tax_table() Taxonomy Table: [ 368 taxa by 8 taxonomic ranks ]
50 samples remaining with 368 ASVs
Remove Pos Controls (all hits in positive controls are the same family- I assume this is expected)
# Remove the three positive-control samples
ps <- subset_samples(ps, !(SampleID %in% c("T1PosCon", "T2PosCon", "T3PosCon")))
ps
phyloseq-class experiment-level object
otu_table() OTU Table: [ 368 taxa and 47 samples ]
sample_data() Sample Data: [ 47 samples by 12 sample variables ]
tax_table() Taxonomy Table: [ 368 taxa by 8 taxonomic ranks ]
47 samples remaining with 368 unique ASVs
And lastly, correct some taxonomy: **First* according to Sara, Engraulis encrasicolus (European anchovy) and Engraulis mordax should be Anchoa mitchilli (Bay anchovy):
# Relabel both Engraulis species as Anchoa mitchilli in every cell of the
# taxonomy table.
# NOTE(review): only the species string is rewritten; the genus column of these
# ASVs still reads "Engraulis" afterwards - confirm whether that is intended.
tax_table(ps) <- gsub(pattern = "Engraulis encrasicolus", replacement = "Anchoa mitchilli", x = tax_table(ps))
tax_table(ps) <- gsub(pattern = "Engraulis mordax", replacement = "Anchoa mitchilli", x = tax_table(ps))
Second the Fourhorn sculpin (Myoxocephalus quadricornis) is actually an Arctic species. This ASV has 100% PID and 100% query cover to Myoxocephalus quadricornis & Myoxocephalus scorpius (another Arctic species) and 99.4% PID, 100% query cover to Myoxocephalus aenaeus. This latter one is actually the regional species, so this is more likely to be the identity:
# Relabel the Arctic sculpin as the regional species, in both the scientific
# and the common name
tax_table(ps) <- gsub(pattern = "Myoxocephalus quadricornis", replacement = "Myoxocephalus aenaeus", x = tax_table(ps))
tax_table(ps) <- gsub(pattern = "Fourhorn sculpin", replacement = "Grubby sculpin", x = tax_table(ps))
Third Scomber japonicus, the chub mackerel, is only found in the Indo-Pacific. While this is a commercial product and could be here due to sewage, it is more likely the Scomber colias (Atlantic chub mackerel), which is found regionally (in the open ocean Atlantic). The blast hit to Scomber japonicus has PID of 100% and query cover of 100% while the similarity to Scomber colias 100% query cover/ 99.41% PID.
# Relabel the Indo-Pacific chub mackerel as the Atlantic chub mackerel, in both
# the scientific and the common name
tax_table(ps) <- gsub(pattern = "Scomber japonicus", replacement = "Scomber colias", x = tax_table(ps))
tax_table(ps) <- gsub(pattern = "Chub mackerel", replacement = "Atlantic chub mackerel", x = tax_table(ps))
# Print the object summary to confirm taxa and sample counts are unchanged
ps
phyloseq-class experiment-level object
otu_table() OTU Table: [ 368 taxa and 47 samples ]
sample_data() Sample Data: [ 47 samples by 12 sample variables ]
tax_table() Taxonomy Table: [ 368 taxa by 8 taxonomic ranks ]
47 samples remain with 368 unique ASVs
For plotting, use relative abundances (# of ASV sequences/sum total sequences in sample), calculated easily using microbiome::transform
# Convert counts to per-sample relative abundances
ps_ra <- microbiome::transform(x = ps, transform = "compositional")
Export the relative abundance matrix so Sara can have it:
# Pull the relative-abundance table out of the phyloseq object as a plain matrix
RelAbun_matrix <- as(otu_table(ps_ra), "matrix")
# Convert it to a data frame for export
RelAbun_dataframe <- as.data.frame(RelAbun_matrix)
# Write it to CSV, keeping the row names
write.csv(x = RelAbun_dataframe, file = "results/otutable_relabun.csv", row.names = TRUE)
Then agglomerate the ASVs at the family level using the phyloseq function tax_glom.
# Sum relative abundances within each family
familyGlommed_RA <- tax_glom(ps_ra, taxrank = "family")
# Stacked bar per sample, coloured by family
family_barplot <- plot_bar(familyGlommed_RA, x = "Sample", fill = "family")
family_barplot
NOTES
Agglomerate by species to take a look at the unique species
# Agglomerate at the CommonName rank (the last rank in the taxonomy table)
speciesGlommed_RA <- tax_glom(ps_ra, taxrank = "CommonName")
speciesGlommed_RA
phyloseq-class experiment-level object
otu_table() OTU Table: [ 41 taxa and 47 samples ]
sample_data() Sample Data: [ 47 samples by 12 sample variables ]
tax_table() Taxonomy Table: [ 41 taxa by 8 taxonomic ranks ]
tax_table(speciesGlommed_RA)
Taxonomy Table: [41 taxa by 8 taxonomic ranks]:
superkingdom phylum class order family genus
Seq_1 "Eukaryota" "Chordata" "Actinopteri" "Atheriniformes" "Atherinopsidae" "Menidia"
Seq_2 "Eukaryota" "Chordata" "Actinopteri" "Clupeiformes" "Clupeidae" "Brevoortia"
Seq_3 "Eukaryota" "Chordata" "Actinopteri" "Clupeiformes" "Engraulidae" "Engraulis"
Seq_4 "Eukaryota" "Chordata" "Actinopteri" "Scombriformes" "Pomatomidae" "Pomatomus"
Seq_5 "Eukaryota" "Chordata" "Actinopteri" "Lutjaniformes" "Lutjanidae" "Lutjanus"
Seq_6 "Eukaryota" "Chordata" "Actinopteri" "Pleuronectiformes" "Paralichthyidae" "Paralichthys"
Seq_7 "Eukaryota" "Chordata" "Actinopteri" "Clupeiformes" "Clupeidae" "Alosa"
Seq_9 "Eukaryota" "Chordata" "Actinopteri" "Gobiiformes" "Gobiidae" "Gobiosoma"
Seq_10 "Eukaryota" "Chordata" "Actinopteri" "Pleuronectiformes" "Scophthalmidae" "Scophthalmus"
Seq_11 "Eukaryota" "Chordata" "Actinopteri" "Perciformes" "Serranidae" "Centropristis"
Seq_12 "Eukaryota" "Chordata" "Actinopteri" "Spariformes" "Sparidae" "Stenotomus"
Seq_15 "Eukaryota" "Chordata" "Actinopteri" NA "Sciaenidae" "Leiostomus"
Seq_16 "Eukaryota" "Chordata" "Actinopteri" NA "Sciaenidae" "Menticirrhus"
Seq_17 "Eukaryota" "Chordata" "Actinopteri" "Labriformes" "Labridae" "Tautoga"
Seq_19 "Eukaryota" "Chordata" "Actinopteri" "Perciformes" "Cottidae" "Myoxocephalus"
Seq_20 "Eukaryota" "Chordata" "Actinopteri" "Pleuronectiformes" "Pleuronectidae" "Pseudopleuronectes"
Seq_21 "Eukaryota" "Chordata" "Actinopteri" NA "Moronidae" "Morone"
Seq_22 "Eukaryota" "Chordata" "Actinopteri" "Syngnathiformes" "Syngnathidae" "Syngnathus"
Seq_30 "Eukaryota" "Chordata" "Actinopteri" "Pleuronectiformes" "Paralichthyidae" "Etropus"
Seq_33 "Eukaryota" "Chordata" "Actinopteri" NA "Sciaenidae" "Cynoscion"
Seq_34 "Eukaryota" "Chordata" "Actinopteri" "Labriformes" "Labridae" "Tautogolabrus"
Seq_36 "Eukaryota" "Chordata" "Actinopteri" "Anguilliformes" "Anguillidae" "Anguilla"
Seq_38 "Eukaryota" "Chordata" "Actinopteri" "Scombriformes" "Scombridae" "Thunnus"
Seq_40 "Eukaryota" "Chordata" "Actinopteri" "Perciformes" "Gasterosteidae" "Apeltes"
Seq_44 "Eukaryota" "Chordata" "Actinopteri" "Cyprinodontiformes" "Fundulidae" "Fundulus"
Seq_50 "Eukaryota" "Chordata" "Actinopteri" "Atheriniformes" "Atherinopsidae" "Membras"
Seq_52 "Eukaryota" "Chordata" "Actinopteri" "Gadiformes" "Phycidae" "Urophycis"
Seq_54 "Eukaryota" "Chordata" "Actinopteri" "Scombriformes" "Scombridae" "Scomber"
Seq_57 "Eukaryota" "Chordata" "Actinopteri" "Perciformes" "Triglidae" "Prionotus"
Seq_67 "Eukaryota" "Chordata" "Actinopteri" "Scombriformes" "Scombridae" "Thunnus"
Seq_82 "Eukaryota" "Chordata" "Actinopteri" NA "Sciaenidae" "Bairdiella"
Seq_84 "Eukaryota" "Chordata" "Actinopteri" "Gadiformes" "Gadidae" "Microgadus"
Seq_115 "Eukaryota" "Chordata" "Actinopteri" "Cyprinodontiformes" "Fundulidae" "Fundulus"
Seq_119 "Eukaryota" "Chordata" "Actinopteri" "Gadiformes" "Phycidae" "Urophycis"
Seq_139 "Eukaryota" "Chordata" "Actinopteri" "Batrachoidiformes" "Batrachoididae" "Opsanus"
Seq_141 "Eukaryota" "Chordata" "Actinopteri" "Scombriformes" "Scombridae" "Katsuwonus"
Seq_181 "Eukaryota" "Chordata" "Actinopteri" "Tetraodontiformes" "Tetraodontidae" "Sphoeroides"
Seq_231 "Eukaryota" "Chordata" "Actinopteri" "Gadiformes" "Merlucciidae" "Merluccius"
Seq_359 "Eukaryota" "Chordata" "Actinopteri" "Perciformes" "Triglidae" "Prionotus"
Seq_362 "Eukaryota" "Chordata" "Chondrichthyes" "Myliobatiformes" "Myliobatidae" "Rhinoptera"
Seq_372 "Eukaryota" "Chordata" "Chondrichthyes" "Carcharhiniformes" "Triakidae" "Mustelus"
species CommonName
Seq_1 "Menidia menidia" "Atlantic silverside"
Seq_2 "Brevoortia tyrannus" "Atlantic menhaden"
Seq_3 "Anchoa mitchilli" "Bay anchovy"
Seq_4 "Pomatomus saltatrix" "Bluefish"
Seq_5 "Lutjanus griseus" "Grey snapper"
Seq_6 "Paralichthys dentatus" "Summer flounder"
Seq_7 "Alosa mediocris" "Hickory shad"
Seq_9 "Gobiosoma ginsburgi" "Seaboard goby"
Seq_10 "Scophthalmus aquosus" "Windowpane flounder"
Seq_11 "Centropristis striata" "Black seabass"
Seq_12 "Stenotomus chrysops" "Scup"
Seq_15 "Leiostomus xanthurus" "Spot"
Seq_16 "Menticirrhus saxatilis" "Northern kingfish"
Seq_17 "Tautoga onitis" "Tautog"
Seq_19 "Myoxocephalus aenaeus" "Grubby sculpin"
Seq_20 "Pseudopleuronectes americanus" "Winter flounder"
Seq_21 "Morone saxatilis" "Striped bass"
Seq_22 "Syngnathus fuscus" "Northern pipefish"
Seq_30 "Etropus microstomus" "Smallmouth flounder"
Seq_33 "Cynoscion regalis" "Weakfish"
Seq_34 "Tautogolabrus adspersus" "Cunner"
Seq_36 "Anguilla rostrata" "American eel"
Seq_38 "Thunnus obesus" "Bigeye tuna"
Seq_40 "Apeltes quadracus" "Stickleback"
Seq_44 "Fundulus majalis" "Striped killifish"
Seq_50 "Membras martinica" "Rough silverside"
Seq_52 "Urophycis floridana" "Spotted hake"
Seq_54 "Scomber colias" "Atlantic chub mackerel"
Seq_57 "Prionotus carolinus" "Northern searobin"
Seq_67 "Thunnus thynnus" "Atlantic bluefin tuna"
Seq_82 "Bairdiella chrysoura" "American silver perch"
Seq_84 "Microgadus tomcod" "Atlantic tomcod"
Seq_115 "Fundulus heteroclitus" "Mummichog"
Seq_119 "Urophycis floridana" "Red hake"
Seq_139 "Opsanus tau" "Oyster toadfish"
Seq_141 "Katsuwonus pelamis" "Skipjack tuna"
Seq_181 "Sphoeroides maculatus" "Northern puffer"
Seq_231 "Merluccius bilinearis" "Silver hake"
Seq_359 "Prionotus evolans" "Striped searobin"
Seq_362 "Rhinoptera bonasus" "Cownose ray"
Seq_372 "Mustelus canis" "Dusky smooth-hound shark"
Based on my previous scripts with Cariaco Eukaryotic data
# Convert the agglomerated phyloseq object to a long data frame using
# phyloseq's psmelt
species_df <- psmelt(speciesGlommed_RA)
# Keep only taxon/sample combinations with a non-zero abundance, so absent
# species do not appear as small dots in the plot. Filtering on Abundance alone
# leaves genuine zeros in other columns untouched, whereas
# `species_df[species_df == 0] <- NA` blanked zeros in every column.
species_df <- filter(species_df, Abundance > 0)
Plot by species, scientific name
# Bubble plot of relative abundance per species (scientific name) and station,
# faceted by date code (rows) and bay side (columns).
speciesbubbleplot_eDNA_sciname <- ggplot(species_df, aes(x = Station, y = fct_rev(species), color = Station)) + # fct_rev presents the species in reverse order (top to bottom alphabetically)
  # filled circles with a black outline; fill encodes the station
  geom_point(aes(size = Abundance, fill = Station), color = "black", pch = 21) +
  # single size scale: bubble area proportional to abundance
  # (a preceding scale_size(range = ...) was always replaced by this one)
  scale_size_area(breaks = c(0, .25, .5, .75, 1), max_size = 6) +
  xlab("") +
  ylab("") +
  labs(size = "Relative Abundance") +
  theme_bw() +
  scale_fill_brewer(palette = "Paired") +
  # stations are identified by fill colour, so hide the x axis annotation
  theme(axis.title.x = element_blank(),
        axis.text.x = element_blank(),
        axis.ticks.x = element_blank()) +
  facet_grid(Datecode~Bayside, scales = "free", space = "free", drop = TRUE)
speciesbubbleplot_eDNA_sciname
Plot by species common name
# Bubble plot of relative abundance per species (common name) and station,
# faceted by date code (rows) and bay side (columns).
speciesbubbleplot_eDNA_comname <- ggplot(species_df, aes(x = Station, y = fct_rev(CommonName), color = Station)) + # fct_rev presents the common names in reverse order (top to bottom alphabetically)
  # filled circles with a black outline; fill encodes the station
  geom_point(aes(size = Abundance, fill = Station), color = "black", pch = 21) +
  # single size scale: bubble area proportional to abundance
  # (a preceding scale_size(range = ...) was always replaced by this one)
  scale_size_area(breaks = c(0, .25, .5, .75, 1), max_size = 6) +
  xlab("") +
  ylab("") +
  labs(size = "Relative Abundance") +
  theme_bw() +
  scale_fill_brewer(palette = "Paired") +
  # stations are identified by fill colour, so hide the x axis annotation
  theme(axis.title.x = element_blank(),
        axis.text.x = element_blank(),
        axis.ticks.x = element_blank()) +
  facet_grid(Datecode~Bayside, scales = "free", space = "free", drop = TRUE)
speciesbubbleplot_eDNA_comname
Export figures
# Save both eDNA bubble plots as 7 x 12 inch EPS files
ggsave(
  filename = "Figures/speciesbubbleplot_eDNA_sciname.eps",
  plot = speciesbubbleplot_eDNA_sciname,
  units = "in", width = 7, height = 12, dpi = 300
)
ggsave(
  filename = "Figures/speciesbubbleplot_eDNA_comname.eps",
  plot = speciesbubbleplot_eDNA_comname,
  units = "in", width = 7, height = 12, dpi = 300
)
The above look good but they include two elasmobranchs, the dusky smooth-hound shark and cownose ray. While these are probably real, the MiFISH primers don’t actually target the elasmobranchs, so we can’t trust this assay to fairly represent these non-target species. Filter out and re-make the bubble plots:
# Remove the two elasmobranchs from both the count and the relative-abundance
# objects. Both conditions must be TRUE for a taxon to be kept, which matches
# applying the two filters one after the other.
ps_no_elasmo <- subset_taxa(ps, CommonName != "Cownose ray" & CommonName != "Dusky smooth-hound shark")
ps_ra_no_elasmo <- subset_taxa(ps_ra, CommonName != "Cownose ray" & CommonName != "Dusky smooth-hound shark")
# Agglomerate by common name again and check the result
speciesGlommed_RA_no_elasmo <- tax_glom(ps_ra_no_elasmo, taxrank = "CommonName")
speciesGlommed_RA_no_elasmo
phyloseq-class experiment-level object
otu_table() OTU Table: [ 39 taxa and 47 samples ]
sample_data() Sample Data: [ 47 samples by 12 sample variables ]
tax_table() Taxonomy Table: [ 39 taxa by 8 taxonomic ranks ]
tax_table(speciesGlommed_RA_no_elasmo)
Taxonomy Table: [39 taxa by 8 taxonomic ranks]:
superkingdom phylum class order family genus
Seq_1 "Eukaryota" "Chordata" "Actinopteri" "Atheriniformes" "Atherinopsidae" "Menidia"
Seq_2 "Eukaryota" "Chordata" "Actinopteri" "Clupeiformes" "Clupeidae" "Brevoortia"
Seq_3 "Eukaryota" "Chordata" "Actinopteri" "Clupeiformes" "Engraulidae" "Engraulis"
Seq_4 "Eukaryota" "Chordata" "Actinopteri" "Scombriformes" "Pomatomidae" "Pomatomus"
Seq_5 "Eukaryota" "Chordata" "Actinopteri" "Lutjaniformes" "Lutjanidae" "Lutjanus"
Seq_6 "Eukaryota" "Chordata" "Actinopteri" "Pleuronectiformes" "Paralichthyidae" "Paralichthys"
Seq_7 "Eukaryota" "Chordata" "Actinopteri" "Clupeiformes" "Clupeidae" "Alosa"
Seq_9 "Eukaryota" "Chordata" "Actinopteri" "Gobiiformes" "Gobiidae" "Gobiosoma"
Seq_10 "Eukaryota" "Chordata" "Actinopteri" "Pleuronectiformes" "Scophthalmidae" "Scophthalmus"
Seq_11 "Eukaryota" "Chordata" "Actinopteri" "Perciformes" "Serranidae" "Centropristis"
Seq_12 "Eukaryota" "Chordata" "Actinopteri" "Spariformes" "Sparidae" "Stenotomus"
Seq_15 "Eukaryota" "Chordata" "Actinopteri" NA "Sciaenidae" "Leiostomus"
Seq_16 "Eukaryota" "Chordata" "Actinopteri" NA "Sciaenidae" "Menticirrhus"
Seq_17 "Eukaryota" "Chordata" "Actinopteri" "Labriformes" "Labridae" "Tautoga"
Seq_19 "Eukaryota" "Chordata" "Actinopteri" "Perciformes" "Cottidae" "Myoxocephalus"
Seq_20 "Eukaryota" "Chordata" "Actinopteri" "Pleuronectiformes" "Pleuronectidae" "Pseudopleuronectes"
Seq_21 "Eukaryota" "Chordata" "Actinopteri" NA "Moronidae" "Morone"
Seq_22 "Eukaryota" "Chordata" "Actinopteri" "Syngnathiformes" "Syngnathidae" "Syngnathus"
Seq_30 "Eukaryota" "Chordata" "Actinopteri" "Pleuronectiformes" "Paralichthyidae" "Etropus"
Seq_33 "Eukaryota" "Chordata" "Actinopteri" NA "Sciaenidae" "Cynoscion"
Seq_34 "Eukaryota" "Chordata" "Actinopteri" "Labriformes" "Labridae" "Tautogolabrus"
Seq_36 "Eukaryota" "Chordata" "Actinopteri" "Anguilliformes" "Anguillidae" "Anguilla"
Seq_38 "Eukaryota" "Chordata" "Actinopteri" "Scombriformes" "Scombridae" "Thunnus"
Seq_40 "Eukaryota" "Chordata" "Actinopteri" "Perciformes" "Gasterosteidae" "Apeltes"
Seq_44 "Eukaryota" "Chordata" "Actinopteri" "Cyprinodontiformes" "Fundulidae" "Fundulus"
Seq_50 "Eukaryota" "Chordata" "Actinopteri" "Atheriniformes" "Atherinopsidae" "Membras"
Seq_52 "Eukaryota" "Chordata" "Actinopteri" "Gadiformes" "Phycidae" "Urophycis"
Seq_54 "Eukaryota" "Chordata" "Actinopteri" "Scombriformes" "Scombridae" "Scomber"
Seq_57 "Eukaryota" "Chordata" "Actinopteri" "Perciformes" "Triglidae" "Prionotus"
Seq_67 "Eukaryota" "Chordata" "Actinopteri" "Scombriformes" "Scombridae" "Thunnus"
Seq_82 "Eukaryota" "Chordata" "Actinopteri" NA "Sciaenidae" "Bairdiella"
Seq_84 "Eukaryota" "Chordata" "Actinopteri" "Gadiformes" "Gadidae" "Microgadus"
Seq_115 "Eukaryota" "Chordata" "Actinopteri" "Cyprinodontiformes" "Fundulidae" "Fundulus"
Seq_119 "Eukaryota" "Chordata" "Actinopteri" "Gadiformes" "Phycidae" "Urophycis"
Seq_139 "Eukaryota" "Chordata" "Actinopteri" "Batrachoidiformes" "Batrachoididae" "Opsanus"
Seq_141 "Eukaryota" "Chordata" "Actinopteri" "Scombriformes" "Scombridae" "Katsuwonus"
Seq_181 "Eukaryota" "Chordata" "Actinopteri" "Tetraodontiformes" "Tetraodontidae" "Sphoeroides"
Seq_231 "Eukaryota" "Chordata" "Actinopteri" "Gadiformes" "Merlucciidae" "Merluccius"
Seq_359 "Eukaryota" "Chordata" "Actinopteri" "Perciformes" "Triglidae" "Prionotus"
species CommonName
Seq_1 "Menidia menidia" "Atlantic silverside"
Seq_2 "Brevoortia tyrannus" "Atlantic menhaden"
Seq_3 "Anchoa mitchilli" "Bay anchovy"
Seq_4 "Pomatomus saltatrix" "Bluefish"
Seq_5 "Lutjanus griseus" "Grey snapper"
Seq_6 "Paralichthys dentatus" "Summer flounder"
Seq_7 "Alosa mediocris" "Hickory shad"
Seq_9 "Gobiosoma ginsburgi" "Seaboard goby"
Seq_10 "Scophthalmus aquosus" "Windowpane flounder"
Seq_11 "Centropristis striata" "Black seabass"
Seq_12 "Stenotomus chrysops" "Scup"
Seq_15 "Leiostomus xanthurus" "Spot"
Seq_16 "Menticirrhus saxatilis" "Northern kingfish"
Seq_17 "Tautoga onitis" "Tautog"
Seq_19 "Myoxocephalus aenaeus" "Grubby sculpin"
Seq_20 "Pseudopleuronectes americanus" "Winter flounder"
Seq_21 "Morone saxatilis" "Striped bass"
Seq_22 "Syngnathus fuscus" "Northern pipefish"
Seq_30 "Etropus microstomus" "Smallmouth flounder"
Seq_33 "Cynoscion regalis" "Weakfish"
Seq_34 "Tautogolabrus adspersus" "Cunner"
Seq_36 "Anguilla rostrata" "American eel"
Seq_38 "Thunnus obesus" "Bigeye tuna"
Seq_40 "Apeltes quadracus" "Stickleback"
Seq_44 "Fundulus majalis" "Striped killifish"
Seq_50 "Membras martinica" "Rough silverside"
Seq_52 "Urophycis floridana" "Spotted hake"
Seq_54 "Scomber colias" "Atlantic chub mackerel"
Seq_57 "Prionotus carolinus" "Northern searobin"
Seq_67 "Thunnus thynnus" "Atlantic bluefin tuna"
Seq_82 "Bairdiella chrysoura" "American silver perch"
Seq_84 "Microgadus tomcod" "Atlantic tomcod"
Seq_115 "Fundulus heteroclitus" "Mummichog"
Seq_119 "Urophycis floridana" "Red hake"
Seq_139 "Opsanus tau" "Oyster toadfish"
Seq_141 "Katsuwonus pelamis" "Skipjack tuna"
Seq_181 "Sphoeroides maculatus" "Northern puffer"
Seq_231 "Merluccius bilinearis" "Silver hake"
Seq_359 "Prionotus evolans" "Striped searobin"
Remake bubble plots. First melt for tidyverse format
# Convert the agglomerated phyloseq object to a long data frame using
# phyloseq's psmelt
species_df_no_elasmo <- psmelt(speciesGlommed_RA_no_elasmo)
# Keep only taxon/sample combinations with a non-zero abundance, so absent
# species do not appear as small dots in the plot. Filtering on Abundance alone
# leaves genuine zeros in other columns untouched, whereas
# `species_df_no_elasmo[species_df_no_elasmo == 0] <- NA` blanked zeros in
# every column.
species_df_no_elasmo <- filter(species_df_no_elasmo, Abundance > 0)
Plot by species, scientific name
# Bubble plot of relative abundance per species (scientific name) and station,
# elasmobranchs excluded, faceted by date code (rows) and bay side (columns).
speciesbubbleplot_eDNA_sciname_no_elasmo <- ggplot(species_df_no_elasmo, aes(x = Station, y = fct_rev(species), color = Station)) + # fct_rev presents the species in reverse order (top to bottom alphabetically)
  # filled circles with a black outline; fill encodes the station
  geom_point(aes(size = Abundance, fill = Station), color = "black", pch = 21) +
  # single size scale: bubble area proportional to abundance
  # (a preceding scale_size(range = ...) was always replaced by this one)
  scale_size_area(breaks = c(0, .25, .5, .75, 1), max_size = 6) +
  xlab("") +
  ylab("") +
  labs(size = "Relative Abundance") +
  theme_bw() +
  scale_fill_brewer(palette = "Paired") +
  # stations are identified by fill colour, so hide the x axis annotation
  theme(axis.title.x = element_blank(),
        axis.text.x = element_blank(),
        axis.ticks.x = element_blank()) +
  facet_grid(Datecode~Bayside, scales = "free", space = "free", drop = TRUE)
speciesbubbleplot_eDNA_sciname_no_elasmo
Plot by species common name
# Bubble plot of relative abundance per species (common name) and station,
# elasmobranchs excluded, faceted by date code (rows) and bay side (columns).
speciesbubbleplot_eDNA_comname_no_elasmo <- ggplot(species_df_no_elasmo, aes(x = Station, y = fct_rev(CommonName), color = Station)) + # fct_rev presents the common names in reverse order (top to bottom alphabetically)
  # filled circles with a black outline; fill encodes the station
  geom_point(aes(size = Abundance, fill = Station), color = "black", pch = 21) +
  # single size scale: bubble area proportional to abundance
  # (a preceding scale_size(range = ...) was always replaced by this one)
  scale_size_area(breaks = c(0, .25, .5, .75, 1), max_size = 6) +
  xlab("") +
  ylab("") +
  labs(size = "Relative Abundance") +
  theme_bw() +
  scale_fill_brewer(palette = "Paired") +
  # stations are identified by fill colour, so hide the x axis annotation
  theme(axis.title.x = element_blank(),
        axis.text.x = element_blank(),
        axis.ticks.x = element_blank()) +
  facet_grid(Datecode~Bayside, scales = "free", space = "free", drop = TRUE)
speciesbubbleplot_eDNA_comname_no_elasmo
Export figures
# Save both elasmobranch-free bubble plots as 7 x 12 inch EPS files
ggsave(
  filename = "Figures/speciesbubbleplot_eDNA_sciname_no_elasmo.eps",
  plot = speciesbubbleplot_eDNA_sciname_no_elasmo,
  units = "in", width = 7, height = 12, dpi = 300
)
ggsave(
  filename = "Figures/speciesbubbleplot_eDNA_comname_no_elasmo.eps",
  plot = speciesbubbleplot_eDNA_comname_no_elasmo,
  units = "in", width = 7, height = 12, dpi = 300
)
# import 4th sheet from Excel file which contains morphometric data for each individual collected for every date
There were 40 warnings (use warnings() to see them)
trawl_master <- read_excel("Trawls MASTER 2020 _mod_ES.xlsx", sheet = 4)
# 6th sheet of the same workbook: station info
stations <- read_excel("Trawls MASTER 2020 _mod_ES.xlsx", sheet = 6)
# Shedding factor: an index determined by Sara that indicates how much a
# species sheds when handled (and therefore how likely it is to shed cells in
# the water)
sheddingfactor <- read_excel("Allometric correction_mod.xlsx", sheet = 5)
# Attach station info (by station number) and shedding factor (by common name)
# to every row of trawl_master
trawl_master <- trawl_master %>%
  left_join(stations, by = "STATION_NO") %>%
  left_join(sheddingfactor, by = "COMMONNAME")
trawl_master
Import station/ trawl information
# First sheet of the workbook: station / trawl information
station_data <- read_excel("Trawls MASTER 2020 _mod_ES.xlsx", sheet = 1)
station_data
# Keep only the date code, station number and trawl duration columns
station_data <- select(station_data, DATECODE, STATION_NO, Trawl_Min)
station_data
Combine station information to trawl_master in order to have the duration of each trawl (for calculating CPUE)
# Add Trawl_Min to each trawl_master row, matched on date code and station number
trawl_master <- trawl_master %>%
  left_join(station_data, by = c("DATECODE", "STATION_NO"))
trawl_master
Make a count table from trawl_master, grouping by date and location, summing the counts for every unique species, and summing the total length for each trawl/ species
# Sum the total length (TL_CM) of the individuals in each
# date / station / species group.
# `.groups = "drop_last"` states the summarise() default explicitly: the result
# stays grouped by every variable except SheddingFactor, exactly as before, but
# without the informational message.
# NOTE(review): sum() returns NA for a group if any TL_CM in it is NA - confirm
# whether na.rm = TRUE is wanted.
trawl_counts <- trawl_master %>%
  group_by(DATECODE, STATION_NA, STATION_NO, Trawl_Min, BAYSIDE, CommonName, SheddingFactor) %>%
  summarize(TotalLength = sum(TL_CM), .groups = "drop_last")
# Number of individuals per date / station / species
counts <- tally(
  group_by(trawl_master, DATECODE, STATION_NA, STATION_NO, CommonName),
  name = "count"
)
# Attach the counts to the per-group total lengths
trawl_counts <- trawl_counts %>%
  left_join(counts, by = c("DATECODE", "STATION_NA", "STATION_NO", "CommonName"))
trawl_counts
Calculate CPUE and put in new column. CPUE is the count divided by trawl time (in minutes)
# Catch per unit effort: individuals caught per minute of trawling
trawl_counts <- mutate(trawl_counts, CPUE = count / Trawl_Min)
trawl_counts
Calculate the metric that Sara came up with: sum(total length) * shedding factor. This is a correction of the abundance that takes into account the sums of length of each fish for each date/trawl and multiplies by a factor determined by how much they shed.
# Summed total length weighted by the species' shedding factor
trawl_counts <- mutate(trawl_counts, TLxSF = TotalLength * SheddingFactor)
trawl_counts
Then also divide the TotalLength and SumTL*SF by the trawl time in order to account for effort (similar to CPUE)
# Effort-corrected versions of both length-based metrics (per minute of trawl)
trawl_counts <- mutate(
  trawl_counts,
  TLPUE = TotalLength / Trawl_Min,
  `TLxSF.PUE` = TLxSF / Trawl_Min
)
trawl_counts
Remove 09/16/20 since there is no equivalent eDNA from that date
# Drop the 2020-09-16 trawls (no matching eDNA samples for that date)
trawl_counts <- filter(trawl_counts, DATECODE != "20200916")
# Bubble plot of raw trawl counts: species on the y axis, stations on the x axis,
# faceted by sampling date (rows) and bay side (columns).
# The raw count is mapped to size and the size scale itself is log10-transformed.
# Previously log10(count) was passed to scale_size_area(), which draws a value of
# 0 with zero area, so species caught once (log10(1) = 0) were invisible. The
# earlier scale_size(range = c(1, 15)) was dead code because the second size
# scale replaced it; only one size scale is kept.
speciesbubbleplot_trawl_comname <- ggplot(trawl_counts, aes(x = STATION_NA, y = fct_rev(CommonName))) +
  geom_point(aes(size = count, fill = STATION_NA), color = "black", pch = 21) +
  # breaks are given in count units; the log10 transform is applied by the scale
  scale_size(trans = "log10", range = c(1, 6), breaks = c(1, 2, 5, 10, 25, 100)) +
  xlab("") +
  ylab("") +
  labs(size = "Abundance", fill = "Station") +
  theme_bw() +
  scale_fill_brewer(palette = "Paired") +
  # station identity is shown by fill colour, so the x axis text is suppressed
  theme(axis.title.x = element_blank(),
        axis.text.x = element_blank(),
        axis.ticks.x = element_blank()) +
  facet_grid(DATECODE ~ BAYSIDE, scales = "free", space = "free", drop = TRUE)
Scale for 'size' is already present. Adding another scale for 'size', which will replace the existing scale.
# Display the count bubble plot in the notebook output
speciesbubbleplot_trawl_comname
Export figure
# Write the count bubble plot to an EPS file (6.75 x 13 in).
# NOTE(review): this call writes to "Figures/" while the NMDS ggsave() calls in
# this notebook write to "figures/" -- confirm both directories exist on
# case-sensitive file systems.
ggsave(filename = "Figures/speciesbubbleplot_trawl_abundance_comname.eps", plot = speciesbubbleplot_trawl_comname, units = c("in"), width = 6.75, height = 13, dpi = 300)
# Bubble plot of trawl CPUE: same layout as the count bubble plot, but bubble
# size encodes catch per minute of trawling.
# CPUE is mapped to size directly and the size scale is log10-transformed.
# Previously log10(CPUE) was passed to scale_size_area(); that scale sizes points
# by the absolute value, so CPUE values below 1 (negative log10) were drawn as if
# they were their reciprocal, and CPUE = 1 was drawn with zero area. The earlier
# scale_size(range = c(1, 15)) was dead code because the second size scale
# replaced it; only one size scale is kept.
speciesbubbleplot_trawl_CPUE_comname <- ggplot(trawl_counts, aes(x = STATION_NA, y = fct_rev(CommonName))) +
  geom_point(aes(size = CPUE, fill = STATION_NA), color = "black", pch = 21) +
  # breaks are given in CPUE units; the log10 transform is applied by the scale
  scale_size(trans = "log10", range = c(1, 6), breaks = c(1, 2, 5, 10, 25, 100)) +
  xlab("") +
  ylab("") +
  labs(size = "CPUE", fill = "Station") +
  theme_bw() +
  scale_fill_brewer(palette = "Paired") +
  # station identity is shown by fill colour, so the x axis text is suppressed
  theme(axis.title.x = element_blank(),
        axis.text.x = element_blank(),
        axis.ticks.x = element_blank()) +
  facet_grid(DATECODE ~ BAYSIDE, scales = "free", space = "free", drop = TRUE)
Scale for 'size' is already present. Adding another scale for 'size', which will replace the existing scale.
# Display the CPUE bubble plot in the notebook output
speciesbubbleplot_trawl_CPUE_comname
Looks good! Similar to “counts” figure but some adjustments that normalized for trawling time.
Export figure
# Write the CPUE bubble plot to an EPS file (6.75 x 13 in).
# NOTE(review): this call writes to "Figures/" while the NMDS ggsave() calls in
# this notebook write to "figures/" -- confirm both directories exist on
# case-sensitive file systems.
ggsave(filename = "Figures/speciesbubbleplot_trawl_CPUE_comname.eps", plot = speciesbubbleplot_trawl_CPUE_comname, units = c("in"), width = 6.75, height = 13, dpi = 300)
First, remove the species from the trawls that are not targeted in the eDNA assay (invertebrates and elasmobranchs)
# Import the list of species that are targeted by the MiFish primers
# (second sheet of the workbook)
mifish_spp <- read_excel("Trawl CPUE no elasmobranch_mod.xlsx", sheet = 2)
mifish_spp
# Keep only trawl records whose species is on the MiFish list. An inner join is
# used because the previous right join also added an all-NA row for every listed
# species that never appears in trawl_counts, which is not a filter.
trawl_counts <- inner_join(trawl_counts, mifish_spp, by = "CommonName")
trawl_counts
Then remove from the trawl data the stations whose samples were dropped from the eDNA analysis because of poor sequencing effort.
# Grab the eDNA sample IDs (with date and station) that remained after filtering.
# The S4 sample_data object is first converted to a plain data.frame; passing it
# straight to the deprecated as_data_frame() triggered a class-coercion warning.
sampleIDs <- as(sample_data(ps_no_elasmo), "data.frame") %>%
  as_tibble() %>%
  select(SampleID, Datecode, Station)
Setting class(x) to multiple strings ("tbl_df", "tbl", ...); result will no longer be an S4 object
# Keep only trawl records that have a retained eDNA sample for the same date and
# station; the matching SampleID is added as a column.
# NOTE(review): if a date/station has more than one eDNA SampleID, trawl rows are
# duplicated by this join -- confirm one sample per date/station.
trawl_counts <- trawl_counts %>%
  inner_join(sampleIDs, by = c("DATECODE" = "Datecode", "STATION_NA" = "Station"))
trawl_counts
Make abundance table of each species across whole study
# For every date and species, sum each trawl abundance metric over all stations
trawl_uniques <- summarise(
  group_by(trawl_counts, DATECODE, CommonName),
  Trawl_Count = sum(count, na.rm = TRUE),
  Trawl_CPUE = sum(CPUE, na.rm = TRUE),
  Trawl_TLPUE = sum(TLPUE, na.rm = TRUE),
  Trawl_Allometric_Shedding = sum(TLxSF.PUE, na.rm = TRUE)
)
`summarise()` has grouped output by 'DATECODE'. You can override using the `.groups` argument.
trawl_uniques
# For every date and species, sum the eDNA abundance over all samples
eDNA_uniques <- summarise(
  group_by(species_df_no_elasmo, Datecode, CommonName),
  eDNA_RelAbun = sum(Abundance, na.rm = TRUE)
)
`summarise()` has grouped output by 'Datecode'. You can override using the `.groups` argument.
eDNA_uniques
# Combine both methods into one table, keeping species/dates seen by either one
trawl_eDNA_abun_table <- trawl_uniques %>%
  full_join(eDNA_uniques, by = c("CommonName", "DATECODE" = "Datecode"))
trawl_eDNA_abun_table
Count unique species across all stations, grouped by date, for each method, trawl& eDNA (use filtered trawl data so only comparing MiFISh spp to MiFISh spp).
Count total number of species from each method for each date
# Species richness per date and method = number of species detected on that date.
# The grouping by date is stated explicitly instead of relying on the grouping
# left over from the earlier summarise() calls, and species whose summed
# abundance is zero are not counted as detected.
eDNA_richness <- eDNA_uniques %>%
  filter(eDNA_RelAbun > 0) %>%
  group_by(Datecode) %>%
  tally(name = "eDNA")
trawl_richness <- trawl_uniques %>%
  filter(Trawl_Count > 0) %>%
  group_by(DATECODE) %>%
  tally(name = "trawl")
# One row per date with a richness column per method, then reshape to long format
speciesrichness <- full_join(eDNA_richness, trawl_richness, by = c("Datecode" = "DATECODE"))
speciesrichness <- pivot_longer(speciesrichness, !Datecode, names_to = "Method", values_to = "Richness")
speciesrichness$Datecode <- ymd(speciesrichness$Datecode) # convert to date format (better for plotting)
speciesrichness
Plot side-by-side
# Richness through time, one line per method
species_richness_plot <- ggplot(speciesrichness, aes(x = Datecode, y = Richness)) +
  geom_line(aes(color = Method), size = 3) +
  theme_bw() +
  labs(x = "", y = "Species Richness")
species_richness_plot
# Write the richness plot to an EPS file (4 x 3 in)
ggsave(
  filename = "Figures/species_richness_plot.eps",
  plot = species_richness_plot,
  units = c("in"),
  width = 4,
  height = 3,
  dpi = 300
)
Sum total number of species across all dates/ stations for entire study
# Sum every abundance metric per species over the whole study and reshape to
# long format (one row per species x method).
species_sums_abun_table <- trawl_eDNA_abun_table %>%
  group_by(CommonName) %>%
  summarise(CPUE = sum(Trawl_CPUE, na.rm = TRUE),
            "Total Length (TL) PUE" = sum(Trawl_TLPUE, na.rm = TRUE),
            "TL * Shedding Factor PUE" = sum(Trawl_Allometric_Shedding, na.rm = TRUE),
            eDNA = sum(eDNA_RelAbun, na.rm = TRUE)) %>%
  pivot_longer(!CommonName, names_to = "Method", values_to = "Abundance") %>%
  # Turn zeroes into NA so they don't plot. na_if() is applied to the Abundance
  # column only: calling it on the whole data frame is an error in dplyr >= 1.1.
  mutate(Abundance = na_if(Abundance, 0))
species_sums_abun_table
For each species, plot side-by-side comparison of abundance (summed over whole study) using each method
# Custom colour scale: one colour per species, interpolated from the Spectral
# palette. The number of colours follows the number of species in the table
# (it was hard-coded to 40), and the colours are named by species so the mapping
# does not depend on CommonName being a factor (levels() of a character column
# is NULL, which silently left the colours unnamed).
species_names <- sort(unique(as.character(species_sums_abun_table$CommonName)))
myColors <- colorRampPalette(brewer.pal(11, "Spectral"))(length(species_names))
names(myColors) <- species_names
colScale <- scale_colour_manual(name = "CommonName", values = myColors)
# One panel per method (free x axis); species are ordered on the y axis by their
# summed abundance across methods.
species_abun_sum_plot <- ggplot(species_sums_abun_table, aes(x = Abundance, y = reorder(CommonName, Abundance, function(x){sum(x, na.rm = TRUE)}), color = CommonName)) +
  geom_point(size = 5) +
  facet_wrap(~factor(Method, levels = c('CPUE', 'Total Length (TL) PUE', 'TL * Shedding Factor PUE', 'eDNA')), scales = "free_x", ncol = 4) +
  theme_bw() +
  xlab("Abundance") +
  ylab("") +
  colScale +
  theme(legend.position = "none")
species_abun_sum_plot
Export plot
# Write the per-species abundance comparison to an EPS file (10 x 6 in).
# NOTE(review): this call writes to "Figures/" while the NMDS ggsave() calls in
# this notebook write to "figures/" -- confirm both directories exist on
# case-sensitive file systems.
ggsave(filename = "Figures/species_abun_sum_plot.eps", plot = species_abun_sum_plot, units = c("in"), width = 10, height = 6, dpi = 300)
I will try PCoA, PCA (the Euclidean PCoA) and NMDS ordinations in combination with different transformations and distance metrics in order to see which explains the most variance in the dataset.
PCA is essentially a type of PCoA using the Euclidean distance matrix as input. When combined with a log-ratio transformation of the count table, this is deemed appropriate for compositional datasets. It is also recommended as a first step in exploratory analyses of sequencing datasets.
First do a CLR (centered log-ratio) transformation of the absolute abundance data (after filtering), as suggested by Gloor et al. 2017
There were 20 warnings (use warnings() to see them)
Generate the PCA and visualize axes
# Principal Component Analysis (PCA) of the CLR-transformed table, i.e. an eigen
# decomposition of the sample covariance matrix.
lograt_pca <- prcomp(clr_asv_table_ps)
# NOTE- this is equivalent to first making a Euclidean distance matrix using the CLR data table and then running a PCoA. A Euclidean distance matrix of a log-transformed data table = an Aitchison distance matrix. So this is equivalent to the compositional methods listed in Gloor et al.
# Proportion of variance per axis, laid out for the screeplot:
# PCaxis = axis number (as character), PercVar = proportion of total variance
lograt_var_prop <- lograt_pca$sdev^2 / sum(lograt_pca$sdev^2)
lograt_variances <- data.frame(
  PCaxis = as.character(seq_along(lograt_var_prop)),
  PercVar = lograt_var_prop
)
head(lograt_variances)
# Screeplot: proportion of variance carried by each PC axis
ggplot(lograt_variances, aes(x = as.numeric(PCaxis), y = PercVar)) +
  geom_col(fill = "grey", color = "black") +
  labs(x = "PC axis", y = "% Variance", title = "Log-Ratio PCA Screeplot, CLR Tranformation") +
  theme_minimal() +
  theme(
    axis.title = element_text(color = "black", face = "bold", size = 10),
    axis.text.y = element_text(color = "black", face = "bold"),
    axis.text.x = element_blank()
  )
Total variance explained by first three axes= 15.8 + 10.7 + 10.1 = 36.6%. Since the second and third axes are similar, plot in 3D with 3 axes.
Visualize the PCA-
# Extract the sample scores from the CLR PCA
pca_lograt_frame <- data.frame(lograt_pca$x) %>%
  rownames_to_column(var = "SampleID")
# Merge metadata into the PCA score table
pca_lograt_frame <- left_join(pca_lograt_frame, metadata, by = "SampleID")
head(pca_lograt_frame)
# Percent variance per axis (rounded to 4 places, x100) for the axis titles
eigenvalues <- round(lograt_variances[, 2], digits = 4) * 100
# Plotly - 3-D
# The scores are mapped as x = PC2, y = PC3, z = PC1 so that they agree with the
# axis titles below (and with the PCoA plots in this notebook). Previously PC1,
# PC2 and PC3 were plotted on axes titled Co 2, Co 3 and Co 1 respectively.
pca_lograt <- plot_ly(pca_lograt_frame, type = 'scatter3d', mode = 'markers',
                      x = ~PC2, y = ~PC3, z = ~PC1, colors = ~brewer.pal(11, 'Paired'),
                      color = ~Station, symbols = c('circle', 'diamond'), symbol = ~Bayside) %>%
  layout(font = list(size = 12),
         title = 'CLR-Euclidean PCA',
         scene = list(xaxis = list(title = paste0('Co 2 ', eigenvalues[2], '%'),
                                   showticklabels = FALSE, zerolinecolor = 'black'),
                      yaxis = list(title = paste0('Co 3 ', eigenvalues[3], '%'),
                                   showticklabels = FALSE, zerolinecolor = 'black'),
                      zaxis = list(title = paste0('Co 1 ', eigenvalues[1], '%'),
                                   showticklabels = FALSE, zerolinecolor = 'black')))
# pca_lograt
# save in "Embedded_figures" directory so that it can be hosted at Github and embedded in this notebook
withr::with_dir('Embedded_figures', htmlwidgets::saveWidget(as_widget(pca_lograt), file = "pca_lograt_eDNA.html", selfcontained = FALSE))
Summary The CLR-Euclidean PCA reveals there is some separation according to East vs West. The first 3 PCs only explain ~36% of the variance so keep going with different ordinations to see if there is a better representation
The more traditional approach to ordinations is to do a PCoA on a distance matrix such as Bray-Curtis, Jaccard, or Unifrac. When combined with a transformation, they become more appropriate for NGS data. One such common transformation is the Hellinger transformation.
The different distance matrices also tell you different things about the dataset, so I will try several of them to see if I can tease those differences out.
Before calculating any distance matrix, do a transformation of the filtered count table. Hellinger transformation is the square root of the relative abundance, so calculate it based on the ps_ra object:
# Hellinger transformation: square root of each sample's relative abundances
ps_hellinger <- transform_sample_counts(ps_ra_no_elasmo, sqrt)
First, Jaccard, which builds the distance matrix based on presence/absence between samples. It does not take into account relative abundance of the taxa. Therefore this functions well for determining differences driven by rare taxa, which are weighed the same as abundant taxa.
# Jaccard distance on presence/absence. binary = TRUE is required for that:
# without it vegdist() computes the quantitative Jaccard index, which is a
# monotone transform of Bray-Curtis (2B / (1 + B)) and therefore does not weigh
# rare and abundant taxa equally as intended here.
jac_dmat <- vegdist(otu_table(ps_hellinger), method = "jaccard", binary = TRUE)
pcoa_jac <- ape::pcoa(jac_dmat) # perform PCoA
# Relative eigenvalues per axis, laid out for the screeplot:
# PCaxis = axis number (as character), PercVar = proportion of total variance
jac_rel_eig <- pcoa_jac$values$Relative_eig
jac_variances <- data.frame(
  PCaxis = as.character(seq_along(jac_rel_eig)),
  PercVar = jac_rel_eig
)
head(jac_variances)
# Screeplot: proportion of variance carried by each Jaccard PCoA axis
ggplot(jac_variances, aes(x = as.numeric(PCaxis), y = PercVar)) +
  geom_col(fill = "grey", color = "black") +
  labs(x = "PC axis", y = "% Variance", title = "Jaccard PCoA Screeplot") +
  theme_minimal() +
  theme(
    axis.title = element_text(color = "black", face = "bold", size = 10),
    axis.text.y = element_text(color = "black", face = "bold"),
    axis.text.x = element_blank()
  )
The first two axes (19.0 + 9.7 = 28.7) are OK. But plot the first 3 axes since the 2nd and 3rd explain a similar amount of variance, (19.0 + 9.7 + 8.4 = 37.1% total variance explained)
Plot in 3D with Plotly
# Sample coordinates from the Jaccard PCoA, with metadata merged in by SampleID
pcoa_jac_df <- left_join(
  rownames_to_column(data.frame(pcoa_jac$vectors), var = "SampleID"),
  metadata,
  by = "SampleID"
)
head(pcoa_jac_df)
# Percent variance per axis (rounded to 4 places, x100) for the axis titles
eigenvalues <- 100 * round(jac_variances[, 2], digits = 4)
# Axis description for ordination axis i: title with its percent variance,
# tick labels hidden, black zero line
jac_axis <- function(i) {
  list(
    title = paste0('Co ', i, ' ', eigenvalues[i], '%'),
    showticklabels = FALSE,
    zerolinecolor = 'black'
  )
}
# Plotly - 3-D (x = axis 2, y = axis 3, z = axis 1)
pcoa_jaccard <- plot_ly(
  pcoa_jac_df,
  type = 'scatter3d', mode = 'markers',
  x = ~Axis.2, y = ~Axis.3, z = ~Axis.1,
  colors = ~brewer.pal(11, 'Paired'),
  color = ~Station,
  symbols = c('circle', 'diamond'), symbol = ~Bayside
) %>%
  layout(
    font = list(size = 12),
    title = 'PCoA Jaccard Distance',
    scene = list(xaxis = jac_axis(2), yaxis = jac_axis(3), zaxis = jac_axis(1))
  )
# pcoa_jaccard
# save figure in "Embedded_figures" directory so that it can be hosted at Github and embedded in this notebook
withr::with_dir(
  'Embedded_figures',
  htmlwidgets::saveWidget(as_widget(pcoa_jaccard), file = "pcoa_jaccard_eDNA.html", selfcontained = FALSE)
)
The Jaccard-PCoA shows some separation along axis 2 and axis 3 in East vs West differences. Very similar % variance explained to the PCA.
Next, try a Bray-Curtis distance matrix with PCoA, which builds the distance matrix based on presence/absence between samples and relative abundance differences. This ordination will represent well the differences in samples that are driven by taxa with high relative abundances.
NOTE: I need to use a correction here for negative eigenvalues. Read more here
bray_dmat <- vegdist(otu_table(ps_hellinger), method = "bray") # Bray-Curtis dist metric
# PCoA with the Cailliez correction for negative eigenvalues. ape::pcoa() on this
# matrix gave negative eigenvalues, so vegan's wcmdscale() is used instead; the
# uncorrected ape::pcoa() call was dropped because its result was overwritten
# immediately.
pcoa_bray <- wcmdscale(bray_dmat, eig = TRUE, add = "cailliez")
# check out summary of PCoA
ev <- summary(eigenvals(pcoa_bray))
ev
Importance of components:
[,1] [,2] [,3] [,4] [,5] [,6] [,7] [,8] [,9] [,10] [,11]
Eigenvalue 6.3479 3.3005 2.85957 1.62805 1.33439 1.24855 1.00938 0.90346 0.87311 0.77992 0.71218
Proportion Explained 0.2112 0.1098 0.09512 0.05416 0.04439 0.04153 0.03358 0.03005 0.02904 0.02594 0.02369
Cumulative Proportion 0.2112 0.3210 0.41608 0.47023 0.51462 0.55615 0.58973 0.61978 0.64883 0.67477 0.69846
[,12] [,13] [,14] [,15] [,16] [,17] [,18] [,19] [,20] [,21] [,22]
Eigenvalue 0.65613 0.60610 0.54826 0.4989 0.44174 0.40567 0.39186 0.3667 0.34891 0.33706 0.33146
Proportion Explained 0.02183 0.02016 0.01824 0.0166 0.01469 0.01349 0.01304 0.0122 0.01161 0.01121 0.01103
Cumulative Proportion 0.72029 0.74045 0.75869 0.7753 0.78998 0.80347 0.81651 0.8287 0.84031 0.85152 0.86255
[,23] [,24] [,25] [,26] [,27] [,28] [,29] [,30] [,31] [,32]
Eigenvalue 0.30199 0.284949 0.268734 0.255852 0.247955 0.239199 0.225418 0.217687 0.198673 0.194406
Proportion Explained 0.01005 0.009479 0.008939 0.008511 0.008248 0.007957 0.007498 0.007241 0.006609 0.006467
Cumulative Proportion 0.87259 0.882071 0.891010 0.899521 0.907769 0.915726 0.923224 0.930465 0.937074 0.943541
[,33] [,34] [,35] [,36] [,37] [,38] [,39] [,40] [,41] [,42]
Eigenvalue 0.186256 0.166417 0.156276 0.152618 0.150887 0.139347 0.133210 0.12775 0.12414 0.110383
Proportion Explained 0.006196 0.005536 0.005198 0.005077 0.005019 0.004635 0.004431 0.00425 0.00413 0.003672
Cumulative Proportion 0.949737 0.955273 0.960471 0.965548 0.970567 0.975202 0.979634 0.98388 0.98801 0.991685
[,43] [,44] [,45]
Eigenvalue 0.106401 0.085699 0.057876
Proportion Explained 0.003539 0.002851 0.001925
Cumulative Proportion 0.995224 0.998075 1.000000
# Proportion of variance per PCoA axis (eigenvalue / sum of eigenvalues, rounded
# to 3 places), computed in one vectorised step instead of growing a vector in a
# 1:length() loop. PCaxis = axis number (as character), PercVar = proportion.
bray_eig <- as.numeric(eigenvals(pcoa_bray))
bray_variances <- data.frame(
  PCaxis = as.character(seq_along(bray_eig)),
  PercVar = round(bray_eig / sum(bray_eig), 3)
)
head(bray_variances)
# Screeplot: proportion of variance carried by each Bray-Curtis PCoA axis
ggplot(bray_variances, aes(x = as.numeric(PCaxis), y = PercVar)) +
  geom_col(fill = "grey", color = "black") +
  labs(x = "PC axis", y = "% Variance", title = "Bray-Curtis PCoA Screeplot") +
  theme_minimal() +
  theme(
    axis.title = element_text(color = "black", face = "bold", size = 10),
    axis.text.y = element_text(color = "black", face = "bold"),
    axis.text.x = element_blank()
  )
The first two axes (21.1 + 11.0) are pretty good again but I am still going to experiment in the plot with the 3rd axis since it is similar to the second (9.5%; total variance explained = 41.6%)
Plot in 3D with Plotly
# Extract the sample coordinates from the Bray-Curtis PCoA
pcoa_bray_df <- data.frame(pcoa_bray$points) %>%
  rownames_to_column(var = "SampleID")
# Merge metadata into the pcoa data table
pcoa_bray_df <- left_join(pcoa_bray_df, metadata, by = "SampleID")
head(pcoa_bray_df)
# Percent variance per axis (rounded to 4 places, x100) for the axis titles
eigenvalues <- round(bray_variances[, 2], digits = 4) * 100
# Plotly - 3-D (x = axis 2, y = axis 3, z = axis 1).
# The widget is stored as pcoa_bray_plot so that it no longer overwrites the
# ordination object pcoa_bray, which is still needed for envfit later on.
pcoa_bray_plot <- plot_ly(pcoa_bray_df, type = 'scatter3d', mode = 'markers',
                          x = ~Dim2, y = ~Dim3, z = ~Dim1, colors = ~brewer.pal(11, 'Paired'),
                          color = ~Station, symbols = c('circle', 'diamond'), symbol = ~Bayside) %>%
  layout(font = list(size = 12),
         title = 'PCoA Bray-Curtis Distance',
         scene = list(xaxis = list(title = paste0('Co 2 ', eigenvalues[2], '%'),
                                   showticklabels = FALSE, zerolinecolor = 'black'),
                      yaxis = list(title = paste0('Co 3 ', eigenvalues[3], '%'),
                                   showticklabels = FALSE, zerolinecolor = 'black'),
                      zaxis = list(title = paste0('Co 1 ', eigenvalues[1], '%'),
                                   showticklabels = FALSE, zerolinecolor = 'black')))
# pcoa_bray_plot
# save in "Embedded_figures" directory so that it can be hosted at Github and embedded in this notebook
withr::with_dir('Embedded_figures', htmlwidgets::saveWidget(as_widget(pcoa_bray_plot), file = "pcoa_bray_eDNA.html", selfcontained = FALSE))
These results along axes 1, 2, and 3 are similar to Jaccard, but there is more separation along axis 2, indicating that incorporating the differences in abundance helps explain more variance in the dataset. Total variance explained is highest so far.
Lastly, try a non-metric multidimensional scaling (NMDS) ordination. PCA/PCoA are metric and attempt to rotate axes to fit the distance matrix distribution. An NMDS represents the data in 2 axes, by constraining the distribution of the points. Similar to above, this can be combined with different pre-treatment of the data.
First try the compositional approach, an NMDS on CLR-transformed data using the Euclidean distances (aka Aitchison distance)
# Aitchison distance = Euclidean distance between CLR-transformed samples
euc_dmat <- dist(clr_asv_table_ps, method = "euclidean")
# Two-dimensional NMDS on that distance matrix, without vegan's automatic
# data transformation
euc_nmds <- metaMDS(euc_dmat, k = 2, autotransform = FALSE)
Run 0 stress 0.2095936
Run 1 stress 0.230812
Run 2 stress 0.2367264
Run 3 stress 0.2096473
... Procrustes: rmse 0.00967777 max resid 0.04766081
Run 4 stress 0.2099595
... Procrustes: rmse 0.01924625 max resid 0.05406619
Run 5 stress 0.2365027
Run 6 stress 0.2204494
Run 7 stress 0.2096616
... Procrustes: rmse 0.02491728 max resid 0.09196782
Run 8 stress 0.2359077
Run 9 stress 0.2112015
Run 10 stress 0.2094389
... New best solution
... Procrustes: rmse 0.01324573 max resid 0.05622952
Run 11 stress 0.2152036
Run 12 stress 0.2307547
Run 13 stress 0.2143453
Run 14 stress 0.2108001
Run 15 stress 0.222541
Run 16 stress 0.2138032
Run 17 stress 0.2216532
Run 18 stress 0.2228354
Run 19 stress 0.211776
Run 20 stress 0.212668
*** No convergence -- monoMDS stopping criteria:
1: no. of iterations >= maxit
19: stress ratio > sratmax
euc_nmds$stress # Check the stress. Less than 0.1 is good. Less than 0.05 is better. This will be different each time, since it is iteratively finding a unique solution each time (although they should look similar)
[1] 0.2094389
# NMDS sample coordinates with metadata merged in by SampleID
euc_nmds_df <- left_join(
  rownames_to_column(data.frame(euc_nmds$points), var = "SampleID"),
  metadata,
  by = "SampleID"
)
head(euc_nmds_df)
## Plotting Aitchison (CLR-Euclidean) distance NMDS
nmds_aitch <- ggplot(euc_nmds_df, aes(x = MDS1, y = MDS2, color = Station, shape = Bayside)) +
  geom_point(size = 4) +
  scale_color_brewer(palette = "Paired") +
  theme_bw() +
  xlab("NMDS 1") +
  ylab("NMDS 2") +
  ggtitle(paste0('Aitchison Distance NMDS, Stress = ', round(euc_nmds$stress, 2))) +
  coord_fixed()
nmds_aitch
ggsave("figures/nmds_aitch_eDNA.eps", nmds_aitch, width = 7, height = 5, units = c("in"))
The above has a relatively high stress (>0.2) so should be interpreted with caution. But it does show some separation East vs West along NMDS 1.
Next try a Jaccard NMDS, which will represent differences in presence/absence among samples, emphasizing both abundant and rare taxa the same
# Two-dimensional NMDS on the Jaccard distance matrix (jac_dmat) computed earlier
jac_nmds <- metaMDS(jac_dmat, k=2, autotransform=FALSE) # Run the ordination. Distance matrix was already calculated above
Run 0 stress 0.1625677
Run 1 stress 0.1849289
Run 2 stress 0.1740042
Run 3 stress 0.1907044
Run 4 stress 0.1511937
... New best solution
... Procrustes: rmse 0.08884194 max resid 0.3232323
Run 5 stress 0.1495056
... New best solution
... Procrustes: rmse 0.05343512 max resid 0.3128283
Run 6 stress 0.1744721
Run 7 stress 0.1779444
Run 8 stress 0.1496721
... Procrustes: rmse 0.05094231 max resid 0.3280469
Run 9 stress 0.1699861
Run 10 stress 0.1971628
Run 11 stress 0.1496709
... Procrustes: rmse 0.05096378 max resid 0.3286255
Run 12 stress 0.1496706
... Procrustes: rmse 0.05095018 max resid 0.3284939
Run 13 stress 0.1662577
Run 14 stress 0.1573862
Run 15 stress 0.1496463
... Procrustes: rmse 0.01251391 max resid 0.07479854
Run 16 stress 0.1739596
Run 17 stress 0.1496708
... Procrustes: rmse 0.05095859 max resid 0.3285551
Run 18 stress 0.1508378
Run 19 stress 0.1744734
Run 20 stress 0.1496306
... Procrustes: rmse 0.01232568 max resid 0.07476111
*** No convergence -- monoMDS stopping criteria:
20: stress ratio > sratmax
jac_nmds$stress # Check the stress. Less than 0.1 is good. Less than 0.05 is better. This will be different each time, since it is iteratively finding a unique solution each time (although they should look similar)
[1] 0.1495056
# NMDS sample coordinates with metadata merged in by SampleID
jac_nmds_df <- left_join(
  rownames_to_column(data.frame(jac_nmds$points), var = "SampleID"),
  metadata,
  by = "SampleID"
)
head(jac_nmds_df)
## Plotting Jaccard distance NMDS
nmds_jaccard <- ggplot(jac_nmds_df, aes(x = MDS1, y = MDS2, color = Station, shape = Bayside)) +
  geom_point(size = 4) +
  scale_color_brewer(palette = "Paired") +
  theme_bw() +
  xlab("NMDS 1") +
  ylab("NMDS 2") +
  ggtitle(paste0('Jaccard Distance NMDS, Stress = ', round(jac_nmds$stress, 2))) +
  coord_fixed()
nmds_jaccard
ggsave("figures/nmds_jaccard_eDNA.eps", nmds_jaccard, width = 7, height = 5, units = c("in"))
This is still a moderately high stress (>0.1) so should be interpreted with caution. Similar to Aitchison-distance nMDS but there is a little more separation of East vs West on NMDS 2 axis.
Next try a Bray-Curtis NMDS, which will represent differences in presence/absence among samples and relative abundance, thus emphasizing impacts of highly abundant taxa.
# Two-dimensional NMDS on the Bray-Curtis distance matrix (bray_dmat) computed earlier
bray_nmds <- metaMDS(bray_dmat, k=2, autotransform=FALSE) # Run the ordination. Distance matrix was already calculated above
Run 0 stress 0.1628464
Run 1 stress 0.1671314
Run 2 stress 0.1625677
... New best solution
... Procrustes: rmse 0.02477208 max resid 0.1434767
Run 3 stress 0.1512
... New best solution
... Procrustes: rmse 0.08883111 max resid 0.3233087
Run 4 stress 0.1889244
Run 5 stress 0.1701812
Run 6 stress 0.1573212
Run 7 stress 0.1573862
Run 8 stress 0.1945793
Run 9 stress 0.1540051
Run 10 stress 0.1568754
Run 11 stress 0.1856705
Run 12 stress 0.1832908
Run 13 stress 0.1496461
... New best solution
... Procrustes: rmse 0.05647177 max resid 0.3117366
Run 14 stress 0.1496461
... Procrustes: rmse 0.0001220942 max resid 0.000651973
... Similar to previous best
Run 15 stress 0.1573375
Run 16 stress 0.1660512
Run 17 stress 0.1498107
... Procrustes: rmse 0.05115396 max resid 0.3295766
Run 18 stress 0.1787654
Run 19 stress 0.149506
... New best solution
... Procrustes: rmse 0.01269612 max resid 0.07602275
Run 20 stress 0.1511933
*** No convergence -- monoMDS stopping criteria:
20: stress ratio > sratmax
bray_nmds$stress # Check the stress. Less than 0.1 is good. Less than 0.05 is better. This will be different each time, since it is iteratively finding a unique solution each time (although they should look similar)
[1] 0.149506
# NMDS sample coordinates with metadata merged in by SampleID
bray_nmds_df <- left_join(
  rownames_to_column(data.frame(bray_nmds$points), var = "SampleID"),
  metadata,
  by = "SampleID"
)
head(bray_nmds_df)
## Plotting Bray-Curtis distance NMDS
nmds_bray <- ggplot(bray_nmds_df, aes(x = MDS1, y = MDS2, color = Station, shape = Bayside)) +
  geom_point(size = 4) +
  scale_color_brewer(palette = "Paired") +
  theme_bw() +
  xlab("NMDS 1") +
  ylab("NMDS 2") +
  ggtitle(paste0('Bray-Curtis Distance NMDS, Stress = ', round(bray_nmds$stress, 2))) +
  coord_fixed()
nmds_bray
ggsave("figures/nmds_bray_eDNA.eps", nmds_bray, width = 7, height = 5, units = c("in"))
Very similar to Jaccard results. Moderately high stress (0.15)
The ordination that explained the most variance in the eDNA dataset was the PCoA using the Bray-Curtis dissimilarity matrix after Hellinger transformation. This is similar to the approach presented in Lacoursière‐Roussel et al. 2018. Use this representation going forward.
Recreate, in 2D, the first two axes of the ordination (PCoA with Bray distance matrx/ Hellinger transformation) and use envfit
from vegan to test and fit environmental variables.
If not making 3D plots, can do this directly in phyloseq (example). But phyloseq doesn’t allow for the Cailliez correction of PCoA, so instead use the example from G. Simpson to fit envfit on top of the output from wcmdscale (PCoA in vegan).
Prepare the ordination variables
# PCoA on the Bray-Curtis distance matrix, with the Cailliez correction for negative eigenvalues
pcoa_bray <- wcmdscale(bray_dmat, eig = TRUE, add = "cailliez")
# Build the metadata table handed to envfit:
#  - keep only the samples still present in ps_hellinger (i.e. not removed during QC)
#  - drop the repetitive date variables (Year.Trawl#, Date, Month, Year)
#  - sort the rows in the same order as the rows of otu_table(ps_hellinger)
#  - rename "Datecode" to "Date" (better for plotting). This is done by name instead of by
#    column position, so a change in the metadata column order cannot silently rename the
#    wrong variable. dplyr:: is spelled out because other attached packages also export rename().
metadata_ordinations <- metadata[metadata$SampleID %in% sample_data(ps_hellinger)$SampleID, ] %>%
  select(-"Year.Trawl#", -Date, -Month, -Year) %>%
  arrange(factor(SampleID, levels = rownames(otu_table(ps_hellinger)))) %>%
  dplyr::rename(Date = Datecode)
# Fit all environmental factors at once and save the stats output
pcoa_bray_envfit <- envfit(pcoa_bray, metadata_ordinations, permutations = 1000)
capture.output(pcoa_bray_envfit, file = "stats_results/pcoa_bray_envfit_eDNA.txt")
# Significant variables include Datecode (p = 0.023976), DO (p = 0.000999), Bayside (p = 0.004995), and Station (p = 0.041958)
# Make each of the interesting variables their own ordination variables for plotting (exclude Station. This will be a color variable anyway and it's not interesting)
pcoa_bray_envfit_date <- envfit(pcoa_bray ~ Date, metadata_ordinations, permutations = 1000)
pcoa_bray_envfit_DO <- envfit(pcoa_bray ~ DO, metadata_ordinations, permutations = 1000)
pcoa_bray_envfit_Bayside <- envfit(pcoa_bray ~ Bayside, metadata_ordinations, permutations = 1000)
Plot in 2D
# Turn every character column of the metadata into a factor, then list the Station levels
metadata_ordinations <- mutate_if(metadata_ordinations, is.character, as.factor)
levels(metadata_ordinations$Station)
[1] "CORMORANT POINT" "DUNE ROAD" "EAST MID BAY" "INLET" "LITTLE POND"
[6] "PINE NECK" "PONQUOGUE BRIDGE" "SHINNECOCK HILLS" "SOUTH GRASS" "WEST MID BAY"
[11] "WEST TIANA"
# Define plot parameters
colvec <- c(brewer.pal(11, 'Paired')) # colors of stations
shapevec <- c(19, 18) # shapes indicating Bayside

# Draw the eDNA PCoA figure (site scores, Date and DO envfit vectors, Bayside hulls, legends)
# on whatever graphics device is currently active. The same function is used for the
# on-screen figure and for the EPS export so the two cannot drift apart.
# par() is restored on exit, so the widened right margin does not accumulate when this is
# re-run on the same device.
draw_pcoa_bray_envfit_eDNA <- function() {
  # leave space to add legend. xpd = TRUE allows legend to be outside of the plot
  old_par <- par(xpd = TRUE, mar = par("mar") + c(0, 0, 0, 8))
  on.exit(par(old_par), add = TRUE)
  # Add the site scores
  with(metadata_ordinations, plot(scores(pcoa_bray, display = "sites"), col = colvec[Station], pch = shapevec[Bayside], cex = 2, xlab = "Co1 21.1%", ylab = "Co2 11.0%"))
  # Add the date vector
  plot(pcoa_bray_envfit_date, p.max = 0.1, lwd = 2, col = "black")
  # Add the DO vector
  plot(pcoa_bray_envfit_DO, p.max = 0.1, lwd = 2, col = "black")
  # Add the hulls indicating Bayside
  with(metadata_ordinations, ordihull(pcoa_bray, Bayside, lwd = 2, lty = c(3, 5), label = FALSE))
  # Add legends (the per-station pch values and the hull legend were set manually)
  with(metadata_ordinations, legend(0.77, 0.6, legend = levels(Station), col = colvec, pch = c(19,18,19,19,19,18,19,19,19,18,18), bty = "n", pt.cex = 2, cex = .8))
  legend(0.77, 0.8, c("EAST", "WEST"), col = c("black"), lty = c(3, 5), lwd = 2, bty = "n", cex = .8)
  invisible(NULL)
}

# Show the figure
draw_pcoa_bray_envfit_eDNA()
# Export using base R/ vegan helpers
setEPS()
postscript("Figures/pcoa_bray_envfit_eDNA.eps", width = 7, height = 5)
draw_pcoa_bray_envfit_eDNA()
dev.off()
quartz_off_screen
2
Does CPUE data need to be transformed before ordinations?
Conclusion?
Make tables of CPUE, TLPUE, and TLxSF.PUE from the trawl data, in the style of OTU tables (samples in rows/ species in columns). Trawl_counts has already been filtered so that it only includes those samples and species that are relevant to the eDNA study
# Wide (samples x species) tables for each per-unit-effort measure in trawl_counts.
# as_tibble() replaces the deprecated as_data_frame().
# Catch per unit effort
CPUE_table <- trawl_counts %>%
  as_tibble() %>%
  select(SampleID, CommonName, CPUE) %>%
  pivot_wider(names_from = CommonName, values_from = CPUE)
CPUE_table
# Total length per unit effort
TLPUE_table <- trawl_counts %>%
  as_tibble() %>%
  select(SampleID, CommonName, TLPUE) %>%
  pivot_wider(names_from = CommonName, values_from = TLPUE)
TLPUE_table
# Total length x shedding factor per unit effort
TLSF.PUE_table <- trawl_counts %>%
  as_tibble() %>%
  select(SampleID, CommonName, TLxSF.PUE) %>%
  pivot_wider(names_from = CommonName, values_from = TLxSF.PUE)
TLSF.PUE_table
Transformation by log transformation (log(x+1) in order to account for zeroes)
# Replace every NA in the three wide tables with zero
CPUE_table[is.na(CPUE_table)] <- 0
TLPUE_table[is.na(TLPUE_table)] <- 0
TLSF.PUE_table[is.na(TLSF.PUE_table)] <- 0
# log10(x + 1) on every column except the first one (SampleID), one table at a time
CPUE_table_transform <- CPUE_table
CPUE_table_transform[-1] <- lapply(CPUE_table[-1], function(x) log10(x + 1))
TLPUE_table_transform <- TLPUE_table
TLPUE_table_transform[-1] <- lapply(TLPUE_table[-1], function(x) log10(x + 1))
TLSF.PUE_table_transform <- TLSF.PUE_table
TLSF.PUE_table_transform[-1] <- lapply(TLSF.PUE_table[-1], function(x) log10(x + 1))
# Print the transformed tables
CPUE_table_transform
TLPUE_table_transform
TLSF.PUE_table_transform
Center (around mean) and standardize (by SD) the log-transformed CPUE data, similar to the references above.
# Center (mean) and standardize (SD) every column except the first one (SampleID),
# column by column, for each of the three log-transformed tables
CPUE_table_transform_cen_st <- CPUE_table_transform
CPUE_table_transform_cen_st[-1] <- lapply(CPUE_table_transform[-1], function(x) as.vector(scale(x)))
TLPUE_table_transform_cen_st <- TLPUE_table_transform
TLPUE_table_transform_cen_st[-1] <- lapply(TLPUE_table_transform[-1], function(x) as.vector(scale(x)))
TLSF.PUE_table_transform_cen_st <- TLSF.PUE_table_transform
TLSF.PUE_table_transform_cen_st[-1] <- lapply(TLSF.PUE_table_transform[-1], function(x) as.vector(scale(x)))
# Print the standardized tables
CPUE_table_transform_cen_st
TLPUE_table_transform_cen_st
TLSF.PUE_table_transform_cen_st
Generate the PCA and visualize axes
# Plain data frame with SampleID moved into the row names, so only the species columns remain
CPUE_table_transform_cen_st <- data.frame(CPUE_table_transform_cen_st)
rownames(CPUE_table_transform_cen_st) <- c(CPUE_table_transform_cen_st$SampleID)
CPUE_table_transform_cen_st <- CPUE_table_transform_cen_st[, -1]
# PCA on the centered/standardized table
log_transform_pca <- prcomp(CPUE_table_transform_cen_st)
# Proportion of variance per axis (sdev^2 / total), with the axis number as a label column
log_transform_variances <- data.frame(
  PCaxis = as.character(seq_along(log_transform_pca$sdev)),
  PercVar = log_transform_pca$sdev^2 / sum(log_transform_pca$sdev^2)
)
head(log_transform_variances)
# Screeplot: one bar per PC axis
ggplot(log_transform_variances, aes(x = as.numeric(PCaxis), y = PercVar)) +
  geom_col(fill = "grey", color = "black") +
  labs(x = "PC axis", y = "% Variance", title = "Log-Transformed PCA Screeplot, CPUE") +
  theme_minimal() +
  theme(
    axis.title = element_text(color = "black", face = "bold", size = 10),
    axis.text.y = element_text(color = "black", face = "bold"),
    axis.text.x = element_blank()
  )
Total variance explained by first 2 axes = 17.5 + 11.3 = 28.8%. Total variance explained by first three axes= 17.5 + 11.3 + 10.6 = 39.4%.
Visualize the PCA in 3D:
# Extract the sample scores from the PCA
pca_logtransform_frame <- data.frame(log_transform_pca$x) %>%
  rownames_to_column(var = "SampleID")
# Merge metadata into the PCA score table
pca_logtransform_frame <- left_join(pca_logtransform_frame, metadata, by = "SampleID")
head(pca_logtransform_frame)
# Proportion of variance per axis, rounded to 4 places and multiplied by 100. These label the axes of the 3-D plot
eigenvalues <- round(log_transform_variances[, 2], digits = 4) * 100
# Plotly - 3-D
# The axis titles below label x/y/z as component 2/3/1, so the data must be mapped the same
# way (x = PC2, y = PC3, z = PC1). Previously PC1/PC2/PC3 were plotted under those titles, so
# every axis carried the wrong component label and percentage.
pca_log_transform <- plot_ly(pca_logtransform_frame, type = 'scatter3d', mode = 'markers',
                             x = ~PC2, y = ~PC3, z = ~PC1, colors = ~brewer.pal(11, 'Paired'),
                             color = ~Station, symbols = c('circle', 'diamond'), symbol = ~Bayside) %>%
  layout(font = list(size = 12),
         title = 'PCA on Log-transformed CPUE',
         scene = list(xaxis = list(title = paste0('Co 2 ', eigenvalues[2], '%'),
                                   showticklabels = FALSE, zerolinecolor = 'black'),
                      yaxis = list(title = paste0('Co 3 ', eigenvalues[3], '%'),
                                   showticklabels = FALSE, zerolinecolor = 'black'),
                      zaxis = list(title = paste0('Co 1 ', eigenvalues[1], '%'),
                                   showticklabels = FALSE, zerolinecolor = 'black')))
# pca_log_transform
# save in "Embedded_figures" directory so that it can be hosted at Github and embedded in this notebook
withr::with_dir('Embedded_figures', htmlwidgets::saveWidget(as_widget(pca_log_transform), file = "pca_log_transform_CPUE.html", selfcontained = FALSE))
Also plot in 2D -
# 2D view of the CPUE PCA: PC1 vs PC2, Station as color and Bayside as point shape
log_transform_frame_2D <- pca_logtransform_frame %>%
  ggplot(aes(x = PC1, y = PC2, color = Station, shape = Bayside)) +
  theme_bw() +
  coord_fixed(ratio = 1) +
  scale_color_brewer(palette = "Paired") +
  geom_point(size = 4) +
  xlab(paste0('PC1 ', eigenvalues[1], '%')) +
  ylab(paste0('PC2 ', eigenvalues[2], '%')) +
  ggtitle("PCA on Log-transformed CPUE")
log_transform_frame_2D
ggsave("figures/pca_log_transform_2D.eps", plot = log_transform_frame_2D, width = 7, height = 5, units = "in")
Summary: The percent variance explained by PCA is OK. Try a PCoA
Generate the PCA and visualize axes
# Plain data frame with SampleID moved into the row names, so only the species columns remain
TLPUE_table_transform_cen_st <- data.frame(TLPUE_table_transform_cen_st)
rownames(TLPUE_table_transform_cen_st) <- c(TLPUE_table_transform_cen_st$SampleID)
TLPUE_table_transform_cen_st <- TLPUE_table_transform_cen_st[, -1]
# PCA on the centered/standardized table
log_transform_pca <- prcomp(TLPUE_table_transform_cen_st)
# Proportion of variance per axis (sdev^2 / total), with the axis number as a label column
log_transform_variances <- data.frame(
  PCaxis = as.character(seq_along(log_transform_pca$sdev)),
  PercVar = log_transform_pca$sdev^2 / sum(log_transform_pca$sdev^2)
)
head(log_transform_variances)
# Screeplot: one bar per PC axis
ggplot(log_transform_variances, aes(x = as.numeric(PCaxis), y = PercVar)) +
  geom_col(fill = "grey", color = "black") +
  labs(x = "PC axis", y = "% Variance", title = "Log-Transformed PCA Screeplot, TLPUE") +
  theme_minimal() +
  theme(
    axis.title = element_text(color = "black", face = "bold", size = 10),
    axis.text.y = element_text(color = "black", face = "bold"),
    axis.text.x = element_blank()
  )
Total variance explained by first 2 axes = 17.4 + 10.8 = 28.2%. Total variance explained by first three axes = 17.4 + 10.8 + 9.6 = 37.8%. Very similar to the PCA on CPUE
Plot in 2D -
# Sample scores from the PCA, joined to the sample metadata by SampleID
pca_logtransform_frame <- log_transform_pca$x %>%
  data.frame() %>%
  rownames_to_column(var = "SampleID") %>%
  left_join(metadata, by = "SampleID")
head(pca_logtransform_frame)
# Proportion of variance per axis as a percentage (rounded to 4 places first); used in the axis labels
eigenvalues <- 100 * round(log_transform_variances[, 2], digits = 4)
# 2D view: PC1 vs PC2, Station as color and Bayside as point shape
log_transform_frame_2D <- pca_logtransform_frame %>%
  ggplot(aes(x = PC1, y = PC2, color = Station, shape = Bayside)) +
  theme_bw() +
  coord_fixed(ratio = 1) +
  scale_color_brewer(palette = "Paired") +
  geom_point(size = 4) +
  xlab(paste0('PC1 ', eigenvalues[1], '%')) +
  ylab(paste0('PC2 ', eigenvalues[2], '%')) +
  ggtitle("PCA on Log-transformed Total Length PUE")
log_transform_frame_2D
Summary: This is VERY similar to the distribution of points from the PCA on CPUE. Check the allometric data (total length) times the shedding factor (SF) just to be comprehensive…
Generate the PCA and visualize axes
# run the PCA
There were 50 or more warnings (use warnings() to see the first 50)
# Unlike the CPUE and TLPUE tables, TLSF.PUE_table_transform_cen_st is never converted before
# this point, so it would still carry SampleID as its first column: prcomp() would be handed a
# character column and the PC scores would have no sample names to join the metadata on.
# Convert it the same way as the other two tables. The guard makes the step a no-op if the
# table has already been converted.
if ("SampleID" %in% colnames(TLSF.PUE_table_transform_cen_st)) {
  TLSF.PUE_table_transform_cen_st <- data.frame(TLSF.PUE_table_transform_cen_st)
  rownames(TLSF.PUE_table_transform_cen_st) <- c(TLSF.PUE_table_transform_cen_st$SampleID)
  TLSF.PUE_table_transform_cen_st <- TLSF.PUE_table_transform_cen_st[, -1]
}
log_transform_pca <- prcomp(TLSF.PUE_table_transform_cen_st)
# Visual representation with a screeplot
log_transform_variances <- as.data.frame(log_transform_pca$sdev^2/sum(log_transform_pca$sdev^2)) %>% #Extract axes
  # Format to plot
  select(PercVar = 'log_transform_pca$sdev^2/sum(log_transform_pca$sdev^2)') %>%
  rownames_to_column(var = "PCaxis") %>%
  data.frame
head(log_transform_variances)
# Plot screeplot (title now names the TLxSF.PUE data instead of the copy-pasted "TLPUE")
ggplot(log_transform_variances, aes(x = as.numeric(PCaxis), y = PercVar)) +
  geom_bar(stat = "identity", fill = "grey", color = "black") +
  theme_minimal() +
  theme(axis.title = element_text(color = "black", face = "bold", size = 10),
        axis.text.y = element_text(color = "black", face = "bold"),
        axis.text.x = element_blank()) +
  labs(x = "PC axis", y = "% Variance", title = "Log-Transformed PCA Screeplot, TLxSF.PUE")
Total variance explained by first 2 axes = 17.5 + 10.7 = 28.2%. Total variance explained by first three axes= 17.4 + 10.8 + 9.7 = 37.9%. Very similar to the PCA on CPUE and on TLPUE
Plot in 2D -
# Sample scores from the PCA, joined to the sample metadata by SampleID
pca_logtransform_frame <- log_transform_pca$x %>%
  data.frame() %>%
  rownames_to_column(var = "SampleID") %>%
  left_join(metadata, by = "SampleID")
head(pca_logtransform_frame)
# Proportion of variance per axis as a percentage (rounded to 4 places first); used in the axis labels
eigenvalues <- 100 * round(log_transform_variances[, 2], digits = 4)
# 2D view: PC1 vs PC2, Station as color and Bayside as point shape
log_transform_frame_2D <- pca_logtransform_frame %>%
  ggplot(aes(x = PC1, y = PC2, color = Station, shape = Bayside)) +
  theme_bw() +
  coord_fixed(ratio = 1) +
  scale_color_brewer(palette = "Paired") +
  geom_point(size = 4) +
  xlab(paste0('PC1 ', eigenvalues[1], '%')) +
  ylab(paste0('PC2 ', eigenvalues[2], '%')) +
  ggtitle("PCA on Log-transformed Total Length x Shedding Factor PUE")
log_transform_frame_2D
Again, this is very similar to the PCA on CPUE and on TLPUE. Stick with CPUE for simplicity.
Next, try a PCoA. Use the log-transformed abundance matrix to calculate a distance-matrix using the Bray-Curtis similarity metric. Then use this as input for PCoA
NOTE: Need to use a correction here for negative eigenvalues
# convert to dataframe, moving SampleID into the row names
CPUE_table_transform <- data.frame(CPUE_table_transform)
rownames(CPUE_table_transform) <- c(CPUE_table$SampleID)
CPUE_table_transform <- CPUE_table_transform[, -1]
# Get Bray Curtis distance matrix from log-transformed CPUE data
bray_dmat <- vegdist(CPUE_table_transform, method = "bray")
# the normal PCoA results in negative eigenvalues, so need correction. use wcmdscale and add cailliez correction
pcoa_bray <- wcmdscale(bray_dmat, eig = TRUE, add = "cailliez")
# keep the summary of the PCoA eigenvalues
ev <- summary(eigenvals(pcoa_bray))
# Proportion of variance per axis: each eigenvalue divided by the sum of all eigenvalues.
# Computed on the whole vector at once instead of growing a vector in a 1:length() loop.
bray_variances <- as.numeric(eigenvals(pcoa_bray))
bray_variances <- bray_variances / sum(bray_variances)
# Table with the axis number (as a label) and the rounded proportion, built by explicit column
# names rather than by selecting an auto-generated column name
bray_variances <- data.frame(
  PCaxis = as.character(seq_along(bray_variances)),
  PercVar = round(bray_variances, 3)
)
head(bray_variances)
# Make a screeplot
ggplot(bray_variances, aes(x = as.numeric(PCaxis), y = PercVar)) +
  geom_bar(stat = "identity", fill = "grey", color = "black") +
  theme_minimal() +
  theme(axis.title = element_text(color = "black", face = "bold", size = 10),
        axis.text.y = element_text(color = "black", face = "bold"),
        axis.text.x = element_blank()) +
  labs(x = "PC axis", y = "% Variance", title = "Bray-Curtis PCoA Screeplot")
The first three axes (12.0+10.3+8.9) are not as good as PCA. Total variance explained by first 3 PCs = 31.2%.
Plot in 3D with Plotly
# Extract the sample coordinates from the pcoa
pcoa_bray_df <- data.frame(pcoa_bray$points) %>%
  rownames_to_column(var = "SampleID")
# Merge metadata into the pcoa data table
pcoa_bray_df <- left_join(pcoa_bray_df, metadata, by = "SampleID")
head(pcoa_bray_df)
# Proportion of variance per axis, rounded to 4 places and multiplied by 100. These label the axes of the 3-D plot
eigenvalues <- round(bray_variances[, 2], digits = 4) * 100
# Plotly - 3-D
# The widget is stored under its own name. Assigning it to pcoa_bray would overwrite the
# wcmdscale ordination, which later steps (envfit, scores) still need.
pcoa_bray_3D <- plot_ly(pcoa_bray_df, type = 'scatter3d', mode = 'markers',
                        x = ~Dim2, y = ~Dim3, z = ~Dim1, colors = ~brewer.pal(11, 'Paired'),
                        color = ~Station, symbols = c('circle', 'diamond'), symbol = ~Bayside) %>%
  layout(font = list(size = 12),
         title = 'PCoA Bray-Curtis Distance on Log-Transformed CPUE',
         scene = list(xaxis = list(title = paste0('Co 2 ', eigenvalues[2], '%'),
                                   showticklabels = FALSE, zerolinecolor = 'black'),
                      yaxis = list(title = paste0('Co 3 ', eigenvalues[3], '%'),
                                   showticklabels = FALSE, zerolinecolor = 'black'),
                      zaxis = list(title = paste0('Co 1 ', eigenvalues[1], '%'),
                                   showticklabels = FALSE, zerolinecolor = 'black')))
# pcoa_bray_3D
# save in "Embedded_figures" directory so that it can be hosted at Github and embedded in this notebook
withr::with_dir('Embedded_figures', htmlwidgets::saveWidget(as_widget(pcoa_bray_3D), file = "pcoa_bray_CPUE.html", selfcontained = FALSE))
Plot in 2D
# 2D view of the CPUE PCoA: Dim1 vs Dim2, Station as color and Bayside as point shape
pcoa_bray_2D <- pcoa_bray_df %>%
  ggplot(aes(x = Dim1, y = Dim2, color = Station, shape = Bayside)) +
  theme_bw() +
  coord_fixed(ratio = 1) +
  scale_color_brewer(palette = "Paired") +
  geom_point(size = 4) +
  xlab(paste0('Co 1 ', eigenvalues[1], '%')) +
  ylab(paste0('Co 2 ', eigenvalues[2], '%')) +
  ggtitle("PCoA on Log-transformed CPUE with Bray-Curtis Dissimilarity")
pcoa_bray_2D
ggsave("figures/pcoa_bray_CPUE_2D.eps", plot = pcoa_bray_2D, width = 7, height = 5, units = "in")
# convert to dataframe, moving SampleID into the row names
TLPUE_table_transform <- data.frame(TLPUE_table_transform)
rownames(TLPUE_table_transform) <- c(TLPUE_table$SampleID)
TLPUE_table_transform <- TLPUE_table_transform[, -1]
# Get Bray Curtis distance matrix from log-transformed TLPUE data
bray_dmat <- vegdist(TLPUE_table_transform, method = "bray")
# the normal PCoA results in negative eigenvalues, so need correction. use wcmdscale and add cailliez correction
pcoa_bray <- wcmdscale(bray_dmat, eig = TRUE, add = "cailliez")
# keep the summary of the PCoA eigenvalues
ev <- summary(eigenvals(pcoa_bray))
# Proportion of variance per axis: each eigenvalue divided by the sum of all eigenvalues.
# Computed on the whole vector at once instead of growing a vector in a 1:length() loop.
bray_variances <- as.numeric(eigenvals(pcoa_bray))
bray_variances <- bray_variances / sum(bray_variances)
# Table with the axis number (as a label) and the rounded proportion, built by explicit column
# names rather than by selecting an auto-generated column name
bray_variances <- data.frame(
  PCaxis = as.character(seq_along(bray_variances)),
  PercVar = round(bray_variances, 3)
)
head(bray_variances)
# Make a screeplot
ggplot(bray_variances, aes(x = as.numeric(PCaxis), y = PercVar)) +
  geom_bar(stat = "identity", fill = "grey", color = "black") +
  theme_minimal() +
  theme(axis.title = element_text(color = "black", face = "bold", size = 10),
        axis.text.y = element_text(color = "black", face = "bold"),
        axis.text.x = element_blank()) +
  labs(x = "PC axis", y = "% Variance", title = "Bray-Curtis PCoA Screeplot")
The first three axes (14.7+10.3+9.1) are not as good as PCA. Total variance explained by first 3 PCs = 34.1%.
Plot in 2D
# Sample coordinates from the PCoA, joined to the sample metadata by SampleID
pcoa_bray_df <- pcoa_bray$points %>%
  data.frame() %>%
  rownames_to_column(var = "SampleID") %>%
  left_join(metadata, by = "SampleID")
head(pcoa_bray_df)
# Proportion of variance per axis as a percentage (rounded to 4 places first); used in the axis labels
eigenvalues <- 100 * round(bray_variances[, 2], digits = 4)
# 2D view: Dim1 vs Dim2, Station as color and Bayside as point shape
pcoa_bray_2D <- pcoa_bray_df %>%
  ggplot(aes(x = Dim1, y = Dim2, color = Station, shape = Bayside)) +
  theme_bw() +
  coord_fixed(ratio = 1) +
  scale_color_brewer(palette = "Paired") +
  geom_point(size = 4) +
  xlab(paste0('Co 1 ', eigenvalues[1], '%')) +
  ylab(paste0('Co 2 ', eigenvalues[2], '%')) +
  ggtitle("PCoA on Log-transformed Total Length PUE, with Bray-Curtis Dissimilarity")
pcoa_bray_2D
The above is very similar to the Bray Curtis/ PCoA on CPUE (with Axis 2 flipped).
# Unlike the CPUE and TLPUE tables, TLSF.PUE_table_transform is never converted before this
# point, so it would still carry SampleID as its first column: vegdist() would be handed a
# non-numeric column and the PCoA points would have no sample names to join the metadata on.
# Convert it the same way as the other two tables. The guard makes the step a no-op if the
# table has already been converted.
if ("SampleID" %in% colnames(TLSF.PUE_table_transform)) {
  TLSF.PUE_table_transform <- data.frame(TLSF.PUE_table_transform)
  rownames(TLSF.PUE_table_transform) <- c(TLSF.PUE_table$SampleID)
  TLSF.PUE_table_transform <- TLSF.PUE_table_transform[, -1]
}
# Get Bray Curtis distance matrix from log-transformed TLxSF.PUE data
bray_dmat <- vegdist(TLSF.PUE_table_transform, method = "bray")
# the normal PCoA results in negative eigenvalues, so need correction. use wcmdscale and add cailliez correction
pcoa_bray <- wcmdscale(bray_dmat, eig = TRUE, add = "cailliez")
# keep the summary of the PCoA eigenvalues
ev <- summary(eigenvals(pcoa_bray))
# Proportion of variance per axis: each eigenvalue divided by the sum of all eigenvalues.
# Computed on the whole vector at once instead of growing a vector in a 1:length() loop.
bray_variances <- as.numeric(eigenvals(pcoa_bray))
bray_variances <- bray_variances / sum(bray_variances)
# Table with the axis number (as a label) and the rounded proportion, built by explicit column
# names rather than by selecting an auto-generated column name
bray_variances <- data.frame(
  PCaxis = as.character(seq_along(bray_variances)),
  PercVar = round(bray_variances, 3)
)
head(bray_variances)
# Make a screeplot
ggplot(bray_variances, aes(x = as.numeric(PCaxis), y = PercVar)) +
  geom_bar(stat = "identity", fill = "grey", color = "black") +
  theme_minimal() +
  theme(axis.title = element_text(color = "black", face = "bold", size = 10),
        axis.text.y = element_text(color = "black", face = "bold"),
        axis.text.x = element_blank()) +
  labs(x = "PC axis", y = "% Variance", title = "Bray-Curtis PCoA Screeplot")
The first three axes (15.9+10.9+9.5) are not as good as PCA. Total variance explained by first 3 PCs = 36.3%.
Plot in 2D
# Sample coordinates from the PCoA, joined to the sample metadata by SampleID
pcoa_bray_df <- pcoa_bray$points %>%
  data.frame() %>%
  rownames_to_column(var = "SampleID") %>%
  left_join(metadata, by = "SampleID")
head(pcoa_bray_df)
# Proportion of variance per axis as a percentage (rounded to 4 places first); used in the axis labels
eigenvalues <- 100 * round(bray_variances[, 2], digits = 4)
# 2D view: Dim1 vs Dim2, Station as color and Bayside as point shape
pcoa_bray_2D <- pcoa_bray_df %>%
  ggplot(aes(x = Dim1, y = Dim2, color = Station, shape = Bayside)) +
  theme_bw() +
  coord_fixed(ratio = 1) +
  scale_color_brewer(palette = "Paired") +
  geom_point(size = 4) +
  xlab(paste0('Co 1 ', eigenvalues[1], '%')) +
  ylab(paste0('Co 2 ', eigenvalues[2], '%')) +
  ggtitle("PCoA on Log-transformed Total Length x Shedding Factor PUE, with Bray-Curtis Dissimilarity")
pcoa_bray_2D
The above is very similar to the Bray Curtis/ PCoA on CPUE (with Axis 2 flipped) and on TLPUE. Just continue with CPUE PCoA for simplicity.
Both the PCA and PCoA explain similar percent variances using the first two axes (28.8% for PCA and 22.3% for PCoA). Continue with both for the environmental fit for now to see if they lead to different interpretations.
# re-run the PCA on the centered/standardized CPUE table
log_transform_pca <- prcomp(CPUE_table_transform_cen_st)
# Build the metadata table handed to envfit:
#  - keep only the samples that are in the CPUE PCA
#  - drop the repetitive date variables (Year.Trawl#, Date, Month, Year)
#  - sort the rows in the same order as the PCA score matrix
#  - rename "Datecode" to "Date" (better for plotting). This is done by name instead of by
#    column position, so a change in the metadata column order cannot silently rename the
#    wrong variable. dplyr:: is spelled out because other attached packages also export rename().
metadata_ordinations <- metadata[metadata$SampleID %in% rownames(log_transform_pca$x), ] %>%
  select(-"Year.Trawl#", -Date, -Month, -Year) %>%
  arrange(factor(SampleID, levels = rownames(log_transform_pca$x))) %>%
  dplyr::rename(Date = Datecode)
# fit environmental factors and save stats output
log_transform_pca_envfit <- envfit(log_transform_pca, metadata_ordinations, permutations = 1000)
capture.output(log_transform_pca_envfit, file = "stats_results/log_transform_pca_envfit_CPUE.txt")
## The only significant variable is Station (p = 0.004995)
# Make an ordination object out of the envfit with sig variable
log_transform_pca_envfit_station <- envfit(log_transform_pca ~ Station, metadata_ordinations, permutations = 1000)
Plot in 2D. Actually didn’t put any vectors in this plot because the only significant variable, station, is already color-coded and there are 11 stations so vectors would be messy.
# Turn every character column of the metadata into a factor, then list the Station levels
metadata_ordinations <- mutate_if(metadata_ordinations, is.character, as.factor)
levels(metadata_ordinations$Station)
[1] "CORMORANT POINT" "DUNE ROAD" "EAST MID BAY" "INLET" "LITTLE POND"
[6] "PINE NECK" "PONQUOGUE BRIDGE" "SHINNECOCK HILLS" "SOUTH GRASS" "WEST MID BAY"
[11] "WEST TIANA"
# Define plot parameters
colvec <- c(brewer.pal(11, 'Paired')) # colors of stations
shapevec <- c(19, 18) # shapes indicating Bayside

# Draw the CPUE PCA figure (site scores, Bayside hulls, legends) on whatever graphics device
# is currently active. The same function is used for the on-screen figure and for the EPS
# export so the two cannot drift apart.
# par() is restored on exit, so the widened right margin does not accumulate when this is
# re-run on the same device.
draw_pca_log_transform_CPUE_envfit <- function() {
  # leave space to add legend. xpd = TRUE allows legend to be outside of the plot
  old_par <- par(xpd = TRUE, mar = par("mar") + c(0, 0, 0, 8.5))
  on.exit(par(old_par), add = TRUE)
  # Add the site scores
  with(metadata_ordinations, plot(scores(log_transform_pca, display = "sites"), col = colvec[Station], pch = shapevec[Bayside], cex = 2, xlab = "Co1 17.5%", ylab = "Co2 11.3%"))
  # Add the hulls indicating Bayside
  with(metadata_ordinations, ordihull(log_transform_pca, Bayside, lwd = 2, lty = c(3, 5), label = FALSE))
  # Add legends (the per-station pch values and the hull legend were set manually)
  with(metadata_ordinations, legend(1.8, 2, legend = levels(Station), col = colvec, pch = c(19,18,19,19,19,18,19,19,19,18,18), bty = "n", pt.cex = 2, cex = .8))
  legend(1.8, 5, c("EAST", "WEST"), col = c("black"), lty = c(3, 5), lwd = 2, bty = "n", cex = .8)
  invisible(NULL)
}

# Show the figure
draw_pca_log_transform_CPUE_envfit()
# Export using base R/ vegan helpers
setEPS()
postscript("Figures/pca_log_transform_CPUE_envfit.eps", width = 7, height = 5)
draw_pca_log_transform_CPUE_envfit()
dev.off()
quartz_off_screen
3
There were 32 warnings (use warnings() to see them)
# Rebuild the CPUE Bray-Curtis PCoA before fitting. bray_dmat and pcoa_bray were last assigned
# from the TLxSF.PUE table, so without this step the fits below (and the figure labelled and
# saved as CPUE) would silently use the TLxSF.PUE ordination instead.
bray_dmat <- vegdist(CPUE_table_transform, method = "bray")
pcoa_bray <- wcmdscale(bray_dmat, eig = TRUE, add = "cailliez")
# Make an ordination object out of the envfit with sig variables
pcoa_bray_envfit_Date <- envfit(pcoa_bray ~ Date, metadata_ordinations, permutations = 1000)
pcoa_bray_envfit_Bayside <- envfit(pcoa_bray ~ Bayside, metadata_ordinations, permutations = 1000)
pcoa_bray_envfit_station <- envfit(pcoa_bray ~ Station, metadata_ordinations, permutations = 1000)
Plot in 2D with the Date vector and the Bayside hulls. No vectors are drawn for Station because it is already color-coded and there are 11 stations, so vectors would be messy.
# Turn every character column of the metadata into a factor, then list the Station levels
metadata_ordinations <- mutate_if(metadata_ordinations, is.character, as.factor)
levels(metadata_ordinations$Station)
[1] "CORMORANT POINT" "DUNE ROAD" "EAST MID BAY" "INLET" "LITTLE POND"
[6] "PINE NECK" "PONQUOGUE BRIDGE" "SHINNECOCK HILLS" "SOUTH GRASS" "WEST MID BAY"
[11] "WEST TIANA"
# Define plot parameters
colvec <- c(brewer.pal(11, 'Paired')) # colors of stations
shapevec <- c(19, 18) # shapes indicating Bayside

# Draw the CPUE PCoA figure (site scores, Date envfit vector, Bayside hulls, legends) on
# whatever graphics device is currently active. The same function is used for the on-screen
# figure and for the EPS export so the two cannot drift apart.
# par() is restored on exit, so the widened right margin does not accumulate when this is
# re-run on the same device.
draw_pcoa_bray_envfit_CPUE <- function() {
  # leave space to add legend. xpd = TRUE allows legend to be outside of the plot
  old_par <- par(xpd = TRUE, mar = par("mar") + c(0, 0, 0, 8))
  on.exit(par(old_par), add = TRUE)
  # Add the site scores
  with(metadata_ordinations, plot(scores(pcoa_bray, display = "sites"), col = colvec[Station], pch = shapevec[Bayside], cex = 2, xlab = "Co1 12.0%", ylab = "Co2 10.3%"))
  # Add the date vector
  plot(pcoa_bray_envfit_Date, p.max = 0.1, lwd = 2, col = "black")
  # Add the hulls indicating Bayside
  with(metadata_ordinations, ordihull(pcoa_bray, Bayside, lwd = 2, lty = c(3, 5), label = FALSE))
  # Add legends (the per-station pch values and the hull legend were set manually)
  with(metadata_ordinations, legend(0.77, 0.3, legend = levels(Station), col = colvec, pch = c(19,18,19,19,19,18,19,19,19,18,18), bty = "n", pt.cex = 2, cex = .8))
  legend(0.77, 0.5, c("EAST", "WEST"), col = c("black"), lty = c(3, 5), lwd = 2, bty = "n", cex = .8)
  invisible(NULL)
}

# Show the figure
draw_pcoa_bray_envfit_CPUE()
# Export using base R/ vegan helpers
setEPS()
postscript("Figures/pcoa_bray_envfit_CPUE.eps", width = 7, height = 5)
draw_pcoa_bray_envfit_CPUE()
dev.off()
quartz_off_screen
4
For the trawl data, ordinations of the allometric corrections (sum of total length and sum total length x shedding factor) are VERY similar to the results using just CPUE. So, for simplicity, just present the CPUE.
The Bray-Curtis PCoA on the eDNA and trawl CPUE data are similar but not identical. Each ordination reveals that Bayside and Date are important correlated variables. The eDNA ordination was also sensitive to DO, which did not appear in the CPUE ordination. The first two axes of both ordinations explain some, but not a majority of the variance (32.1% for eDNA and 22.3% for CPUE).