Apis-wings-EU: A workflow for morphometric identification of honey bees from Europe

public 1yr ago Version: Version 1 0 bookmarks

View Workflow

apis-wings-eu-a-workflow-for-morphometric-identifi — View Workflow

We present an R script that describes the workflow for analysing honey bee (Apis mellifera) wing shape. It is based on a large dataset of wing images and landmark coordinates available at Zenodo: https://doi.org/10.5281/zenodo.7244070 . The dataset can be used as a reference for the identification of unknown samples. As unknown samples, we used data from Nawrocka et al. (2018), available at Zenodo: https://doi.org/10.5281/zenodo.7567336 . Among others, the script can be used to identify the geographic origin of unknown samples and therefore assist in the monitoring and conservation of honey bee biodiversity in Europe.

Code Snippets

knitr::opts_chunk$set(
	echo = TRUE,
	message = FALSE,
	warning = FALSE
)

R Markdown From line 9 of raw/Apis-mellifera-wings-EU.rmd

# calculations
library(geomorph) # GPA 
library(shapes) # OPA
library(Morpho) # CVA
library(phangorn) # UPGMA
library(geosphere) # distm

# plotting and visualization
library(ggplot2) # plots
ggplot2::theme_set(theme_light())
library(ggforce) # geom_ellipse
library(mgcViz) # GAM visualization
library(viridis) # plot scale
library(rnaturalearth) # maps

R Markdown ggplot2 viridis ggforce phangorn geomorph geosphere mgcViz Morpho rnaturalearth shapes From line 19 of raw/Apis-mellifera-wings-EU.rmd

wings <- read.csv("https://zenodo.org/record/7244070/files/EU-raw-coordinates.csv", header = TRUE)

# extract sample classifier
wings <- tidyr::separate(
    data = wings,
    col = file,
    sep = c(7), # sample name is in the first 7 characters of file name
    into = c("sample", NA),
    remove = FALSE
  ) 

# extract country classifier
wings <- tidyr::separate(
    data = wings,
    col = sample,
    sep = c(2), # country code is in the first 2 characters of sample name
    into = c("country", NA),
    remove = FALSE
  ) 
head(wings, 2)

R Markdown From line 57 of raw/Apis-mellifera-wings-EU.rmd

geo.data <- read.csv("https://zenodo.org/record/7244070/files/EU-geo-data.csv", header = TRUE)

sample.geo.data <- aggregate(cbind(geo.data$latitude, geo.data$longitude), by = list(geo.data$sample), FUN = mean)
sample.geo.data <- reshape::rename(sample.geo.data, c(Group.1 = "sample", V1 = "latitude", V2 = "longitude")) 

# extract country classifier
sample.geo.data <- tidyr::separate(
    data = sample.geo.data,
    col = sample,
    sep = c(2), # country code is in the first 2 characters of sample name
    into = c("country", NA),
    remove = FALSE
  ) 
head(sample.geo.data, 2)

R Markdown From line 84 of raw/Apis-mellifera-wings-EU.rmd

world <- ne_countries(scale = "medium", returnclass = "sf")

# jitter the coordinates to show all locations
jitter.x <- jitter(sample.geo.data$longitude, amount = 0.2)
jitter.y <- jitter(sample.geo.data$latitude, amount = 0.2)

ggplot(data = world) +
  geom_sf() +
  geom_point(data = sample.geo.data, 
             aes(x = jitter.x, y = jitter.y, colour = country, shape = country), size = 1) +
  scale_color_manual(name ="Country", values = rainbow(13)) +
  scale_shape_manual(name ="Country", values = c(0:2,15:25)) +
  coord_sf(xlim = c(-11, 33), ylim = c(34, 56)) + 
  xlab("Longitude") + ylab("Latitude")

R Markdown ggplot2 From line 103 of raw/Apis-mellifera-wings-EU.rmd

p <- 19  # number of landmarks
k <- 2   # number of dimensions, in this case 2 for coordinates (x, y)

# create coordinates names used by IdentiFly
xyNames <- c("x1", "y1")
for (i in 2:p) {
  xyNames <- c(xyNames, paste0("x", i))
  xyNames <- c(xyNames, paste0("y", i))
}
xyNames

# create coordinates names used by geomorph
xy.Names <- c("1.X", "1.Y")
for (i in 2:p) {
  xy.Names <- c(xy.Names, paste(i, "X", sep = "."))
  xy.Names <- c(xy.Names, paste(i, "Y", sep = "."))
}
xy.Names

# The number of principal components used is 2*p-4 = 34, which is equal to the degrees of freedom
# create principal components names used by prcomp
pcNames <- paste0("PC", 1:(2*p-4))
pcNames

# create principal components names used by geomorph
compNames <- paste0("Comp", 1:(2*p-4))
compNames

geoNames <- c("latitude", "longitude")

R Markdown From line 125 of raw/Apis-mellifera-wings-EU.rmd

# Convert 2D array into a 3D array
wings.coords <- arrayspecs(wings[xyNames], p, k)
dimnames(wings.coords)[[3]] <- wings$file

# Align the coordinates using Generalized Procrustes Analysis 
GPA <- gpagen(wings.coords, print.progress = FALSE)

# Convert 3D array into a 2D array - opposite to arrayspecs
wings.aligned <- two.d.array(GPA$coords) 
head(wings.aligned, 2)

R Markdown From line 162 of raw/Apis-mellifera-wings-EU.rmd

sample.aligned <- aggregate(wings.aligned, by = list(wings$sample), FUN = mean)
names(sample.aligned)[names(sample.aligned) == "Group.1"] <- "sample" # rename column

# extract country code as classifier
sample.aligned <- tidyr::separate(
    data = sample.aligned,
    col = sample,
    sep = c(2),
    into = c("country", NA),
    remove = FALSE
  ) 
head(sample.aligned, 2)

R Markdown From line 177 of raw/Apis-mellifera-wings-EU.rmd

# Convert 2D array into a 3D array
sample.3D <- arrayspecs(sample.aligned[xy.Names], p, k)
dimnames(sample.3D)[[3]] <- sample.aligned$sample

sample.pca <- gm.prcomp(sample.3D)
sample.pca.scores <- as.data.frame(sample.pca$x[ , compNames])
sample.pca.scores <- cbind(sample.geo.data, sample.pca.scores)

# create plot labels
variance.tab <- summary(sample.pca)$PC.summary
variance <- variance.tab["Proportion of Variance", "Comp1"]
variance <- round(100 * variance, 1)
label.x <- paste0("PC1 (", variance, "%)")
variance <- variance.tab["Proportion of Variance", "Comp2"]
variance <- round(100 * variance, 1)
label.y <- paste0("PC2 (", variance, "%)")

ggplot(sample.pca.scores, aes(x = Comp1, y = Comp2, shape = sample.aligned$country, color = sample.aligned$country)) +
  geom_point() +
  scale_shape_manual(name ="Country", values = c(0:2,15:25)) +
  scale_color_manual(name ="Country", values = rainbow(13)) +
  stat_ellipse() + 
  xlab(label.x) + ylab(label.y)

R Markdown ggplot2 From line 194 of raw/Apis-mellifera-wings-EU.rmd

# define which landmarks are connected by lines in wireframe graph
link.x <- c(1, 1, 2, 2, 3,  3, 4,  4,  5, 6, 7,  7,  7, 8,  9,  9, 10, 11, 11, 12, 13, 14, 15, 16, 17)
link.y <- c(2, 3, 4, 5, 6, 19, 6, 10, 12, 8, 8, 14, 19, 9, 10, 15, 11, 12, 16, 13, 18, 15, 16, 17, 18)
links.apis <- cbind(link.x, link.y)

# mark minimum blue and maximum red
GP1 <- gridPar(pt.bg = "blue", link.col = "blue", pt.size = 1, 
               tar.pt.bg ="red", tar.link.col ="red")
# wing shape change along PC1
plotRefToTarget(M1 = sample.pca$shapes$shapes.comp1$min, 
                M2 = sample.pca$shapes$shapes.comp1$max, 
                gridPars=GP1, method = "points", links = links.apis)
# wing shape change along PC2 
plotRefToTarget(M1 = sample.pca$shapes$shapes.comp2$min, 
                M2 = sample.pca$shapes$shapes.comp2$max, 
                gridPars=GP1, method = "points", links = links.apis)

# countries difer significantly in wing shape
country.MANOVA <- manova(as.matrix(sample.pca.scores[ , compNames]) ~ country, data=sample.pca.scores)
summary(country.MANOVA)

R Markdown From line 223 of raw/Apis-mellifera-wings-EU.rmd

# PC1
ggplot(data = world) +
  geom_sf() +
  geom_point(data = sample.pca.scores, 
             aes(x = jitter.x, y = jitter.y, colour = Comp1), size = 1) +
  coord_sf(xlim = c(-11, 32), ylim = c(34, 57), expand = FALSE) +
  scale_color_viridis(name = "PC1") +
  xlab("Longitude") + ylab("Latitude")


GAM <- mgcv::gam(Comp1 ~ s(longitude, latitude), data = sample.pca.scores)
anova(GAM)

viz.GAM <- getViz(GAM)
plot(viz.GAM, 1) +
  l_fitRaster() +
  l_fitContour(color ="grey30") + 
  l_points() +
  labs(title = NULL) +
  scale_fill_continuous(type = "viridis", name = "PC1", na.value = "transparent") +
  scale_x_continuous(limits = c(-11, 32), expand = c(0, 0)) +
  scale_y_continuous(limits = c(34, 57), expand = c(0, 0)) +
  geom_path(data = map_data("world"), aes(x = long, y = lat, group = group),
            color ="black", inherit.aes = FALSE)

# PC2
ggplot(data = world) +
  geom_sf() +
  geom_point(data = sample.pca.scores, 
             aes(x = jitter.x, y = jitter.y, colour = Comp2), size = 1) +
  coord_sf(xlim = c(-11, 32), ylim = c(34, 57), expand = FALSE) +
  scale_color_viridis(name = "PC2") +
  xlab("Longitude") + ylab("Latitude")

GAM <- mgcv::gam(Comp2 ~ s(longitude, latitude), data = sample.pca.scores)
anova(GAM)

viz.GAM <- getViz(GAM)
plot(viz.GAM, 1) +
  l_fitRaster() + scale_fill_continuous(na.value = "transparent") +
  l_fitContour(color ="grey30") + 
  l_points() +
  labs(title = NULL) +
  scale_fill_continuous(type = "viridis", name = "PC2", na.value = "transparent") +
  scale_x_continuous(limits = c(-11, 32), expand = c(0, 0)) +
  scale_y_continuous(limits = c(34, 57), expand = c(0, 0)) +
  geom_path(data = map_data("world"), aes(x = long, y = lat, group = group),
            color ="black", inherit.aes = FALSE)

R Markdown ggplot2 From line 248 of raw/Apis-mellifera-wings-EU.rmd

# Convert 2D array into a 3D array
sample.3D <- arrayspecs(sample.aligned[xy.Names], p, k)
dimnames(sample.3D)[[3]] <- sample.aligned$sample

# use equal prior probability for all groups
n.country <- length(unique(sample.aligned$country)) # number of groups
sample.cva <- CVA(sample.3D, sample.aligned$country, rounds = 10000, cv = TRUE,
                      prior = rep(1/n.country, n.country))
sample.cva.scores <- as.data.frame(sample.cva$CVscores)
# remove unwanted spaces in variable names otherwise use `CV 1`
colnames(sample.cva.scores) <- gsub(" ", "", colnames(sample.cva.scores)) 
sample.cva.scores <- cbind(sample.geo.data, sample.cva.scores)

ggplot(sample.cva.scores, aes(x = CV1, y = CV2, shape = country, color = country)) +
  geom_point() +
  scale_shape_manual(name ="Country", values = c(0:2,15:25)) +
  scale_color_manual(name ="Country", values = rainbow(13)) +
  stat_ellipse() 

R Markdown ggplot2 From line 301 of raw/Apis-mellifera-wings-EU.rmd

CVA.class <- typprobClass(sample.cva$CVscores, groups = as.factor(sample.pca.scores$country), outlier = 0)
print(CVA.class)

R Markdown From line 327 of raw/Apis-mellifera-wings-EU.rmd

sample.class <- data.frame(sample = sample.geo.data$sample, 
                          country = sample.geo.data$country, 
                          latitude = sample.geo.data$latitude, 
                          longitude = sample.geo.data$longitude, 
                          classified.as = CVA.class$groupaffinCV)

# add column indicating incorrect classification
sample.class$error <- ifelse(sample.class$country == sample.class$classified.as, "OK", "error")

# only the incorrectly classified samples
sample.error <- sample.class[sample.class$error == "error", 
                            c("sample", "country", "latitude", "longitude","classified.as")]

# incorrect countries classification map
ggplot(data = world) +
  geom_sf() +
  geom_point(data = sample.error, 
             aes(x = longitude, y = latitude, 
                 colour = classified.as, 
                 shape = classified.as), size = 2) +
  scale_color_manual(name ="Classified as", values = rainbow(13)) +
  scale_shape_manual(name ="Classified as", values = c(0:2, 15:25)) +
  coord_sf(xlim = c(-11, 33), ylim = c(34, 56), expand = FALSE) +
  labs(x = "Longitude", y = "Latitude")

R Markdown ggplot2 From line 338 of raw/Apis-mellifera-wings-EU.rmd

# Mahalanobis Distnces between countries
dist <- sample.cva$Dist
MD.dist <- as.matrix(dist$GroupdistMaha)
MD.dist

# Significance of differences betwen countries
MD.prob <- as.matrix(dist$probsMaha)
MD.prob

R Markdown From line 367 of raw/Apis-mellifera-wings-EU.rmd

# UPGMA tree
tree.upgma <- upgma(MD.dist)
plot(tree.upgma, label.offset = 0.1, cex = 0.5)

R Markdown From line 381 of raw/Apis-mellifera-wings-EU.rmd

# calculate geographical distance between countries
country.geo.data <- aggregate(cbind(sample.geo.data$latitude, sample.geo.data$longitude), 
                             by = list(sample.geo.data$country), FUN = mean)
country.geo.data <- reshape::rename(country.geo.data, 
                                    c(Group.1 = "country", V1 = "latitude", V2 = "longitude")) 

geo.dist <- distm(country.geo.data[c("longitude","latitude")], fun = distGeo)
row.names(geo.dist) <- country.geo.data$country
colnames(geo.dist) <- country.geo.data$country
geo.dist <- geo.dist / 1000 # convert meters to kilometers
geo.dist

vegan::mantel(geo.dist, MD.dist, permutations = 9999, method = "spearman")

# convert distance table to get distance in one column
MD.distance <- as.dist(MD.dist)
MD.distance <- as.numeric(MD.distance)
MD.distance <- data.frame(t(combn(rownames(MD.dist),2)), MD.distance)

geo.distance <- as.dist(geo.dist)
geo.distance <- as.numeric(geo.distance)
geo.distance <- data.frame(t(combn(rownames(geo.dist),2)), geo.distance)

geo.distance$MD.distance <- MD.distance$MD.distance
geo.distance$dist.name <- paste0(geo.distance$X1, "-", geo.distance$X2)

outliers <- subset(geo.distance, dist.name %in% c("AT-HU","AT-HR", "AT-SI"))

ggplot(geo.distance, aes(x = geo.distance, y = MD.distance)) +
  geom_point() +
  coord_trans(x = "log10") +
  geom_text( data = outliers, aes(x = geo.distance , y = MD.distance, label = dist.name), hjust =-0.2) +
  xlab("Geographical distance (km)") + ylab("Mahalanobis distance between wing shape")

R Markdown ggplot2 From line 393 of raw/Apis-mellifera-wings-EU.rmd

# add column with regions consisting of well represented countries or pairs of neighboring countries
sample.aligned$region <- ifelse(sample.aligned$country == "ES"
                                     | sample.class$country == "PT"
                                     , "ES-PT", sample.aligned$country)
sample.aligned$region <- ifelse(sample.aligned$country == "RO"
                                     | sample.class$country == "MD"
                                     , "RO-MD", sample.aligned$region)
sample.aligned$region <- ifelse(sample.aligned$country == "HR"
                                     | sample.class$country == "SI"
                                     , "HR-SI", sample.aligned$region)

# exclude countries with sample size smaller than 25
sample.region <- sample.aligned[sample.aligned$country != "AT"
                               & sample.aligned$country != "HU"
                               & sample.aligned$country != "ME"
                               & sample.aligned$country != "RS", ]

sample.3D <- arrayspecs(sample.region[xy.Names], p, k)
dimnames(sample.3D)[[3]] <- sample.region$sample

# use equal prior probability for all groups
n.region <- length(unique(sample.region$region)) # number of groups
region.cva <- CVA(sample.3D, sample.region$region, rounds = 10000, cv = TRUE, 
                  prior = rep(1/n.region, n.region))
region.cva.scores <- as.data.frame(region.cva$CVscores)
# remove unwanted spaces in variable names otherwise use `CV 1`
colnames(region.cva.scores) <- gsub(" ", "", colnames(region.cva.scores)) 

ggplot(region.cva.scores, aes(x = CV1, y = CV2, shape = sample.region$region, color = sample.region$region)) +
  geom_point() +
  scale_shape_manual(name ="Region", values = c(0:2,19,5:6)) +
  scale_color_manual(name ="Region", values = rainbow(6)) +
  stat_ellipse() 

CVA.class.region <- typprobClass(region.cva$CVscores, groups = as.factor(sample.region$region), outlier = 0)
print(CVA.class.region)

R Markdown ggplot2 From line 431 of raw/Apis-mellifera-wings-EU.rmd

wings.lin <- read.csv("https://zenodo.org/record/7567336/files/Nawrocka_et_al2018.csv", header = TRUE)

# extract sample classifier
wings.lin <- tidyr::separate(
  data = wings.lin,
  col = file,
  sep = c(10), # sample name is in the first 10 characters of file name
  into = c("sample", NA),
  remove = FALSE
) 

# extract lineage classifier
wings.lin <- tidyr::separate(
  data = wings.lin,
  col = file,
  sep = c(1), # lineage code is in the first character of file name
  into = c("lineage", NA),
  remove = FALSE
) 
head(wings.lin, 2)

R Markdown From line 475 of raw/Apis-mellifera-wings-EU.rmd

# Convert 2D array into a 3D array
wings.lin.3D <- arrayspecs(wings.lin[xyNames], p, k)
dimnames(wings.lin.3D)[[3]] <- wings.lin$file
# Align the coordinates using Generalized Procrustes Analysis 
GPA.lin <- gpagen(wings.lin.3D, print.progress = FALSE)
# Convert 3D array into a 2D array - opposite to arrayspecs
wings.lin.aligned <- two.d.array(GPA.lin$coords) 
head(wings.lin.aligned, 2)

R Markdown From line 503 of raw/Apis-mellifera-wings-EU.rmd

sample.lin.aligned <- aggregate(wings.lin.aligned, by = list(wings.lin$sample), FUN = mean)
names(sample.lin.aligned)[names(sample.lin.aligned) == "Group.1"] <- "sample" # rename column

# extract lineage code as classifier
sample.lin.aligned <- tidyr::separate(
    data = sample.lin.aligned,
    col = sample,
    sep = c(1),
    into = c("lineage", NA),
    remove = FALSE
  ) 

geo.data.lin <- read.csv("https://zenodo.org/record/7567336/files/Nawrocka_et_al2018-geo-data.csv", header = TRUE)
sample.lin.aligned$subspecies <- geo.data.lin$subspecies
sample.lin.aligned$country <- geo.data.lin$country
head(sample.lin.aligned, 2)

R Markdown From line 516 of raw/Apis-mellifera-wings-EU.rmd

# Convert 2D array into a 3D array for easier analysis of unknown samples
sample.lin.3D <- arrayspecs(sample.lin.aligned[xy.Names], p, k)
dimnames(sample.lin.3D)[[3]] <- sample.lin.aligned$sample

# use equal prior probability for all groups
n.lin <- length(unique(sample.lin.aligned$lineage)) # number of groups
sample.lin.cva <- CVA(sample.lin.3D, sample.lin.aligned$lineage, rounds = 10000, cv = TRUE, 
                      prior = rep(1/n.lin, n.lin))
sample.lin.cva.scores <- as.data.frame(sample.lin.cva$CVscores)
# remove unwanted spaces in variable names otherwise use `CV 1`
colnames(sample.lin.cva.scores) <- gsub(" ", "", colnames(sample.lin.cva.scores)) 

sample.lin.cva.scores <- cbind(sample.lin.aligned$lineage, sample.lin.cva.scores)
names(sample.lin.cva.scores)[1] <- "lineage" # rename column
rownames(sample.lin.cva.scores) <- sample.lin.aligned$sample

ggplot(sample.lin.cva.scores, aes(x = CV1, y = CV2, color = lineage)) +
  geom_point() +
  coord_fixed() +
  stat_ellipse() 

R Markdown ggplot2 From line 537 of raw/Apis-mellifera-wings-EU.rmd

# Convert 2D array into a 3D array
sample.only <- sample.aligned[xy.Names]
unknown.wings <- arrayspecs(sample.only, p, k)
dimnames(unknown.wings)[[3]] <- sample.only$sample

# calculate covarience for each lineage
covariances <- lapply(unique(sample.lin.cva.scores$lineage),
                      function(x)
                        cov(sample.lin.cva.scores[sample.lin.cva.scores$lineage==x,-1]))
means <- aggregate(sample.lin.cva.scores[,names(sample.lin.cva.scores) != "lineage"], 
                  list(sample.lin.cva.scores$lineage), FUN=mean)
rownames(means) <- means$Group.1
means <- means[,names(means) != "Group.1"]

# Projection of the samples to canonical variate space from Nawrocka et al. 2018
groups <- rownames(means)
result.list <- vector(mode = "list", length = nrow(sample.aligned))
CV.list <- vector(mode = "list", length = nrow(sample.aligned))
for (r in 1:nrow(sample.aligned)) { 
  # Align unknown consensus with consensus from reference samples
  unknown.OPA <- procOPA(GPA.lin$consensus, unknown.wings[,,r])
  unknown.aligned <- unknown.OPA$Bhat
  CV.row <- predict(sample.lin.cva, unknown.aligned)

  # create empty list for results
  result <- numeric(length(groups))
  for (i in 1:length(groups)) { 
    MD <- mahalanobis(CV.row, unlist(means[i, ]), as.matrix(covariances[[i]]))
    result[i] <- MD
  }
  result.list[[r]] <- result
  CV.list[[r]] <- CV.row
}
CV.tab <- do.call(rbind, CV.list)
# remove unwanted spaces in variable names otherwise use `CV 1`
colnames(CV.tab) <- gsub(" ", "", colnames(CV.tab)) 
CV.tab <- as.data.frame(CV.tab)
CV.tab <- cbind(CV.tab, sample.aligned$country)
colnames(CV.tab) <- gsub("sample.aligned\\$", "", colnames(CV.tab)) 

# Black ellipses A, C, M and O indicate 95% confidence regions of reference samples from Nawrocka et al. 2018
ggplot(CV.tab, aes(x = CV1, y = CV2, shape = country, color = country))+
  geom_point() +
  scale_shape_manual(name ="Country", values = c(0:2,15:25)) +
  scale_color_manual(name ="Country", values = rainbow(13)) +
  stat_ellipse() +
  stat_ellipse(data = subset(sample.lin.cva.scores, lineage == "A"), 
               mapping = aes(x = CV1, y = CV2), inherit.aes = FALSE) + 
  stat_ellipse(data = subset(sample.lin.cva.scores, lineage == "C"), 
               mapping = aes(x = CV1, y = CV2), inherit.aes = FALSE) + 
  stat_ellipse(data = subset(sample.lin.cva.scores, lineage == "M"), 
               mapping = aes(x = CV1, y = CV2), inherit.aes = FALSE) + 
  stat_ellipse(data = subset(sample.lin.cva.scores, lineage == "O"), 
               mapping = aes(x = CV1, y = CV2), inherit.aes = FALSE) + 
  geom_label(means, 
             mapping = aes(x = CV1, y = CV2, label = rownames(means)), inherit.aes = FALSE)


MD.tab <- do.call(rbind, result.list)
MD.tab <- sqrt(MD.tab)
# column names for lineages
colnames(MD.tab) <- groups
MD.tab <- as.data.frame(MD.tab)

lineage <- colnames(MD.tab)[apply(MD.tab, 1, which.min)]
# final column names
colnames(MD.tab) <- paste0("MD.",groups)
MD.tab <- cbind(MD.tab, lineage)
rownames(MD.tab) <- sample.aligned$sample

# frequencies of the lineages
table(MD.tab$lineage)
# frequencies of the lineage in countries
table(sample.aligned$country, MD.tab$lineage)

sample.lineages <- MD.tab
sample.lineages <- cbind(sample.lineages, 
                        sample.geo.data$longitude, sample.geo.data$latitude)
colnames(sample.lineages) <- gsub("sample.geo.data\\$", "", colnames(sample.lineages)) 

# Classification of the samples to lineages according to <a href="#Nawrocka et al. 2018">Nawrocka et al. 2018</a>
ggplot(data = world) +
  geom_sf() +
  geom_point(data = sample.lineages, 
             aes(x = jitter.x, y = jitter.y, colour = lineage), size = 1) +
  scale_color_manual(name ="Lineage", values = rainbow(4)) +
  coord_sf(xlim = c(-11, 32), ylim = c(34, 57), expand = FALSE) +
  xlab("Longitude") + ylab("Latitude")

# Lineage A
# Mahalanobis Distance to lineage A (with jitter to show multiple samples from one location)
ggplot(data = world) +
  geom_sf() +
  geom_point(data = sample.lineages, 
             aes(x = jitter.x, y = jitter.y, colour = MD.A), size = 1) +
  coord_sf(xlim = c(-11, 32), ylim = c(34, 57), expand = FALSE) +
  scale_color_viridis(name = "Distance to\nlineage A", direction = -1) +
  xlab("Longitude") + ylab("Latitude")

# spatial interpolation of the above data using GAM
GAM <- mgcv::gam(MD.A ~ s(longitude, latitude), data = sample.lineages)
viz.GAM <- getViz(GAM)
GAM.A <- plot(viz.GAM, 1) +
  l_fitRaster() +
  l_fitContour(color ="grey30") + 
  l_points() +
  labs(title = NULL) +
  scale_fill_continuous(type = "viridis", direction = -1, name = "Distance to\nlineage A", na.value = "transparent") +
  scale_x_continuous(limits = c(-11, 32), expand = c(0, 0)) +
  scale_y_continuous(limits = c(34, 57), expand = c(0, 0)) +
  geom_path(data = map_data("world"), aes(x = long, y = lat, group = group),
            color ="black", inherit.aes = FALSE)
GAM.A

# Lineage C
# Mahalanobis Distance to lineage C (with jitter to show multiple samples from one location)
ggplot(data = world) +
  geom_sf() +
  geom_point(data = sample.lineages, 
             aes(x = jitter.x, y = jitter.y, colour = MD.C), size = 1) +
  coord_sf(xlim = c(-11, 32), ylim = c(34, 57), expand = FALSE) +
  scale_color_viridis(name = "Distance to\nlineage C", direction = -1) +
  xlab("Longitude") + ylab("Latitude")

# spatial interpolation of the above data using GAM
GAM <- mgcv::gam(MD.C ~ s(longitude, latitude), data = sample.lineages)
viz.GAM <- getViz(GAM)
GAM.C <- plot(viz.GAM, 1) +
  l_fitRaster() +
  l_fitContour(color ="grey30") + 
  l_points() +
  labs(title = NULL) +
  scale_fill_continuous(type = "viridis", direction = -1, name = "Distance to\nlineage C", na.value = "transparent") +
  scale_x_continuous(limits = c(-11, 32), expand = c(0, 0)) +
  scale_y_continuous(limits = c(34, 57), expand = c(0, 0)) +
  geom_path(data = map_data("world"), aes(x = long, y = lat, group = group),
            color ="black", inherit.aes = FALSE)
GAM.C

# Lineage M
# Mahalanobis Distance to lineage M (with jitter to show multiple samples from one location)
ggplot(data = world) +
  geom_sf() +
  geom_point(data = sample.lineages, 
             aes(x = jitter.x, y = jitter.y, colour = MD.M), size = 1) +
  coord_sf(xlim = c(-11, 32), ylim = c(34, 57), expand = FALSE) +
  scale_color_viridis(name = "Distance to\nlineage M", direction = -1) +
  xlab("Longitude") + ylab("Latitude")

# spatial interpolation of the above data using GAM
GAM <- mgcv::gam(MD.M ~ s(longitude, latitude), data = sample.lineages)
viz.GAM <- getViz(GAM)
GAM.M <- plot(viz.GAM, 1) +
  l_fitRaster() +
  l_fitContour(color ="grey30") + 
  l_points() +
  labs(title = NULL) +
  scale_fill_continuous(type = "viridis", direction = -1, name = "Distance to\nlineage M", na.value = "transparent") +
  scale_x_continuous(limits = c(-11, 32), expand = c(0, 0)) +
  scale_y_continuous(limits = c(34, 57), expand = c(0, 0)) +
  geom_path(data = map_data("world"), aes(x = long, y = lat, group = group),
            color ="black", inherit.aes = FALSE)
GAM.M

# Lineage O
# Mahalanobis Distance to lineage O (with jitter to show multiple samples from one location)
ggplot(data = world) +
  geom_sf() +
  geom_point(data = sample.lineages, 
             aes(x = jitter.x, y = jitter.y, colour = MD.O), size = 1) +
  coord_sf(xlim = c(-11, 32), ylim = c(34, 57), expand = FALSE) +
  scale_color_viridis(name = "Distance to\nlineage O", direction = -1) +
  xlab("Longitude") + ylab("Latitude")

# spatial interpolation of the above data using GAM
GAM <- mgcv::gam(MD.O ~ s(longitude, latitude), data = sample.lineages)
viz.GAM <- getViz(GAM)
GAM.O <- plot(viz.GAM, 1) +
  l_fitRaster() +
  l_fitContour(color ="grey30") + 
  l_points() +
  labs(title = NULL) +
  scale_fill_continuous(type = "viridis", direction = -1, name = "Distance to\nlineage O", na.value = "transparent") +
  scale_x_continuous(limits = c(-11, 32), expand = c(0, 0)) +
  scale_y_continuous(limits = c(34, 57), expand = c(0, 0)) +
  geom_path(data = map_data("world"), aes(x = long, y = lat, group = group),
            color ="black", inherit.aes = FALSE)
GAM.O

R Markdown ggplot2 Consensus From line 563 of raw/Apis-mellifera-wings-EU.rmd

# use samples from lineages M and C as unknown
sample.unknown <- subset(sample.lin.aligned, lineage == "M" | lineage == "C")

# convert 2D array into a 3D array
unknown.lin <- arrayspecs(sample.unknown[xy.Names], p, k)
dimnames(unknown.lin)[[3]] <- sample.unknown$sample

# calculate CVA scores for unknown samples
CV.list <- vector(mode = "list", length = nrow(sample.unknown))
for (r in 1:nrow(sample.unknown)) { 
  # Align unknown consensus with consensus from reference samples
  unknown.OPA <- procOPA(GPA$consensus, unknown.lin[,,r])
  unknown.aligned <- unknown.OPA$Bhat
  CV.row <- predict(region.cva, unknown.aligned)
  CV.list[[r]] <- CV.row
}
CV.tab <- do.call(rbind, CV.list)
# remove unwanted spaces in variable names otherwise use `CV 1`
colnames(CV.tab) <- gsub(" ", "", colnames(CV.tab)) 
CV.tab <- as.data.frame(CV.tab)

outlier.threshold <- 0.001
typprobs <- typprobClass(CV.tab, region.cva$CVscores, 
                         groups = as.factor(sample.region$region), 
                         outlier = outlier.threshold, sep = FALSE)
print(typprobs)

# probability of classification for all groups
P.tab <- typprobs$probs
rownames(P.tab) <- sample.unknown$sample
# for each sample find maximum probability
P.max <- apply(P.tab, 1, FUN = max)
# for each sample find region with larges probability
region.max <- colnames(P.tab)[apply(P.tab, 1, which.max)]
# samples with probability below 0.001 are considered as outlines
P.tab.max <- data.frame(country = sample.unknown$country, P.max, region.max)
head(P.tab.max, 2)

# frequencies of the countries in regions without detection of outliers
table(P.tab.max$country, P.tab.max$region.max)

P.tab.max$region.none <- ifelse(P.max < outlier.threshold, "none", region.max)
# frequencies of the countries in regions with detection of outliers
table(P.tab.max$country, P.tab.max$region.none)


# regions labels
region.cva.scores$region <- sample.region$region
region.cva.means <- aggregate(region.cva.scores[,names(region.cva.scores) != "region"], 
                  list(region.cva.scores$region), FUN = mean)
rownames(region.cva.means) <- region.cva.means$Group.1
region.cva.means <- region.cva.means[,names(region.cva.means) != "Group.1"]

ggplot(CV.tab, aes(x = CV1, y = CV2, shape = sample.unknown$subspecies, color = sample.unknown$subspecies)) +
  geom_point() +
  scale_shape_manual(name ="subspecies", values = c(0:5)) +
  scale_color_manual(name ="subspecies", values = rainbow(6)) +
  stat_ellipse() +
  stat_ellipse(data = subset(region.cva.scores, region == "ES-PT"), 
               mapping = aes(x = CV1, y = CV2), inherit.aes = FALSE) + 
  stat_ellipse(data = subset(region.cva.scores, region == "GR"), 
               mapping = aes(x = CV1, y = CV2), inherit.aes = FALSE) + 
  stat_ellipse(data = subset(region.cva.scores, region == "HR-SI"), 
               mapping = aes(x = CV1, y = CV2), inherit.aes = FALSE) + 
  stat_ellipse(data = subset(region.cva.scores, region == "PL"), 
               mapping = aes(x = CV1, y = CV2), inherit.aes = FALSE) + 
  stat_ellipse(data = subset(region.cva.scores, region == "RO-MD"), 
               mapping = aes(x = CV1, y = CV2), inherit.aes = FALSE) + 
  stat_ellipse(data = subset(region.cva.scores, region == "TR"), 
               mapping = aes(x = CV1, y = CV2), inherit.aes = FALSE) + 
  geom_label(region.cva.means, 
             mapping = aes(x = CV1, y = CV2, label = rownames(region.cva.means)), inherit.aes = FALSE)