Hardeep Kaur
University of Agriculture in Krakow, Poland
Sajad Ahmad Ganie
Sher-e-Kashmir University of Agricultural Sciences and Technology of
Srinagar, India
Adam
Tofilski
University of Agriculture in Krakow, Poland
# calculations
library(geomorph) # GPA
library(shapes) # OPA
library(MASS) # LDA
# plotting and visualization
library(ggplot2) # plots
ggplot2::theme_set(theme_light())
library(ggforce) # geom_ellipse
library(ggrepel) # geom_label_repel
library(rnaturalearth) # maps
library(raster) # raster
library(ggspatial) # annotation_scale
# input and output
library(readxl) # read XLSX
library(officer) # read DOCX
library(xml2) # read XML
library(XML) # write XML
options(digits=4) # number of digits to display
In order to avoid mistakes it is safer to use defined variables instead of numbers, for example, columns names instead of indexes
p <- 19 # number of landmarks
k <- 2 # number of dimensions, in this case 2 for coordinates (x, y)
# create coordinates names used by IdentiFly
xyNames <- c("x1", "y1")
for (i in 2:p) {
xyNames <- c(xyNames, paste0("x", i))
xyNames <- c(xyNames, paste0("y", i))
}
xyNames
## [1] "x1" "y1" "x2" "y2" "x3" "y3" "x4" "y4" "x5" "y5" "x6" "y6"
## [13] "x7" "y7" "x8" "y8" "x9" "y9" "x10" "y10" "x11" "y11" "x12" "y12"
## [25] "x13" "y13" "x14" "y14" "x15" "y15" "x16" "y16" "x17" "y17" "x18" "y18"
## [37] "x19" "y19"
# define which landmarks are connected by lines in wireframe graph
link.x <- c(1, 1, 2, 2, 3, 3, 4, 4, 5, 6, 7, 7, 7, 8, 9, 9, 10, 11, 11, 12, 13, 14,
15, 16, 17)
link.y <- c(2, 3, 4, 5, 6, 19, 6, 10, 12, 8, 8, 14, 19, 9, 10, 15, 11, 12, 16, 13,
18, 15, 16, 17, 18)
links.apis <- cbind(link.x, link.y)
# define Bustamente links
link.x <- c(1, 1, 2, 3, 3, 4, 4, 5, 6, 7, 7, 8, 9, 9, 10, 11, 11, 12, 13, 13, 14,
15, 15, 16, 17)
link.y <- c(2, 6, 3, 4, 8, 5, 10, 12, 7, 8, 16, 9, 10, 14, 11, 12, 13, 19, 14, 15,
17, 18, 19, 17, 18)
links.Bustamente <- cbind(link.x, link.y)
# GP1 <- gridPar(pt.bg = 'blue', link.col = 'blue', pt.size = 1, tar.pt.bg
# ='red', tar.link.col ='red')
# verify if vector is a sequence c(1, 2, 3, ...)
is.123 <- function(x) {
x[1] == 1 & all(diff(x) == rep(1, length(x) - 1))
}
# reorder landmarks using column names
reorder <- function(in.data, new.order) {
# is new.order a correct sequence
sorted <- new.order[order(new.order)]
if (!is.123(sorted))
return()
# are all column names present?
if (!all(xyNames %in% colnames(in.data)))
return()
# are column names unique?
if (any(duplicated(colnames(in.data))))
return()
xy <- paste0("x", new.order[1])
xy <- c(xy, paste0("y", new.order[1]))
reordered <- matrix(in.data[, xy], ncol = 2)
for (i in 2:length(new.order)) {
xy <- paste0("x", new.order[i])
xy <- c(xy, paste0("y", new.order[i]))
reordered <- cbind(reordered, matrix(in.data[, xy], ncol = 2))
}
colnames(reordered) <- xyNames
rownames(reordered) <- rownames(in.data)
return(reordered)
}
# extract matrix from XML
XML2matrix <- function(inXML) {
# extract header
header <- xml_find_first(inXML, ".//header")
header.txt <- xml_text(header)
header.txt <- trimws(header.txt)
header.val <- strsplit(header.txt, "\t")
# extract rows
mat.rows.XML <- xml_find_all(inXML, ".//vector")
row.list <- list()
row.names <- vector()
for (i in 1:length(mat.rows.XML)) {
row.txt <- xml_text(mat.rows.XML[[i]])
row.txt <- trimws(row.txt)
row.val <- strsplit(row.txt, "\t")
row.val <- as.numeric(row.val[[1]])
row.list[[i]] <- row.val
# extract row attribute
row.atr <- xml_attr(mat.rows.XML[[i]], "id")
row.atr <- trimws(row.atr)
row.names <- c(row.names, row.atr)
}
mat <- do.call(rbind, row.list)
colnames(mat) <- header.val[[1]]
rownames(mat) <- row.names
return(mat)
}
# convert vector to XML
vector2XML <- function(vec.in, id, XML.out) {
vec.in <- as.character(vec.in)
vec.in <- paste(vec.in, collapse = "\t")
vec.in <- c("\t", vec.in, "\t") # pad with tabulators for spreadsheet
XML.out$addTag("vector", vec.in, attrs = c(id = id))
}
# convert matrix to XML
matrix2XML <- function(mat.in, id, XML.out) {
XML.out$addTag("matrix", close = FALSE, attrs = c(id = id))
header <- as.character(colnames(mat.in))
header <- paste(header, collapse = "\t")
header <- c("\t", header, "\t")
XML.out$addTag("header", header)
for (i in 1:nrow(mat.in)) {
vector2XML(mat.in[i, ], rownames(mat.in)[i], XML.out)
}
XML.out$closeTag() # matrix
}
# create classification data in XML format
GM.data2XML <- function(data, grouping) {
# Error detection
if ((ncol(data)%%2) != 0)
stop("number of columns should be even")
p <- ncol(data)/2
k <- 2
# create coordinates names used by IdentiFly
xyNames <- c("x1", "y1")
for (i in 2:p) {
xyNames <- c(xyNames, paste0("x", i))
xyNames <- c(xyNames, paste0("y", i))
}
## Alignment of landmark configurations
# Convert 2D array into a 3D array
data.3D <- arrayspecs(data, p, k)
# Align the coordinates using Generalized Procrustes Analysis
GPA <- gpagen(data.3D, print.progress = FALSE)
# Convert 3D array into a 2D array - opposite to arrayspecs
data <- two.d.array(GPA$coords)
colnames(data) <- xyNames
XML <- xmlOutputDOM("identifly", attrs = c(version = "1.8"))
# XML$addTag('prototype', close=TRUE, attrs =
# c(file='apis-worker-prototype.dw.png'))
XML$addTag("lda", close = FALSE)
reference <- colMeans(data)
reference <- as.matrix(t(reference))
colnames(reference) <- xyNames
rownames(reference) <- c("reference")
matrix2XML(reference, "reference", XML)
means <- aggregate(data, by = list(grouping), FUN = mean)
rownames(means) <- means$Group.1
means <- subset(means, select = -c(Group.1))
colnames(means) <- xyNames
matrix2XML(means, "means", XML)
PCA <- prcomp(data[, xyNames], center = TRUE, scale. = FALSE)
data <- as.matrix(data) %*% PCA$rotation
data <- data[, 1:(2 * p - 4)]
n <- length(unique(grouping)) # number of groups
LDA <- lda(data, grouping, prior = rep(1/n, n))
XML$addTag("covariances", close = FALSE)
scores <- as.matrix(data) %*% LDA$scaling
scores <- as.data.frame(scores)
covariances <- lapply(unique(grouping), function(x) cov(scores[grouping == x,
]))
groups <- rownames(means)
for (i in 1:length(groups)) {
matrix2XML(covariances[[i]], groups[i], XML)
}
XML$closeTag() # covariances
# coefficients = t(LDA$scaling)
coefficients <- t(PCA$rotation[, 1:(2 * p - 4)] %*% LDA$scaling)
colnames(coefficients) <- xyNames
matrix2XML(coefficients, "coefficients", XML)
XML$closeTag() # lda
return(XML)
}
# read classification data from XML file
XML2GM.id.data <- function(path) {
# add error checking
XML <- read_xml(path)
# extract lda
lda.XML <- xml_find_first(XML, ".//lda")
matrices <- xml_find_all(lda.XML, ".//matrix")
# extract reference
reference.XML <- matrices[xml_attr(matrices, "id") == "reference"]
reference.mat <- XML2matrix(reference.XML)
# extract means
means.XML <- matrices[xml_attr(matrices, "id") == "means"]
means.mat <- XML2matrix(means.XML)
# extract covariances
row.names <- rownames(means.mat)
# create list of empty dataframes
covariances <- list(rep(matrix(), length(row.names)))
for (i in 1:length(row.names)) {
cov.XML <- matrices[xml_attr(matrices, "id") == row.names[i]]
cov.mat <- XML2matrix(cov.XML)
covariances[[i]] <- cov.mat
names(covariances)[i] <- row.names[i]
}
# extract coefficients
coefficients.XML <- matrices[xml_attr(matrices, "id") == "coefficients"]
coefficients.mat <- XML2matrix(coefficients.XML)
return(list(reference = reference.mat, means = means.mat, covariances = covariances,
coefficients = coefficients.mat))
}
# classify unknown samples using XML data
XML2id <- function(path, data, average = TRUE) {
id.data <- XML2GM.id.data(path)
# calculate means LD by matrix multiplication
means.LD <- id.data$means %*% t(id.data$coefficients)
if (ncol(data) != ncol(id.data$reference))
stop("number of columns differes between reference and data")
p <- ncol(data)/2
k <- 2
# Convert from 2D to 3D array
data.3D <- arrayspecs(data, p, k)
# convert reference from 1D to 2D
reference <- matrix(id.data$reference, ncol = 2, byrow = TRUE)
if (average) {
# Align the coordinates using Generalized Procrustes Analysis
GPA <- gpagen(data.3D, print.progress = FALSE)
# Consensus configuration of the unknown sample
unknown.consensus <- GPA$consensus
# Align unknown consensus with consensus from reference samples
unknown.OPA <- procOPA(reference, unknown.consensus)
unknown.aligned <- unknown.OPA$Bhat
# convert aligned data from 2D to 1D
unknown.aligned <- matrix(t(unknown.aligned), nrow = 1)
# calculate LD scores
unknown.LD.sores <- unknown.aligned %*% t(id.data$coefficients)
id <- classifyVecLD(unknown.LD.sores, means.LD, id.data$covariances)
id <- id$class
unknown.LD.sores <- as.data.frame(unknown.LD.sores)
plot <- cov.plot(means.LD, id.data$covariances) + geom_point(unknown.LD.sores,
mapping = aes(x = LD1, y = LD2, colour = "zzz")) + geom_label_repel(unknown.LD.sores,
mapping = aes(x = LD1, y = LD2, colour = "zzz", label = "unknown"), nudge_x = 0.75,
nudge_y = 0) + scale_color_manual(values = append(rainbow(nrow(id.data$means)),
"black"))
} else {
# calculate lda scores for all wings
LD.list <- vector(mode = "list", length = nrow(data))
for (r in 1:nrow(data)) {
# Align unknown consensus with consensus from reference samples
unknown.OPA <- procOPA(reference, data.3D[, , r])
unknown.aligned <- matrix(t(unknown.OPA$Bhat), nrow = 1) # convert from 2D to 1D
LD.row <- unknown.aligned %*% t(id.data$coefficients)
LD.list[[r]] <- LD.row
}
LD.tab <- do.call(rbind, LD.list)
rownames(LD.tab) <- rownames(data)
LD.tab <- as.data.frame(LD.tab)
id <- classifyMatLD(LD.tab, means.LD, id.data$covariances)
plot <- cov.plot(means.LD, id.data$covariances) + geom_point(LD.tab, mapping = aes(x = LD1,
y = LD2, colour = "zzz")) + scale_color_manual(values = append(rainbow(nrow(id.data$means)),
"black"))
}
return(list(id = id, plot = plot))
}
# calculate Mahalnanobis distances for data in one vector
classifyVecLD <- function(unknown.LD, means, covariances) {
groups <- rownames(means)
df <- length(groups) - 1
result.list <- list() # empty list for results
max.P <- 0
max.group <- ""
for (i in 1:length(groups)) {
MD <- mahalanobis(unknown.LD, means[i, ], as.matrix(covariances[[i]]))
results <- c(MD)
P <- pchisq(MD, df = df, lower.tail = FALSE)
results <- c(results, P)
if (P > max.P) {
max.P <- P
max.group <- groups[i]
}
result.list[[i]] <- results
}
out.summary <- paste("The sample was classified as", max.group, "with probability",
max.P)
out.max <- data.frame(group = max.group, P = max.P)
out.class <- do.call(rbind, result.list)
colnames(out.class) <- c("MD2", "P")
rownames(out.class) <- rownames(means)
return(list(summary = out.summary, max.group = out.max, class = out.class))
}
# calculate Mahalnanobis distances for each row in a matrix
classifyMatLD <- function(unknown.LD, means, covariances) {
result.list <- list() # empty list for results
for (i in 1:nrow(unknown.LD)) {
out.row <- classifyVecLD(unknown.LD[i, ], means, covariances)
result.list[[i]] <- out.row$max
}
id.results <- do.call(rbind, result.list)
rownames(id.results) <- rownames(unknown.LD)
return(id.results)
}
# plot ellipses for first two LD using data from covariance matrices
cov.plot <- function(means, covariances) {
chi <- sqrt(qchisq(0.95, 2)) # Chi-Square value for probability 95%
groups <- rownames(means)
ellipse.list <- list()
for (i in 1:length(groups)) {
x0 <- means[i, 1]
y0 <- means[i, 2]
cov.mat <- covariances[[i]]
cov.mat <- cov.mat[1:2, 1:2]
cov.eigen <- eigen(cov.mat)
a <- chi * sqrt(cov.eigen$values[1])
b <- chi * sqrt(cov.eigen$values[2])
angle <- atan2(cov.eigen$vectors[2, 1], cov.eigen$vectors[1, 1])
ellipse <- c(x0, y0, a, b, angle)
ellipse.list[[i]] <- ellipse
}
ellipses <- do.call(rbind, ellipse.list)
colnames(ellipses) <- c("LD1", "LD2", "a", "b", "angle")
rownames(ellipses) <- groups
ellipses <- as.data.frame(ellipses)
ggplot(ellipses, aes(x = LD1, y = LD2, color = rownames(ellipses))) + geom_point() +
coord_fixed() + geom_ellipse(ellipses, mapping = aes(x0 = LD1, y0 = LD2,
a = a, b = b, angle = angle)) + geom_label_repel(aes(label = rownames(ellipses),
size = NULL), nudge_y = 0.75) + theme(legend.position = "none")
}
The classification is based on tabular data obtained from Bustamante et al. (2021)
# download ESM1 from Bustamente et al. 2021 and save as
# Bustamente2021-ESM1.docx
download.file("https://static-content.springer.com/esm/art%3A10.1007%2Fs13592-021-00857-7/MediaObjects/13592_2021_857_MOESM1_ESM.docx",
destfile = "Bustamente2021-ESM1.docx", method = "curl")
# download ESM2 from Bustamente et al. 2021 and save as
# Bustamente2021-ESM2.xlsx
download.file("https://static-content.springer.com/esm/art%3A10.1007%2Fs13592-021-00857-7/MediaObjects/13592_2021_857_MOESM2_ESM.xlsx",
destfile = "Bustamente2021-ESM2.xlsx", method = "curl")
# download ESM3 from Bustamente et al. 2021 and save as
# Bustamente2021-ESM3.xlsx
download.file("https://static-content.springer.com/esm/art%3A10.1007%2Fs13592-021-00857-7/MediaObjects/13592_2021_857_MOESM3_ESM.xlsx",
destfile = "Bustamente2021-ESM3.xlsx", method = "curl")
# extract reference configuration from sheet 'Aligned Sample' in ESM3
Bustamente2021.reference <- read_excel("Bustamente2021-ESM3.xlsx", sheet = "Aligned Sample",
range = "A1:AL2")
Bustamente2021.reference <- as.data.frame(Bustamente2021.reference)
colnames(Bustamente2021.reference) <- xyNames
rownames(Bustamente2021.reference) <- c("reference")
reference.3D <- arrayspecs(Bustamente2021.reference, p, k)
# plot landmarks from Bustamante et al. (2021) the landmarks differ from Fig. 1
# (Bustamante et al. 2021) where they were flipped horizontally
plotAllSpecimens(reference.3D, links = links.Bustamente, label = TRUE, plot.param = list(pt.bg = "black",
pt.cex = 0.5, mean.bg = "red", mean.cex = 1, link.col = "red", txt.pos = 3, txt.cex = 1))
# extract coefficients from sheet 'LD's' in ESM3
Bustamente2021.coefficients <- read_excel("Bustamente2021-ESM3.xlsx", sheet = "LD's",
range = "A1:I39")
Bustamente2021.coefficients <- as.data.frame(Bustamente2021.coefficients)
Bustamente2021.coefficients <- Bustamente2021.coefficients[, -c(1)]
Bustamente2021.coefficients <- t(Bustamente2021.coefficients)
colnames(Bustamente2021.coefficients) <- xyNames
# # Supplementary Table S2 in Bustamente et al. 2021 contains mean landmark
# coordinates for all species Bustamente2021.means =
# read.csv('Bustamente2021-tabS2.csv') rownames(Bustamente2021.means) =
# Bustamente2021.means$species Bustamente2021.means =
# subset(Bustamente2021.means, select = - c(species))
# extract class means from ESM1; the table is in 4 parts under indexes: 5, 7,
# 10, 13
ESM1 <- read_docx("Bustamente2021-ESM1.docx")
content <- docx_summary(ESM1)
fragment <- content[content$doc_index == 5, ]
fragment <- data.frame(text = fragment$text, row_id = fragment$row_id, cell_id = fragment$cell_id)
tab5 <- reshape(fragment, idvar = "row_id", timevar = "cell_id", direction = "wide")
tab5 <- tab5[-c(1:2), -c(1)]
fragment <- content[content$doc_index == 7, ]
fragment <- data.frame(text = fragment$text, row_id = fragment$row_id, cell_id = fragment$cell_id)
tab7 <- reshape(fragment, idvar = "row_id", timevar = "cell_id", direction = "wide")
tab7 <- tab7[-c(1), -c(1)]
fragment <- content[content$doc_index == 10, ]
fragment <- data.frame(text = fragment$text, row_id = fragment$row_id, cell_id = fragment$cell_id)
tab10 <- reshape(fragment, idvar = "row_id", timevar = "cell_id", direction = "wide")
tab10 <- tab10[-c(1), -c(1)]
fragment <- content[content$doc_index == 13, ]
fragment <- data.frame(text = fragment$text, row_id = fragment$row_id, cell_id = fragment$cell_id)
tab13 <- reshape(fragment, idvar = "row_id", timevar = "cell_id", direction = "wide")
tab13 <- tab13[-c(1), -c(1)]
rownames(tab5) <- tab5[, 1]
tab5 <- tab5[, -c(1)]
Bustamente2021.means <- cbind(tab5, tab7[, 2:11], tab10[, 2:11], tab13[, 2:9])
colnames(Bustamente2021.means) <- xyNames
Bustamente2021.means[xyNames] <- sapply(Bustamente2021.means[xyNames], as.numeric)
# extract covariances matrices from ESM2; each matrix is in separate sheet
groups <- rownames(Bustamente2021.means)
# create list of empty dataframes
Bustamente2021.covariances <- c(rep(data.frame(), length(groups)))
for (i in 1:length(groups)) {
sheet.name <- paste0(groups[i], " ginv")
Bustamente2021.covariances[[i]] <- read_excel("Bustamente2021-ESM2.xlsx", sheet = sheet.name,
range = "B1:I9")
names(Bustamente2021.covariances)[i] <- groups[i]
}
# calculate linear discriminant scores by matrix multiplication
Bustamente2021.means.LD <- as.matrix(Bustamente2021.means) %*% t(as.matrix(Bustamente2021.coefficients))
cov.plot(Bustamente2021.means.LD, Bustamente2021.covariances)
# Create XML file
XML <- xmlOutputDOM("identifly", attrs = c(version = "1.8"))
XML$addTag("prototype", close = TRUE, attrs = c(file = "Bustamente2021-prototype.dw.png"))
XML$addTag("lda", close = FALSE)
reference <- Bustamente2021.reference
matrix2XML(reference, "reference", XML)
means <- Bustamente2021.means
matrix2XML(means, "means", XML)
XML$addTag("covariances", close = FALSE)
for (i in 1:length(Bustamente2021.covariances)) {
matrix2XML(Bustamente2021.covariances[[i]], names(Bustamente2021.covariances)[i],
XML)
}
XML$closeTag() # covariances
matrix2XML(Bustamente2021.coefficients, "coefficients", XML)
XML$closeTag() # lda
# add prototype for IdentiFly software
XML$addTag("prototype", close = TRUE, attrs = c(file = "Bustamente2021-prototype.dw.png"))
saveXML(XML$value(), file = "apis-species.dw.xml", prefix = "<?xml version=\"1.0\" encoding=\"UTF-8\"?>\n")
## [1] "apis-species.dw.xml"
The classification is based on raw data obtained from Nawrocka et al. (2018a) and Nawrocka et al. (2018b)
wings.lin <- read.csv("https://zenodo.org/record/7567336/files/Nawrocka_et_al2018.csv")
# extract sample classifier
wings.lin$sample = substr(wings.lin$file, 1, 10)
# average data within samples
wings.lin <- aggregate(wings.lin[xyNames],
by = list(wings.lin$sample),
FUN = mean)
rownames(wings.lin) <- wings.lin$Group.1
wings.lin <- subset(wings.lin, select = -c(Group.1))
# extract lineage classifier
wings.lin$lineage = substr(rownames(wings.lin), 1, 1)
XML = GM.data2XML(wings.lin[xyNames], # columns with coordinates
wings.lin$lineage) # 1 column with groups
# add prototype for IdentiFly software
XML$addTag("prototype", close=TRUE, attrs = c(file="apis-worker-prototype.dw.png"))
saveXML(XML$value(), file="apis-mellifera-lineage.dw.xml",
prefix = "<?xml version=\"1.0\" encoding=\"UTF-8\"?>\n")
## [1] "apis-mellifera-lineage.dw.xml"
# Verify the XML file
id.data = XML2GM.id.data("apis-mellifera-lineage.dw.xml")
means.LD = id.data$means %*% t(id.data$coefficients)
cov.plot(means.LD, id.data$covariances)
The classification is based on raw data obtained from Oleksa et al. (2022), Oleksa et al. (2023) and Nawrocka et al. (2018b)
wings.C = read.csv("https://zenodo.org/record/7244070/files/GR-raw-coordinates.csv")
wings.C = rbind(wings.C, read.csv("https://zenodo.org/record/7244070/files/HR-raw-coordinates.csv"))
wings.C = rbind(wings.C, read.csv("https://zenodo.org/record/7244070/files/MD-raw-coordinates.csv"))
wings.C = rbind(wings.C, read.csv("https://zenodo.org/record/7244070/files/RO-raw-coordinates.csv"))
wings.C = rbind(wings.C, read.csv("https://zenodo.org/record/7244070/files/SI-raw-coordinates.csv"))
wings.C = rbind(wings.C, read.csv("https://zenodo.org/record/7244070/files/TR-raw-coordinates.csv"))
# extract classifier from file name
wings.C$sample = substr(wings.C$file, 1, 7)
# average data within samples
wings.C <- aggregate(wings.C[xyNames],
by = list(wings.C$sample),
FUN = mean)
rownames(wings.C) <- wings.C$Group.1
wings.C <- subset(wings.C, select = -c(Group.1))
# add Italy
data.lin <- read.csv("https://zenodo.org/record/7567336/files/Nawrocka_et_al2018-geo-data.csv")
IT = subset(wings.lin, data.lin$country == "Italy")
IT <- subset(IT, select = -c(lineage))
rownames(IT) <- gsub("C-lig", "IT", rownames(IT))
wings.C = rbind(wings.C, IT)
# extract region classifier
wings.C$country = substr(rownames(wings.C), 1, 2)
wings.C$region = ifelse(wings.C$country == "HR"
| wings.C$country == "SI"
, "HR-SI", wings.C$country)
wings.C$region = ifelse(wings.C$country == "RO"
| wings.C$country == "MD"
, "RO-MD", wings.C$region)
XML = GM.data2XML(wings.C[xyNames], # columns with coordinates
wings.C$region) # 1 column with groups
# add prototype for IdentiFly software
XML$addTag("prototype", close=TRUE, attrs = c(file="apis-worker-prototype.dw.png"))
saveXML(XML$value(), file="apis-mellifera-lineage-c-regions.dw.xml",
prefix = "<?xml version=\"1.0\" encoding=\"UTF-8\"?>\n")
## [1] "apis-mellifera-lineage-c-regions.dw.xml"
# Verify the XML file
id.data = XML2GM.id.data("apis-mellifera-lineage-c-regions.dw.xml")
means.LD = id.data$means %*% t(id.data$coefficients)
cov.plot(means.LD, id.data$covariances)
Read raw coordinates of 19 landmarks from Zenodo (Kaur et al., 2023)
wings <- read.csv("https://zenodo.org/record/8071014/files/IN-raw-coordinates.csv")
# extract individual and sample classifier from file name
wings$ind <- substr(wings$file, 1, 14)
wings$sample <- substr(wings$file, 1, 7)
head(wings, 2)
## file x1 y1 x2 y2 x3 y3 x4 y4 x5 y5 x6 y6 x7
## 1 IN-0001-000243-L.dw.png 195 169 213 166 268 246 261 192 267 108 337 250 386
## 2 IN-0001-000243-R.dw.png 180 251 193 243 272 290 231 242 214 166 333 266 392
## y7 x8 y8 x9 y9 x10 y10 x11 y11 x12 y12 x13 y13 x14 y14 x15 y15 x16 y16
## 1 282 369 258 406 224 382 204 422 171 427 131 438 99 451 270 479 235 559 194
## 2 272 367 258 386 210 357 204 379 156 364 120 361 85 448 236 458 189 512 119
## x17 y17 x18 y18 x19 y19 ind sample
## 1 593 191 601 161 82 244 IN-0001-000243 IN-0001
## 2 542 101 539 72 104 369 IN-0001-000243 IN-0001
tmp <- unique(wings$ind)
# Read geographic coordinates
geo.data <- read.csv("https://zenodo.org/record/8071014/files/IN-data.csv")
sample.geo.data <- aggregate(cbind(geo.data$latitude, geo.data$longitude), by = list(geo.data$sample),
FUN = mean)
sample.geo.data <- reshape::rename(sample.geo.data, c(Group.1 = "sample", V1 = "latitude",
V2 = "longitude"))
# Read elevation data In order to download the TIF file uncomment the two lines
# below
# download.file('https://geodata.ucdavis.edu/climate/worldclim/2_1/base/wc2.1_2.5m_elev.zip',
# 'wc2.1_2.5m_elev.zip') unzip('wc2.1_2.5m_elev.zip')
elev <- raster("E:/WorldClim/wc2.1_2.5m_elev.tif")
elev.color <- colorRampPalette(c("green", "lightgreen", "yellow", "orange", "red",
"darkred"))
x.min <- 73
x.max <- 77
y.min <- 32
y.max <- 35
e <- extent(x.min - 1, x.max + 1, y.min - 1, y.max + 1)
elev <- crop(elev, e)
elev.df <- data.frame(rasterToPoints(elev))
colnames(elev.df) <- c("longitude", "latitude", "altitude")
world <- ne_countries(scale = "medium", returnclass = "sf")
ggplot(data = world) + geom_raster(data = elev.df, aes(longitude, latitude, fill = altitude)) +
scale_fill_gradientn(colours = elev.color(100)) + geom_sf(fill = NA) + geom_point(data = sample.geo.data,
aes(x = longitude, y = latitude), size = 2) + coord_sf(xlim = c(x.min, x.max),
ylim = c(y.min, y.max)) + annotation_scale(location = "bl", width_hint = 0.2)
ggplot(data = world) + geom_sf() + annotate("rect", xmin = x.min, xmax = x.max, ymin = y.min,
ymax = y.max, alpha = 0, color = "red") + coord_sf(xlim = c(65, 100), ylim = c(5,
40))
# Convert from 2D array to 3D array
wings.coords <- arrayspecs(wings[xyNames], p, k)
dimnames(wings.coords)[[3]] <- wings$file
# Align the coordinates using Generalized Procrustes Analysis
GPA.IN <- gpagen(wings.coords, print.progress = FALSE)
consensus.IN <- GPA.IN$consensus
# plot landmarks after alignment
plotAllSpecimens(GPA.IN$coords, links = links.apis, label = TRUE, plot.param = list(pt.bg = "black",
pt.cex = 0.5, mean.bg = "red", mean.cex = 1, link.col = "red", txt.pos = 3, txt.cex = 1))
# Convert from 3D array to 2D array
wings.aligned.IN <- two.d.array(GPA.IN$coords)
colnames(wings.aligned.IN) <- xyNames
# Average aligned coordinates within individuals
aligned.ind.IN <- aggregate(wings.aligned.IN, by = list(wings$ind), FUN = mean)
rownames(aligned.ind.IN) <- aligned.ind.IN$Group.1
aligned.ind.IN <- subset(aligned.ind.IN, select = -c(Group.1))
# Average aligned coordinates within samples
sample.aligned <- aggregate(wings.aligned.IN, by = list(wings$sample), FUN = mean)
rownames(sample.aligned) <- sample.aligned$Group.1
sample.aligned <- subset(sample.aligned, select = -c(Group.1))
# PCA based on individuals
ind.pca <- prcomp(aligned.ind.IN[, xyNames], center = TRUE, scale. = TRUE)
ind.pca.scores <- as.data.frame(ind.pca$x)
# create plot labels
variance.tab <- summary(ind.pca)$importance
variance <- variance.tab["Proportion of Variance", "PC1"]
variance <- round(100 * variance, 1)
label.x <- paste0("PC1 (", variance, "%)")
variance <- variance.tab["Proportion of Variance", "PC2"]
variance <- round(100 * variance, 1)
label.y <- paste0("PC2 (", variance, "%)")
ggplot(ind.pca.scores, aes(x = PC1, y = PC2)) + geom_point() + scale_shape_manual(name = "lineage",
values = c(0:2, 15:25)) + scale_color_manual(name = "lineage", values = rainbow(13)) +
stat_ellipse() + # geom_label_repel(aes(label = rownames(sample.aligned))) + stat_ellipse()
stat_ellipse() + # geom_label_repel(aes(label = rownames(sample.aligned))) + +
stat_ellipse() + # geom_label_repel(aes(label = rownames(sample.aligned))) + #
stat_ellipse() + # geom_label_repel(aes(label = rownames(sample.aligned))) + geom_label_repel(aes(label
stat_ellipse() + # geom_label_repel(aes(label = rownames(sample.aligned))) + =
stat_ellipse() + # geom_label_repel(aes(label = rownames(sample.aligned))) + rownames(sample.aligned)))
stat_ellipse() + # geom_label_repel(aes(label = rownames(sample.aligned))) + +
xlab(label.x) + ylab(label.y)
# PCA based on samples
sample.pca <- prcomp(sample.aligned[, xyNames], center = TRUE, scale. = TRUE)
sample.pca.scores <- as.data.frame(sample.pca$x)
# create plot labels
variance.tab <- summary(sample.pca)$importance
variance <- variance.tab["Proportion of Variance", "PC1"]
variance <- round(100 * variance, 1)
label.x <- paste0("PC1 (", variance, "%)")
variance <- variance.tab["Proportion of Variance", "PC2"]
variance <- round(100 * variance, 1)
label.y <- paste0("PC2 (", variance, "%)")
ggplot(sample.pca.scores, aes(x = PC1, y = PC2)) + geom_point() + scale_shape_manual(name = "lineage",
values = c(0:2, 15:25)) + scale_color_manual(name = "lineage", values = rainbow(13)) +
stat_ellipse() + # geom_label_repel(aes(label = rownames(sample.aligned))) + stat_ellipse()
stat_ellipse() + # geom_label_repel(aes(label = rownames(sample.aligned))) + +
stat_ellipse() + # geom_label_repel(aes(label = rownames(sample.aligned))) + #
stat_ellipse() + # geom_label_repel(aes(label = rownames(sample.aligned))) + geom_label_repel(aes(label
stat_ellipse() + # geom_label_repel(aes(label = rownames(sample.aligned))) + =
stat_ellipse() + # geom_label_repel(aes(label = rownames(sample.aligned))) + rownames(sample.aligned)))
stat_ellipse() + # geom_label_repel(aes(label = rownames(sample.aligned))) + +
xlab(label.x) + ylab(label.y)
# configuration of points differ between Bustamente et al. 2018 and Nawrocka et
# al. 2018 reorder input data to be compatible with classification data use
# function defined above
order.Bustamente <- c(18, 17, 16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 4, 3, 5, 2,
1, 19)
sample.unknown <- reorder(as.matrix(aligned.ind.IN), order.Bustamente)
id.data <- XML2id("apis-species.dw.xml", sample.unknown, average = FALSE)
id.data$plot
out.class <- as.data.frame(id.data$id)
# All workers were classified as Apis mellifera
table(out.class$group)
##
## mellifera
## 175
sample.unknown <- reorder(as.matrix(sample.aligned), order.Bustamente)
id.data <- XML2id("apis-species.dw.xml", sample.unknown, average = FALSE)
id.data$plot
id.data$id
## group P
## IN-0001 mellifera 0.2583
## IN-0002 mellifera 0.3717
## IN-0003 mellifera 0.3898
## IN-0004 mellifera 0.6165
## IN-0005 mellifera 0.5427
## IN-0006 mellifera 0.1350
## IN-0007 mellifera 0.2849
## IN-0008 mellifera 0.3883
## IN-0009 mellifera 0.2527
## IN-0010 mellifera 0.2531
id.data <- XML2id("apis-mellifera-lineage.dw.xml", sample.aligned, average = TRUE)
id.data$plot
id.data$id
## MD2 P
## A 38.65 2.057e-08
## C 12.91 4.837e-03
## M 128.67 1.046e-27
## O 69.96 4.355e-15
id.data <- XML2id("apis-mellifera-lineage.dw.xml", sample.aligned, average = FALSE)
id.data$plot
id.data$id
## group P
## IN-0001 C 3.820e-03
## IN-0002 C 1.499e-02
## IN-0003 A 4.833e-05
## IN-0004 C 4.213e-01
## IN-0005 C 3.241e-03
## IN-0006 C 1.032e-01
## IN-0007 C 4.264e-04
## IN-0008 C 2.637e-02
## IN-0009 C 1.351e-05
## IN-0010 C 2.443e-05
id.data <- XML2id("apis-mellifera-lineage-c-regions.dw.xml", sample.aligned, average = TRUE)
id.data$plot
id.data$id
## MD2 P
## GR 22.743 1.425e-04
## HR-SI 8.691 6.930e-02
## IT 54.513 4.109e-11
## RO-MD 12.432 1.441e-02
## TR 69.199 3.350e-14
id.data <- XML2id("apis-mellifera-lineage-c-regions.dw.xml", sample.aligned, average = FALSE)
id.data$plot
id.data$id
## group P
## IN-0001 RO-MD 0.038065
## IN-0002 HR-SI 0.124295
## IN-0003 GR 0.020656
## IN-0004 HR-SI 0.499434
## IN-0005 HR-SI 0.086676
## IN-0006 HR-SI 0.006802
## IN-0007 HR-SI 0.039837
## IN-0008 HR-SI 0.019549
## IN-0009 HR-SI 0.026855
## IN-0010 RO-MD 0.020600
Bustamante, T., Fuchs, S., Grünewald, B., & Ellis, J. D. (2021). A geometric morphometric method and web application for identifying honey bee species (Apis spp.) using only forewings. Apidologie, 52(3), 697-706. https://doi.org/10.1007/s13592-021-00857-7
Kaur H., Ganie S. A., & Tofilski A. (2023). Fore wings of honey bee (Apis mellifera) from Jammu and Kashmir, India [Data set]. Zenodo. https://doi.org/10.5281/zenodo.8071014
Nawrocka, A., Kandemir, İ., Fuchs, S., & Tofilski, A. (2018a). Computer software for identification of honey bee subspecies and evolutionary lineages. Apidologie, 49(2), 172-184. https://doi.org/10.1007/s13592-017-0538-y
Nawrocka, A., Kandemir, İ., Fuchs, S., & Tofiilski, A. (2018b). Dataset: Computer software for identification of honey bee subspecies and evolutionary lineages. Apidologie, 49, 172–184. https://doi.org/10.5281/zenodo.7567336
Oleksa, A., Căuia, E., Siceanu, A., Puškadija, Z., Kovačić, M., Pinto, M.A., Rodrigues, P.J., Hatjina, F., Charistos, L., Bouga, M., Prešern, J., Kandemir, I., Rašić, S., Kusza, S., & Tofilski, A. (2022). Collection of wing images for conservation of honey bees (Apis mellifera) biodiversity in Europe [Data set]. Zenodo. https://doi.org/10.5281/zenodo.7244070
Oleksa, A., Căuia, E., Siceanu, A., Puškadija, Z., Kovačić, M., Pinto, M. A., Rodrigues, P. J., Hatjina, F., Charistos, L., Bouga, M., Prešern, J., Kandemir, İ., Rašić, S., Kusza, S., Tofilski, A. (2023). Honey bee (Apis mellifera) wing images: a tool for identification and conservation. GigaScience, 12, giad019. https://doi.org/10.1093/gigascience/giad019
sessionInfo()
## R version 4.0.3 (2020-10-10)
## Platform: x86_64-w64-mingw32/x64 (64-bit)
## Running under: Windows 10 x64 (build 19043)
##
## Matrix products: default
##
## locale:
## [1] LC_COLLATE=Polish_Poland.1250 LC_CTYPE=Polish_Poland.1250
## [3] LC_MONETARY=Polish_Poland.1250 LC_NUMERIC=C
## [5] LC_TIME=Polish_Poland.1250
##
## attached base packages:
## [1] stats graphics grDevices utils datasets methods base
##
## other attached packages:
## [1] XML_3.99-0.9 xml2_1.3.3 officer_0.6.2
## [4] readxl_1.4.0 ggspatial_1.1.6 raster_3.5-21
## [7] sp_1.5-0 rnaturalearth_0.1.0 ggrepel_0.9.1
## [10] ggforce_0.3.3 ggplot2_3.3.6 MASS_7.3-58.1
## [13] shapes_1.2.6 geomorph_4.0.4 Matrix_1.4-1
## [16] rgl_0.109.6 RRPP_1.3.0
##
## loaded via a namespace (and not attached):
## [1] nlme_3.1-149 sf_1.0-7 tools_4.0.3
## [4] bslib_0.4.1 utf8_1.2.2 rgdal_1.5-32
## [7] R6_2.5.1 KernSmooth_2.23-17 DBI_1.1.3
## [10] colorspace_2.0-3 withr_2.5.0 rnaturalearthdata_0.1.0
## [13] tidyselect_1.2.0 rematch_1.0.1 compiler_4.0.3
## [16] textshaping_0.3.6 cli_3.3.0 formatR_1.14
## [19] labeling_0.4.2 sass_0.4.1 scales_1.2.1
## [22] classInt_0.4-7 proxy_0.4-27 askpass_1.1
## [25] systemfonts_1.0.4 stringr_1.4.0 digest_0.6.29
## [28] rmarkdown_2.18 base64enc_0.1-3 jpeg_0.1-9
## [31] pkgconfig_2.0.3 htmltools_0.5.3 fastmap_1.1.0
## [34] highr_0.9 htmlwidgets_1.5.4 rlang_1.0.4
## [37] rstudioapi_0.14 jquerylib_0.1.4 farver_2.1.1
## [40] generics_0.1.3 jsonlite_1.8.0 dplyr_1.0.9
## [43] zip_2.2.0 magrittr_2.0.1 s2_1.1.2
## [46] Rcpp_1.0.9 munsell_0.5.0 fansi_1.0.3
## [49] ape_5.4-1 lifecycle_1.0.1 terra_1.6-7
## [52] scatterplot3d_0.3-42 stringi_1.7.8 yaml_2.3.5
## [55] plyr_1.8.7 grid_4.0.3 parallel_4.0.3
## [58] lattice_0.20-41 knitr_1.40 pillar_1.8.1
## [61] uuid_1.1-0 codetools_0.2-16 wk_0.6.0
## [64] glue_1.6.2 evaluate_0.18 vctrs_0.4.1
## [67] tweenr_1.0.2 cellranger_1.1.0 gtable_0.3.1
## [70] openssl_2.0.0 polyclip_1.10-0 reshape_0.8.9
## [73] assertthat_0.2.1 cachem_1.0.6 xfun_0.31
## [76] e1071_1.7-11 ragg_1.2.2 class_7.3-17
## [79] minpack.lm_1.2-2 tibble_3.1.8 units_0.8-0