Affiliations

Hardeep Kaur ORCID logo University of Agriculture in Krakow, Poland
Sajad Ahmad Ganie ORCID logo Sher-e-Kashmir University of Agricultural Sciences and Technology of Srinagar, India
Adam Tofilski ORCID logo University of Agriculture in Krakow, Poland

Libraries

# calculations
library(geomorph) # GPA 
library(shapes) # OPA
library(MASS) # LDA
# plotting and visualization
library(ggplot2) # plots
ggplot2::theme_set(theme_light())
library(ggforce) # geom_ellipse
library(ggrepel) # geom_label_repel
library(rnaturalearth) # maps
library(raster) # raster
library(ggspatial) # annotation_scale
# input and output 
library(readxl) # read XLSX
library(officer) # read DOCX
library(xml2) # read XML
library(XML) # write XML
options(digits=4) # number of digits to display

Variable names

In order to avoid mistakes it is safer to use defined variables instead of numbers, for example, columns names instead of indexes

p <- 19  # number of landmarks
k <- 2  # number of dimensions, in this case 2 for coordinates (x, y)

# create coordinates names used by IdentiFly
xyNames <- c("x1", "y1")
for (i in 2:p) {
  xyNames <- c(xyNames, paste0("x", i))
  xyNames <- c(xyNames, paste0("y", i))
}
xyNames
##  [1] "x1"  "y1"  "x2"  "y2"  "x3"  "y3"  "x4"  "y4"  "x5"  "y5"  "x6"  "y6" 
## [13] "x7"  "y7"  "x8"  "y8"  "x9"  "y9"  "x10" "y10" "x11" "y11" "x12" "y12"
## [25] "x13" "y13" "x14" "y14" "x15" "y15" "x16" "y16" "x17" "y17" "x18" "y18"
## [37] "x19" "y19"
# define which landmarks are connected by lines in wireframe graph
link.x <- c(1, 1, 2, 2, 3, 3, 4, 4, 5, 6, 7, 7, 7, 8, 9, 9, 10, 11, 11, 12, 13, 14,
  15, 16, 17)
link.y <- c(2, 3, 4, 5, 6, 19, 6, 10, 12, 8, 8, 14, 19, 9, 10, 15, 11, 12, 16, 13,
  18, 15, 16, 17, 18)
links.apis <- cbind(link.x, link.y)

# define Bustamente links
link.x <- c(1, 1, 2, 3, 3, 4, 4, 5, 6, 7, 7, 8, 9, 9, 10, 11, 11, 12, 13, 13, 14,
  15, 15, 16, 17)
link.y <- c(2, 6, 3, 4, 8, 5, 10, 12, 7, 8, 16, 9, 10, 14, 11, 12, 13, 19, 14, 15,
  17, 18, 19, 17, 18)
links.Bustamente <- cbind(link.x, link.y)

# GP1 <- gridPar(pt.bg = 'blue', link.col = 'blue', pt.size = 1, tar.pt.bg
# ='red', tar.link.col ='red')

Functions

# verify if vector is a sequence c(1, 2, 3, ...)
is.123 <- function(x) {
  x[1] == 1 & all(diff(x) == rep(1, length(x) - 1))
}

# reorder landmarks using column names
reorder <- function(in.data, new.order) {
  # is new.order a correct sequence
  sorted <- new.order[order(new.order)]
  if (!is.123(sorted))
    return()
  # are all column names present?
  if (!all(xyNames %in% colnames(in.data)))
    return()
  # are column names unique?
  if (any(duplicated(colnames(in.data))))
    return()

  xy <- paste0("x", new.order[1])
  xy <- c(xy, paste0("y", new.order[1]))
  reordered <- matrix(in.data[, xy], ncol = 2)
  for (i in 2:length(new.order)) {
    xy <- paste0("x", new.order[i])
    xy <- c(xy, paste0("y", new.order[i]))
    reordered <- cbind(reordered, matrix(in.data[, xy], ncol = 2))
  }
  colnames(reordered) <- xyNames
  rownames(reordered) <- rownames(in.data)

  return(reordered)
}

# extract matrix from XML
XML2matrix <- function(inXML) {
  # extract header
  header <- xml_find_first(inXML, ".//header")
  header.txt <- xml_text(header)
  header.txt <- trimws(header.txt)
  header.val <- strsplit(header.txt, "\t")

  # extract rows
  mat.rows.XML <- xml_find_all(inXML, ".//vector")
  row.list <- list()
  row.names <- vector()
  for (i in 1:length(mat.rows.XML)) {
    row.txt <- xml_text(mat.rows.XML[[i]])
    row.txt <- trimws(row.txt)
    row.val <- strsplit(row.txt, "\t")
    row.val <- as.numeric(row.val[[1]])
    row.list[[i]] <- row.val

    # extract row attribute
    row.atr <- xml_attr(mat.rows.XML[[i]], "id")
    row.atr <- trimws(row.atr)
    row.names <- c(row.names, row.atr)
  }
  mat <- do.call(rbind, row.list)
  colnames(mat) <- header.val[[1]]
  rownames(mat) <- row.names
  return(mat)
}

# convert vector to XML
vector2XML <- function(vec.in, id, XML.out) {
  vec.in <- as.character(vec.in)
  vec.in <- paste(vec.in, collapse = "\t")
  vec.in <- c("\t", vec.in, "\t")  # pad with tabulators for spreadsheet
  XML.out$addTag("vector", vec.in, attrs = c(id = id))
}

# convert matrix to XML
matrix2XML <- function(mat.in, id, XML.out) {
  XML.out$addTag("matrix", close = FALSE, attrs = c(id = id))
  header <- as.character(colnames(mat.in))
  header <- paste(header, collapse = "\t")
  header <- c("\t", header, "\t")
  XML.out$addTag("header", header)

  for (i in 1:nrow(mat.in)) {
    vector2XML(mat.in[i, ], rownames(mat.in)[i], XML.out)
  }
  XML.out$closeTag()  # matrix
}

# create classification data in XML format
GM.data2XML <- function(data, grouping) {

  # Error detection
  if ((ncol(data)%%2) != 0)
    stop("number of columns should be even")
  p <- ncol(data)/2
  k <- 2

  # create coordinates names used by IdentiFly
  xyNames <- c("x1", "y1")
  for (i in 2:p) {
    xyNames <- c(xyNames, paste0("x", i))
    xyNames <- c(xyNames, paste0("y", i))
  }

  ## Alignment of landmark configurations

  # Convert 2D array into a 3D array
  data.3D <- arrayspecs(data, p, k)
  # Align the coordinates using Generalized Procrustes Analysis
  GPA <- gpagen(data.3D, print.progress = FALSE)
  # Convert 3D array into a 2D array - opposite to arrayspecs
  data <- two.d.array(GPA$coords)
  colnames(data) <- xyNames

  XML <- xmlOutputDOM("identifly", attrs = c(version = "1.8"))
  # XML$addTag('prototype', close=TRUE, attrs =
  # c(file='apis-worker-prototype.dw.png'))

  XML$addTag("lda", close = FALSE)

  reference <- colMeans(data)
  reference <- as.matrix(t(reference))

  colnames(reference) <- xyNames
  rownames(reference) <- c("reference")
  matrix2XML(reference, "reference", XML)

  means <- aggregate(data, by = list(grouping), FUN = mean)
  rownames(means) <- means$Group.1
  means <- subset(means, select = -c(Group.1))
  colnames(means) <- xyNames
  matrix2XML(means, "means", XML)

  PCA <- prcomp(data[, xyNames], center = TRUE, scale. = FALSE)
  data <- as.matrix(data) %*% PCA$rotation
  data <- data[, 1:(2 * p - 4)]

  n <- length(unique(grouping))  # number of groups
  LDA <- lda(data, grouping, prior = rep(1/n, n))

  XML$addTag("covariances", close = FALSE)

  scores <- as.matrix(data) %*% LDA$scaling
  scores <- as.data.frame(scores)
  covariances <- lapply(unique(grouping), function(x) cov(scores[grouping == x,
    ]))
  groups <- rownames(means)
  for (i in 1:length(groups)) {
    matrix2XML(covariances[[i]], groups[i], XML)
  }
  XML$closeTag()  # covariances

  # coefficients = t(LDA$scaling)
  coefficients <- t(PCA$rotation[, 1:(2 * p - 4)] %*% LDA$scaling)
  colnames(coefficients) <- xyNames
  matrix2XML(coefficients, "coefficients", XML)

  XML$closeTag()  # lda

  return(XML)
}

# read classification data from XML file
XML2GM.id.data <- function(path) {
  # add error checking
  XML <- read_xml(path)

  # extract lda
  lda.XML <- xml_find_first(XML, ".//lda")
  matrices <- xml_find_all(lda.XML, ".//matrix")

  # extract reference
  reference.XML <- matrices[xml_attr(matrices, "id") == "reference"]
  reference.mat <- XML2matrix(reference.XML)

  # extract means
  means.XML <- matrices[xml_attr(matrices, "id") == "means"]
  means.mat <- XML2matrix(means.XML)

  # extract covariances
  row.names <- rownames(means.mat)
  # create list of empty dataframes
  covariances <- list(rep(matrix(), length(row.names)))
  for (i in 1:length(row.names)) {
    cov.XML <- matrices[xml_attr(matrices, "id") == row.names[i]]
    cov.mat <- XML2matrix(cov.XML)
    covariances[[i]] <- cov.mat
    names(covariances)[i] <- row.names[i]
  }

  # extract coefficients
  coefficients.XML <- matrices[xml_attr(matrices, "id") == "coefficients"]
  coefficients.mat <- XML2matrix(coefficients.XML)

  return(list(reference = reference.mat, means = means.mat, covariances = covariances,
    coefficients = coefficients.mat))
}

# classify unknown samples using XML data
XML2id <- function(path, data, average = TRUE) {
  id.data <- XML2GM.id.data(path)

  # calculate means LD by matrix multiplication
  means.LD <- id.data$means %*% t(id.data$coefficients)

  if (ncol(data) != ncol(id.data$reference))
    stop("number of columns differes between reference and data")
  p <- ncol(data)/2
  k <- 2

  # Convert from 2D to 3D array
  data.3D <- arrayspecs(data, p, k)

  # convert reference from 1D to 2D
  reference <- matrix(id.data$reference, ncol = 2, byrow = TRUE)

  if (average) {
    # Align the coordinates using Generalized Procrustes Analysis
    GPA <- gpagen(data.3D, print.progress = FALSE)
    # Consensus configuration of the unknown sample
    unknown.consensus <- GPA$consensus

    # Align unknown consensus with consensus from reference samples
    unknown.OPA <- procOPA(reference, unknown.consensus)
    unknown.aligned <- unknown.OPA$Bhat

    # convert aligned data from 2D to 1D
    unknown.aligned <- matrix(t(unknown.aligned), nrow = 1)

    # calculate LD scores
    unknown.LD.sores <- unknown.aligned %*% t(id.data$coefficients)

    id <- classifyVecLD(unknown.LD.sores, means.LD, id.data$covariances)
    id <- id$class

    unknown.LD.sores <- as.data.frame(unknown.LD.sores)
    plot <- cov.plot(means.LD, id.data$covariances) + geom_point(unknown.LD.sores,
      mapping = aes(x = LD1, y = LD2, colour = "zzz")) + geom_label_repel(unknown.LD.sores,
      mapping = aes(x = LD1, y = LD2, colour = "zzz", label = "unknown"), nudge_x = 0.75,
      nudge_y = 0) + scale_color_manual(values = append(rainbow(nrow(id.data$means)),
      "black"))
  } else {
    # calculate lda scores for all wings
    LD.list <- vector(mode = "list", length = nrow(data))
    for (r in 1:nrow(data)) {
      # Align unknown consensus with consensus from reference samples
      unknown.OPA <- procOPA(reference, data.3D[, , r])
      unknown.aligned <- matrix(t(unknown.OPA$Bhat), nrow = 1)  # convert from 2D to 1D
      LD.row <- unknown.aligned %*% t(id.data$coefficients)
      LD.list[[r]] <- LD.row
    }
    LD.tab <- do.call(rbind, LD.list)
    rownames(LD.tab) <- rownames(data)
    LD.tab <- as.data.frame(LD.tab)

    id <- classifyMatLD(LD.tab, means.LD, id.data$covariances)

    plot <- cov.plot(means.LD, id.data$covariances) + geom_point(LD.tab, mapping = aes(x = LD1,
      y = LD2, colour = "zzz")) + scale_color_manual(values = append(rainbow(nrow(id.data$means)),
      "black"))
  }

  return(list(id = id, plot = plot))
}

# calculate Mahalnanobis distances for data in one vector
classifyVecLD <- function(unknown.LD, means, covariances) {
  groups <- rownames(means)
  df <- length(groups) - 1
  result.list <- list()  # empty list for results
  max.P <- 0
  max.group <- ""
  for (i in 1:length(groups)) {
    MD <- mahalanobis(unknown.LD, means[i, ], as.matrix(covariances[[i]]))
    results <- c(MD)
    P <- pchisq(MD, df = df, lower.tail = FALSE)
    results <- c(results, P)
    if (P > max.P) {
      max.P <- P
      max.group <- groups[i]
    }
    result.list[[i]] <- results
  }

  out.summary <- paste("The sample was classified as", max.group, "with probability",
    max.P)
  out.max <- data.frame(group = max.group, P = max.P)

  out.class <- do.call(rbind, result.list)
  colnames(out.class) <- c("MD2", "P")
  rownames(out.class) <- rownames(means)

  return(list(summary = out.summary, max.group = out.max, class = out.class))
}

# calculate Mahalnanobis distances for each row in a matrix
classifyMatLD <- function(unknown.LD, means, covariances) {
  result.list <- list()  # empty list for results
  for (i in 1:nrow(unknown.LD)) {
    out.row <- classifyVecLD(unknown.LD[i, ], means, covariances)
    result.list[[i]] <- out.row$max
  }
  id.results <- do.call(rbind, result.list)
  rownames(id.results) <- rownames(unknown.LD)

  return(id.results)
}

# plot ellipses for first two LD using data from covariance matrices
cov.plot <- function(means, covariances) {
  chi <- sqrt(qchisq(0.95, 2))  # Chi-Square value for probability 95% 

  groups <- rownames(means)
  ellipse.list <- list()
  for (i in 1:length(groups)) {
    x0 <- means[i, 1]
    y0 <- means[i, 2]

    cov.mat <- covariances[[i]]
    cov.mat <- cov.mat[1:2, 1:2]
    cov.eigen <- eigen(cov.mat)

    a <- chi * sqrt(cov.eigen$values[1])
    b <- chi * sqrt(cov.eigen$values[2])
    angle <- atan2(cov.eigen$vectors[2, 1], cov.eigen$vectors[1, 1])

    ellipse <- c(x0, y0, a, b, angle)
    ellipse.list[[i]] <- ellipse
  }
  ellipses <- do.call(rbind, ellipse.list)
  colnames(ellipses) <- c("LD1", "LD2", "a", "b", "angle")
  rownames(ellipses) <- groups
  ellipses <- as.data.frame(ellipses)

  ggplot(ellipses, aes(x = LD1, y = LD2, color = rownames(ellipses))) + geom_point() +
    coord_fixed() + geom_ellipse(ellipses, mapping = aes(x0 = LD1, y0 = LD2,
    a = a, b = b, angle = angle)) + geom_label_repel(aes(label = rownames(ellipses),
    size = NULL), nudge_y = 0.75) + theme(legend.position = "none")
}

Create Apis species classification data

The classification is based on tabular data obtained from Bustamante et al. (2021)

# download ESM1 from Bustamente et al. 2021 and save as
# Bustamente2021-ESM1.docx
download.file("https://static-content.springer.com/esm/art%3A10.1007%2Fs13592-021-00857-7/MediaObjects/13592_2021_857_MOESM1_ESM.docx",
  destfile = "Bustamente2021-ESM1.docx", method = "curl")

# download ESM2 from Bustamente et al. 2021 and save as
# Bustamente2021-ESM2.xlsx
download.file("https://static-content.springer.com/esm/art%3A10.1007%2Fs13592-021-00857-7/MediaObjects/13592_2021_857_MOESM2_ESM.xlsx",
  destfile = "Bustamente2021-ESM2.xlsx", method = "curl")

# download ESM3 from Bustamente et al. 2021 and save as
# Bustamente2021-ESM3.xlsx
download.file("https://static-content.springer.com/esm/art%3A10.1007%2Fs13592-021-00857-7/MediaObjects/13592_2021_857_MOESM3_ESM.xlsx",
  destfile = "Bustamente2021-ESM3.xlsx", method = "curl")

# extract reference configuration from sheet 'Aligned Sample' in ESM3
Bustamente2021.reference <- read_excel("Bustamente2021-ESM3.xlsx", sheet = "Aligned Sample",
  range = "A1:AL2")
Bustamente2021.reference <- as.data.frame(Bustamente2021.reference)
colnames(Bustamente2021.reference) <- xyNames
rownames(Bustamente2021.reference) <- c("reference")

reference.3D <- arrayspecs(Bustamente2021.reference, p, k)
# plot landmarks from Bustamante et al. (2021) the landmarks differ from Fig. 1
# (Bustamante et al. 2021) where they were flipped horizontally
plotAllSpecimens(reference.3D, links = links.Bustamente, label = TRUE, plot.param = list(pt.bg = "black",
  pt.cex = 0.5, mean.bg = "red", mean.cex = 1, link.col = "red", txt.pos = 3, txt.cex = 1))

# extract coefficients from sheet 'LD's' in ESM3
Bustamente2021.coefficients <- read_excel("Bustamente2021-ESM3.xlsx", sheet = "LD's",
  range = "A1:I39")
Bustamente2021.coefficients <- as.data.frame(Bustamente2021.coefficients)
Bustamente2021.coefficients <- Bustamente2021.coefficients[, -c(1)]
Bustamente2021.coefficients <- t(Bustamente2021.coefficients)
colnames(Bustamente2021.coefficients) <- xyNames

# # Supplementary Table S2 in Bustamente et al. 2021 contains mean landmark
# coordinates for all species Bustamente2021.means =
# read.csv('Bustamente2021-tabS2.csv') rownames(Bustamente2021.means) =
# Bustamente2021.means$species Bustamente2021.means =
# subset(Bustamente2021.means, select = - c(species))

# extract class means from ESM1; the table is in 4 parts under indexes: 5, 7,
# 10, 13
ESM1 <- read_docx("Bustamente2021-ESM1.docx")
content <- docx_summary(ESM1)

fragment <- content[content$doc_index == 5, ]
fragment <- data.frame(text = fragment$text, row_id = fragment$row_id, cell_id = fragment$cell_id)
tab5 <- reshape(fragment, idvar = "row_id", timevar = "cell_id", direction = "wide")
tab5 <- tab5[-c(1:2), -c(1)]

fragment <- content[content$doc_index == 7, ]
fragment <- data.frame(text = fragment$text, row_id = fragment$row_id, cell_id = fragment$cell_id)
tab7 <- reshape(fragment, idvar = "row_id", timevar = "cell_id", direction = "wide")
tab7 <- tab7[-c(1), -c(1)]

fragment <- content[content$doc_index == 10, ]
fragment <- data.frame(text = fragment$text, row_id = fragment$row_id, cell_id = fragment$cell_id)
tab10 <- reshape(fragment, idvar = "row_id", timevar = "cell_id", direction = "wide")
tab10 <- tab10[-c(1), -c(1)]

fragment <- content[content$doc_index == 13, ]
fragment <- data.frame(text = fragment$text, row_id = fragment$row_id, cell_id = fragment$cell_id)
tab13 <- reshape(fragment, idvar = "row_id", timevar = "cell_id", direction = "wide")
tab13 <- tab13[-c(1), -c(1)]

rownames(tab5) <- tab5[, 1]
tab5 <- tab5[, -c(1)]
Bustamente2021.means <- cbind(tab5, tab7[, 2:11], tab10[, 2:11], tab13[, 2:9])
colnames(Bustamente2021.means) <- xyNames
Bustamente2021.means[xyNames] <- sapply(Bustamente2021.means[xyNames], as.numeric)

# extract covariances matrices from ESM2; each matrix is in separate sheet
groups <- rownames(Bustamente2021.means)
# create list of empty dataframes
Bustamente2021.covariances <- c(rep(data.frame(), length(groups)))
for (i in 1:length(groups)) {
  sheet.name <- paste0(groups[i], " ginv")
  Bustamente2021.covariances[[i]] <- read_excel("Bustamente2021-ESM2.xlsx", sheet = sheet.name,
    range = "B1:I9")
  names(Bustamente2021.covariances)[i] <- groups[i]
}

# calculate linear discriminant scores by matrix multiplication
Bustamente2021.means.LD <- as.matrix(Bustamente2021.means) %*% t(as.matrix(Bustamente2021.coefficients))

cov.plot(Bustamente2021.means.LD, Bustamente2021.covariances)

# Create XML file
XML <- xmlOutputDOM("identifly", attrs = c(version = "1.8"))
XML$addTag("prototype", close = TRUE, attrs = c(file = "Bustamente2021-prototype.dw.png"))

XML$addTag("lda", close = FALSE)

reference <- Bustamente2021.reference
matrix2XML(reference, "reference", XML)

means <- Bustamente2021.means
matrix2XML(means, "means", XML)

XML$addTag("covariances", close = FALSE)

for (i in 1:length(Bustamente2021.covariances)) {
  matrix2XML(Bustamente2021.covariances[[i]], names(Bustamente2021.covariances)[i],
    XML)
}
XML$closeTag()  # covariances

matrix2XML(Bustamente2021.coefficients, "coefficients", XML)

XML$closeTag()  # lda

# add prototype for IdentiFly software
XML$addTag("prototype", close = TRUE, attrs = c(file = "Bustamente2021-prototype.dw.png"))

saveXML(XML$value(), file = "apis-species.dw.xml", prefix = "<?xml version=\"1.0\" encoding=\"UTF-8\"?>\n")
## [1] "apis-species.dw.xml"

Create honey bee evolutionary lineages classification data

The classification is based on raw data obtained from Nawrocka et al. (2018a) and Nawrocka et al. (2018b)

wings.lin <- read.csv("https://zenodo.org/record/7567336/files/Nawrocka_et_al2018.csv")

# extract sample classifier
wings.lin$sample = substr(wings.lin$file, 1, 10)

# average data within samples
wings.lin <- aggregate(wings.lin[xyNames], 
                              by = list(wings.lin$sample), 
                              FUN = mean)
rownames(wings.lin) <- wings.lin$Group.1
wings.lin <- subset(wings.lin, select = -c(Group.1))

# extract lineage classifier
wings.lin$lineage = substr(rownames(wings.lin), 1, 1)

XML = GM.data2XML(wings.lin[xyNames], # columns with coordinates 
                  wings.lin$lineage) # 1 column with groups

# add prototype for IdentiFly software
XML$addTag("prototype", close=TRUE, attrs = c(file="apis-worker-prototype.dw.png"))

saveXML(XML$value(), file="apis-mellifera-lineage.dw.xml", 
        prefix = "<?xml version=\"1.0\" encoding=\"UTF-8\"?>\n") 
## [1] "apis-mellifera-lineage.dw.xml"
# Verify the XML file 
id.data = XML2GM.id.data("apis-mellifera-lineage.dw.xml")
means.LD = id.data$means %*% t(id.data$coefficients)
cov.plot(means.LD, id.data$covariances)

Create classification data for regions in lineage C

The classification is based on raw data obtained from Oleksa et al. (2022), Oleksa et al. (2023) and Nawrocka et al. (2018b)

wings.C = read.csv("https://zenodo.org/record/7244070/files/GR-raw-coordinates.csv")
wings.C = rbind(wings.C, read.csv("https://zenodo.org/record/7244070/files/HR-raw-coordinates.csv"))
wings.C = rbind(wings.C, read.csv("https://zenodo.org/record/7244070/files/MD-raw-coordinates.csv"))
wings.C = rbind(wings.C, read.csv("https://zenodo.org/record/7244070/files/RO-raw-coordinates.csv"))
wings.C = rbind(wings.C, read.csv("https://zenodo.org/record/7244070/files/SI-raw-coordinates.csv"))
wings.C = rbind(wings.C, read.csv("https://zenodo.org/record/7244070/files/TR-raw-coordinates.csv"))

# extract classifier from file name
wings.C$sample = substr(wings.C$file, 1, 7)

# average data within samples
wings.C <- aggregate(wings.C[xyNames], 
                     by = list(wings.C$sample), 
                     FUN = mean)
rownames(wings.C) <- wings.C$Group.1
wings.C <- subset(wings.C, select = -c(Group.1))

# add Italy
data.lin <- read.csv("https://zenodo.org/record/7567336/files/Nawrocka_et_al2018-geo-data.csv")
IT = subset(wings.lin, data.lin$country == "Italy")
IT <- subset(IT, select = -c(lineage))
rownames(IT) <- gsub("C-lig", "IT", rownames(IT))
wings.C = rbind(wings.C, IT)


# extract region classifier
wings.C$country = substr(rownames(wings.C), 1, 2)
wings.C$region = ifelse(wings.C$country == "HR"
                        | wings.C$country == "SI"
                        , "HR-SI", wings.C$country)
wings.C$region = ifelse(wings.C$country == "RO"
                        | wings.C$country == "MD"
                        , "RO-MD", wings.C$region)
XML = GM.data2XML(wings.C[xyNames], # columns with coordinates  
                  wings.C$region) # 1 column with groups

# add prototype for IdentiFly software
XML$addTag("prototype", close=TRUE, attrs = c(file="apis-worker-prototype.dw.png"))

saveXML(XML$value(), file="apis-mellifera-lineage-c-regions.dw.xml", 
        prefix = "<?xml version=\"1.0\" encoding=\"UTF-8\"?>\n") 
## [1] "apis-mellifera-lineage-c-regions.dw.xml"
# Verify the XML file 
id.data = XML2GM.id.data("apis-mellifera-lineage-c-regions.dw.xml")
means.LD = id.data$means %*% t(id.data$coefficients)
cov.plot(means.LD, id.data$covariances)

Read landmark coordinates from India

Read raw coordinates of 19 landmarks from Zenodo (Kaur et al., 2023)

wings <- read.csv("https://zenodo.org/record/8071014/files/IN-raw-coordinates.csv")

# extract individual and sample classifier from file name
wings$ind <- substr(wings$file, 1, 14)
wings$sample <- substr(wings$file, 1, 7)
head(wings, 2)
##                      file  x1  y1  x2  y2  x3  y3  x4  y4  x5  y5  x6  y6  x7
## 1 IN-0001-000243-L.dw.png 195 169 213 166 268 246 261 192 267 108 337 250 386
## 2 IN-0001-000243-R.dw.png 180 251 193 243 272 290 231 242 214 166 333 266 392
##    y7  x8  y8  x9  y9 x10 y10 x11 y11 x12 y12 x13 y13 x14 y14 x15 y15 x16 y16
## 1 282 369 258 406 224 382 204 422 171 427 131 438  99 451 270 479 235 559 194
## 2 272 367 258 386 210 357 204 379 156 364 120 361  85 448 236 458 189 512 119
##   x17 y17 x18 y18 x19 y19            ind  sample
## 1 593 191 601 161  82 244 IN-0001-000243 IN-0001
## 2 542 101 539  72 104 369 IN-0001-000243 IN-0001
tmp <- unique(wings$ind)

Map of sampling locations in India

# Read geographic coordinates
geo.data <- read.csv("https://zenodo.org/record/8071014/files/IN-data.csv")
sample.geo.data <- aggregate(cbind(geo.data$latitude, geo.data$longitude), by = list(geo.data$sample),
  FUN = mean)
sample.geo.data <- reshape::rename(sample.geo.data, c(Group.1 = "sample", V1 = "latitude",
  V2 = "longitude"))

# Read elevation data In order to download the TIF file uncomment the two lines
# below
# download.file('https://geodata.ucdavis.edu/climate/worldclim/2_1/base/wc2.1_2.5m_elev.zip',
# 'wc2.1_2.5m_elev.zip') unzip('wc2.1_2.5m_elev.zip')
elev <- raster("E:/WorldClim/wc2.1_2.5m_elev.tif")

elev.color <- colorRampPalette(c("green", "lightgreen", "yellow", "orange", "red",
  "darkred"))

x.min <- 73
x.max <- 77
y.min <- 32
y.max <- 35
e <- extent(x.min - 1, x.max + 1, y.min - 1, y.max + 1)
elev <- crop(elev, e)
elev.df <- data.frame(rasterToPoints(elev))
colnames(elev.df) <- c("longitude", "latitude", "altitude")

world <- ne_countries(scale = "medium", returnclass = "sf")

ggplot(data = world) + geom_raster(data = elev.df, aes(longitude, latitude, fill = altitude)) +
  scale_fill_gradientn(colours = elev.color(100)) + geom_sf(fill = NA) + geom_point(data = sample.geo.data,
  aes(x = longitude, y = latitude), size = 2) + coord_sf(xlim = c(x.min, x.max),
  ylim = c(y.min, y.max)) + annotation_scale(location = "bl", width_hint = 0.2)

ggplot(data = world) + geom_sf() + annotate("rect", xmin = x.min, xmax = x.max, ymin = y.min,
  ymax = y.max, alpha = 0, color = "red") + coord_sf(xlim = c(65, 100), ylim = c(5,
  40))

GPA-alignment

# Convert from 2D array to 3D array
wings.coords <- arrayspecs(wings[xyNames], p, k)
dimnames(wings.coords)[[3]] <- wings$file

# Align the coordinates using Generalized Procrustes Analysis
GPA.IN <- gpagen(wings.coords, print.progress = FALSE)
consensus.IN <- GPA.IN$consensus

# plot landmarks after alignment
plotAllSpecimens(GPA.IN$coords, links = links.apis, label = TRUE, plot.param = list(pt.bg = "black",
  pt.cex = 0.5, mean.bg = "red", mean.cex = 1, link.col = "red", txt.pos = 3, txt.cex = 1))

# Convert from 3D array to 2D array
wings.aligned.IN <- two.d.array(GPA.IN$coords)
colnames(wings.aligned.IN) <- xyNames

# Average aligned coordinates within individuals
aligned.ind.IN <- aggregate(wings.aligned.IN, by = list(wings$ind), FUN = mean)
rownames(aligned.ind.IN) <- aligned.ind.IN$Group.1
aligned.ind.IN <- subset(aligned.ind.IN, select = -c(Group.1))

# Average aligned coordinates within samples
sample.aligned <- aggregate(wings.aligned.IN, by = list(wings$sample), FUN = mean)
rownames(sample.aligned) <- sample.aligned$Group.1
sample.aligned <- subset(sample.aligned, select = -c(Group.1))

Principal component analysis of the workers and samples from India

# PCA based on individuals
ind.pca <- prcomp(aligned.ind.IN[, xyNames], center = TRUE, scale. = TRUE)
ind.pca.scores <- as.data.frame(ind.pca$x)

# create plot labels
variance.tab <- summary(ind.pca)$importance
variance <- variance.tab["Proportion of Variance", "PC1"]
variance <- round(100 * variance, 1)
label.x <- paste0("PC1 (", variance, "%)")
variance <- variance.tab["Proportion of Variance", "PC2"]
variance <- round(100 * variance, 1)
label.y <- paste0("PC2 (", variance, "%)")

ggplot(ind.pca.scores, aes(x = PC1, y = PC2)) + geom_point() + scale_shape_manual(name = "lineage",
  values = c(0:2, 15:25)) + scale_color_manual(name = "lineage", values = rainbow(13)) +
  stat_ellipse() + # geom_label_repel(aes(label = rownames(sample.aligned))) + stat_ellipse()
  stat_ellipse() + # geom_label_repel(aes(label = rownames(sample.aligned))) + +
  stat_ellipse() + # geom_label_repel(aes(label = rownames(sample.aligned))) + #
  stat_ellipse() + # geom_label_repel(aes(label = rownames(sample.aligned))) + geom_label_repel(aes(label
  stat_ellipse() + # geom_label_repel(aes(label = rownames(sample.aligned))) + =
  stat_ellipse() + # geom_label_repel(aes(label = rownames(sample.aligned))) + rownames(sample.aligned)))
  stat_ellipse() + # geom_label_repel(aes(label = rownames(sample.aligned))) + +
xlab(label.x) + ylab(label.y)

# PCA based on samples
sample.pca <- prcomp(sample.aligned[, xyNames], center = TRUE, scale. = TRUE)
sample.pca.scores <- as.data.frame(sample.pca$x)

# create plot labels
variance.tab <- summary(sample.pca)$importance
variance <- variance.tab["Proportion of Variance", "PC1"]
variance <- round(100 * variance, 1)
label.x <- paste0("PC1 (", variance, "%)")
variance <- variance.tab["Proportion of Variance", "PC2"]
variance <- round(100 * variance, 1)
label.y <- paste0("PC2 (", variance, "%)")

ggplot(sample.pca.scores, aes(x = PC1, y = PC2)) + geom_point() + scale_shape_manual(name = "lineage",
  values = c(0:2, 15:25)) + scale_color_manual(name = "lineage", values = rainbow(13)) +
  stat_ellipse() + # geom_label_repel(aes(label = rownames(sample.aligned))) + stat_ellipse()
  stat_ellipse() + # geom_label_repel(aes(label = rownames(sample.aligned))) + +
  stat_ellipse() + # geom_label_repel(aes(label = rownames(sample.aligned))) + #
  stat_ellipse() + # geom_label_repel(aes(label = rownames(sample.aligned))) + geom_label_repel(aes(label
  stat_ellipse() + # geom_label_repel(aes(label = rownames(sample.aligned))) + =
  stat_ellipse() + # geom_label_repel(aes(label = rownames(sample.aligned))) + rownames(sample.aligned)))
  stat_ellipse() + # geom_label_repel(aes(label = rownames(sample.aligned))) + +
xlab(label.x) + ylab(label.y)

Classification of workers from India as species from genus Apis

# configuration of points differ between Bustamente et al. 2018 and Nawrocka et
# al. 2018 reorder input data to be compatible with classification data use
# function defined above
order.Bustamente <- c(18, 17, 16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 4, 3, 5, 2,
  1, 19)
sample.unknown <- reorder(as.matrix(aligned.ind.IN), order.Bustamente)

id.data <- XML2id("apis-species.dw.xml", sample.unknown, average = FALSE)
id.data$plot

out.class <- as.data.frame(id.data$id)
# All workers were classified as Apis mellifera
table(out.class$group)
## 
## mellifera 
##       175

Classification of samples from India as species from genus Apis

sample.unknown <- reorder(as.matrix(sample.aligned), order.Bustamente)

id.data <- XML2id("apis-species.dw.xml", sample.unknown, average = FALSE)
id.data$plot

id.data$id
##             group      P
## IN-0001 mellifera 0.2583
## IN-0002 mellifera 0.3717
## IN-0003 mellifera 0.3898
## IN-0004 mellifera 0.6165
## IN-0005 mellifera 0.5427
## IN-0006 mellifera 0.1350
## IN-0007 mellifera 0.2849
## IN-0008 mellifera 0.3883
## IN-0009 mellifera 0.2527
## IN-0010 mellifera 0.2531

Classification of average of all samples from India as honey bee lineages

id.data <- XML2id("apis-mellifera-lineage.dw.xml", sample.aligned, average = TRUE)
id.data$plot

id.data$id
##      MD2         P
## A  38.65 2.057e-08
## C  12.91 4.837e-03
## M 128.67 1.046e-27
## O  69.96 4.355e-15

Classification of samples from India as honey bee lineages

id.data <- XML2id("apis-mellifera-lineage.dw.xml", sample.aligned, average = FALSE)
id.data$plot

id.data$id
##         group         P
## IN-0001     C 3.820e-03
## IN-0002     C 1.499e-02
## IN-0003     A 4.833e-05
## IN-0004     C 4.213e-01
## IN-0005     C 3.241e-03
## IN-0006     C 1.032e-01
## IN-0007     C 4.264e-04
## IN-0008     C 2.637e-02
## IN-0009     C 1.351e-05
## IN-0010     C 2.443e-05

Classification of average of all samples from India as regions in lineage C

id.data <- XML2id("apis-mellifera-lineage-c-regions.dw.xml", sample.aligned, average = TRUE)
id.data$plot

id.data$id
##          MD2         P
## GR    22.743 1.425e-04
## HR-SI  8.691 6.930e-02
## IT    54.513 4.109e-11
## RO-MD 12.432 1.441e-02
## TR    69.199 3.350e-14

Classification of samples from India as regions in lineage C

id.data <- XML2id("apis-mellifera-lineage-c-regions.dw.xml", sample.aligned, average = FALSE)
id.data$plot

id.data$id
##         group        P
## IN-0001 RO-MD 0.038065
## IN-0002 HR-SI 0.124295
## IN-0003    GR 0.020656
## IN-0004 HR-SI 0.499434
## IN-0005 HR-SI 0.086676
## IN-0006 HR-SI 0.006802
## IN-0007 HR-SI 0.039837
## IN-0008 HR-SI 0.019549
## IN-0009 HR-SI 0.026855
## IN-0010 RO-MD 0.020600

References

Bustamante, T., Fuchs, S., Grünewald, B., & Ellis, J. D. (2021). A geometric morphometric method and web application for identifying honey bee species (Apis spp.) using only forewings. Apidologie, 52(3), 697-706. https://doi.org/10.1007/s13592-021-00857-7

Kaur H., Ganie S. A., & Tofilski A. (2023). Fore wings of honey bee (Apis mellifera) from Jammu and Kashmir, India [Data set]. Zenodo. https://doi.org/10.5281/zenodo.8071014

Nawrocka, A., Kandemir, İ., Fuchs, S., & Tofilski, A. (2018a). Computer software for identification of honey bee subspecies and evolutionary lineages. Apidologie, 49(2), 172-184. https://doi.org/10.1007/s13592-017-0538-y

Nawrocka, A., Kandemir, İ., Fuchs, S., & Tofiilski, A. (2018b). Dataset: Computer software for identification of honey bee subspecies and evolutionary lineages. Apidologie, 49, 172–184. https://doi.org/10.5281/zenodo.7567336

Oleksa, A., Căuia, E., Siceanu, A., Puškadija, Z., Kovačić, M., Pinto, M.A., Rodrigues, P.J., Hatjina, F., Charistos, L., Bouga, M., Prešern, J., Kandemir, I., Rašić, S., Kusza, S., & Tofilski, A. (2022). Collection of wing images for conservation of honey bees (Apis mellifera) biodiversity in Europe [Data set]. Zenodo. https://doi.org/10.5281/zenodo.7244070

Oleksa, A., Căuia, E., Siceanu, A., Puškadija, Z., Kovačić, M., Pinto, M. A., Rodrigues, P. J., Hatjina, F., Charistos, L., Bouga, M., Prešern, J., Kandemir, İ., Rašić, S., Kusza, S., Tofilski, A. (2023). Honey bee (Apis mellifera) wing images: a tool for identification and conservation. GigaScience, 12, giad019. https://doi.org/10.1093/gigascience/giad019

Information about session

sessionInfo()
## R version 4.0.3 (2020-10-10)
## Platform: x86_64-w64-mingw32/x64 (64-bit)
## Running under: Windows 10 x64 (build 19043)
## 
## Matrix products: default
## 
## locale:
## [1] LC_COLLATE=Polish_Poland.1250  LC_CTYPE=Polish_Poland.1250   
## [3] LC_MONETARY=Polish_Poland.1250 LC_NUMERIC=C                  
## [5] LC_TIME=Polish_Poland.1250    
## 
## attached base packages:
## [1] stats     graphics  grDevices utils     datasets  methods   base     
## 
## other attached packages:
##  [1] XML_3.99-0.9        xml2_1.3.3          officer_0.6.2      
##  [4] readxl_1.4.0        ggspatial_1.1.6     raster_3.5-21      
##  [7] sp_1.5-0            rnaturalearth_0.1.0 ggrepel_0.9.1      
## [10] ggforce_0.3.3       ggplot2_3.3.6       MASS_7.3-58.1      
## [13] shapes_1.2.6        geomorph_4.0.4      Matrix_1.4-1       
## [16] rgl_0.109.6         RRPP_1.3.0         
## 
## loaded via a namespace (and not attached):
##  [1] nlme_3.1-149            sf_1.0-7                tools_4.0.3            
##  [4] bslib_0.4.1             utf8_1.2.2              rgdal_1.5-32           
##  [7] R6_2.5.1                KernSmooth_2.23-17      DBI_1.1.3              
## [10] colorspace_2.0-3        withr_2.5.0             rnaturalearthdata_0.1.0
## [13] tidyselect_1.2.0        rematch_1.0.1           compiler_4.0.3         
## [16] textshaping_0.3.6       cli_3.3.0               formatR_1.14           
## [19] labeling_0.4.2          sass_0.4.1              scales_1.2.1           
## [22] classInt_0.4-7          proxy_0.4-27            askpass_1.1            
## [25] systemfonts_1.0.4       stringr_1.4.0           digest_0.6.29          
## [28] rmarkdown_2.18          base64enc_0.1-3         jpeg_0.1-9             
## [31] pkgconfig_2.0.3         htmltools_0.5.3         fastmap_1.1.0          
## [34] highr_0.9               htmlwidgets_1.5.4       rlang_1.0.4            
## [37] rstudioapi_0.14         jquerylib_0.1.4         farver_2.1.1           
## [40] generics_0.1.3          jsonlite_1.8.0          dplyr_1.0.9            
## [43] zip_2.2.0               magrittr_2.0.1          s2_1.1.2               
## [46] Rcpp_1.0.9              munsell_0.5.0           fansi_1.0.3            
## [49] ape_5.4-1               lifecycle_1.0.1         terra_1.6-7            
## [52] scatterplot3d_0.3-42    stringi_1.7.8           yaml_2.3.5             
## [55] plyr_1.8.7              grid_4.0.3              parallel_4.0.3         
## [58] lattice_0.20-41         knitr_1.40              pillar_1.8.1           
## [61] uuid_1.1-0              codetools_0.2-16        wk_0.6.0               
## [64] glue_1.6.2              evaluate_0.18           vctrs_0.4.1            
## [67] tweenr_1.0.2            cellranger_1.1.0        gtable_0.3.1           
## [70] openssl_2.0.0           polyclip_1.10-0         reshape_0.8.9          
## [73] assertthat_0.2.1        cachem_1.0.6            xfun_0.31              
## [76] e1071_1.7-11            ragg_1.2.2              class_7.3-17           
## [79] minpack.lm_1.2-2        tibble_3.1.8            units_0.8-0