diff --git a/.gitignore b/.gitignore index a52f149a8c1c9d8d5a8e5e5457417a93d2555c7c..34469bcad70bdcd892e0d18a74f279dc7252d88a 100644 --- a/.gitignore +++ b/.gitignore @@ -3,4 +3,5 @@ .RData .Ruserdata *.Rproj -.env \ No newline at end of file +.env +data/ \ No newline at end of file diff --git a/fonctions/fct_check_columns.R b/fonctions/fct_check_columns.R new file mode 100644 index 0000000000000000000000000000000000000000..5ba502ba82a1cb0f7d2f3b619a29a625443cdd11 --- /dev/null +++ b/fonctions/fct_check_columns.R @@ -0,0 +1,31 @@ +#' Title +#' +#' @param df +#' @param vec_colnames +#' +#' @return +#' @export +#' +#' @examples +check_column <- function(df, vec_colnames = c("observation_id", "session_date")){ + + missing_columns = c() + #---- Check presence of mandatory columns + # We check for each name in vec_colnames whether or not it appears in the + # column names of the input dataframe. If not, it is stored in missing_columns + for (i in length(vec_colnames)) { + if (!(vec_colnames[i] %in% colnames(df)) ) { + missing_columns = c(missing_columns, vec_colnames[i]) + } + } + + if (length(missing_columns)==0) { + cat(sprintf("No required columns missing.\n")) + return(TRUE) + } else { + cat(sprintf(paste0("Missing column(s): ", + paste(missing_columns, collapse = " / "), "\n"))) + return(FALSE) + } + +} diff --git a/fonctions/fct_description.R b/fonctions/fct_description.R new file mode 100644 index 0000000000000000000000000000000000000000..2a9bce9907f30b7443acdbf92bb041fa6a20ab73 --- /dev/null +++ b/fonctions/fct_description.R @@ -0,0 +1,48 @@ +#' Species repartition in the data +#' +#' @param data a data.frame with the Vigie-Nature export format +#' @param index_type choose the index to calculate. It can take the followinf values : 'frequence', 'sum_presence', 'mean_taxon_count', 'sum_taxon_count' +#' +#' @returns +#' @import data.table +#' +#' @examples +#' +#' calculate_species_occurences(df) +#' calculate_species_occurences(df, index_type = "sum_taxon_count") +#' +calculate_species_occurences <- function(data, index_type = "frequence"){ + + freq_only <- typeof(data_export$taxon_count) == "character" + + # common errors + if ((!"taxon_count" %in% colnames(data) | freq_only) & index_type != "frequence") + stop("index_type can only be a frequence if taxon_count is absent from data, please use 'frequence' or 'sum_presence' as index_type or add a taxon_count variable") + if (!index_type %in% c("frequence", "sum_presence", "mean_taxon_count", "sum_taxon_count")) + stop("index_type can only take one of these values: 'frequence', 'sum_presence', 'mean_taxon_count', 'sum_taxon_count'") + + # calculate indices relative to the species + if(index_type %in% c("frequence", "sum_presence")){ + if ("taxon_count" %in% colnames(data)) { + res <- data[, .(indice = sum(taxon_count > 0)), by = taxon] + } else { + # remove potential duplicates for frequencies for exemple "mouche difficile à determiner" in spipoll + data <- unique(data[,.(session_id, taxon)]) + res <- data[, .(indice = .N), by = taxon] + } + } else { + res <- data[, .(indice = sum(taxon_count)), by = taxon] + } + + + if (index_type %in% c("frequence", "mean_taxon_count")){ + res <- res[, indice := indice/unlist(stats_globales(data))] + } + data.table::setnames(res, "indice", index_type) + return(res) +} + + +graph_species_occurences <- function(data){ + +} \ No newline at end of file diff --git a/fonctions/fct_import_database.R b/fonctions/fct_import_database.R index 530a327257b2c6caf68167b0cf21408fafb5e935..53a3f945904a9f6279c270e4968beedeff595155 100644 --- a/fonctions/fct_import_database.R +++ b/fonctions/fct_import_database.R @@ -6,7 +6,6 @@ readRenviron(".env") #' @param filepath the path of the query as sql file #' #' @return -#' @export #' #' @examples #' @@ -75,6 +74,9 @@ import_from_vne <- function (query){ #' @export #' #' @examples +#' +#' +#' import_from_mosaic <- function(query, database_name, force_UTF8 = FALSE){ library(RMySQL) @@ -89,15 +91,15 @@ import_from_mosaic <- function(query, database_name, force_UTF8 = FALSE){ dbname = database_name, host = db_host, port = db_port) raw_query_result <- dbSendQuery(mydb, query) - query_result <- fetch(raw_query_result, n = -1) + query_result <- DBI::fetch(raw_query_result, n = -1) # Force UTF8 encoding if column is char if(force_UTF8) { - query_result <- query_result %>% - mutate_if(is.character, + query_result <- query_result |> + dplyr::mutate_if(is.character, function(x) {Encoding(x) <- "UTF-8" return(x) })} - on.exit(dbDisconnect(mydb)) + on.exit(DBI::dbDisconnect(mydb)) return(query_result) } diff --git a/fonctions/stats_globales.R b/fonctions/fct_participation.R similarity index 95% rename from fonctions/stats_globales.R rename to fonctions/fct_participation.R index 0e835ad99fdb91dd6cebdfb7e1c72ca230846721..604048df485ad18c6c833bda4938de9ce3dee58b 100644 --- a/fonctions/stats_globales.R +++ b/fonctions/fct_participation.R @@ -53,7 +53,7 @@ stats_globales <- function(df, count = "sessions", selectAnnee = "all", selectDe # if no table function with group_by if (!"table" %in% c(selectAnnee, selectDepartement)){ result <- df |> - dplyr::select_at(dplyr::all_of(variable_to_count)) |> + dplyr::select(dplyr::all_of(variable_to_count)) |> dplyr::distinct()|> tidyr::drop_na() |> dplyr::summarise(nombre = dplyr::n()) @@ -61,7 +61,7 @@ stats_globales <- function(df, count = "sessions", selectAnnee = "all", selectDe result } else { result <- df |> - dplyr::group_by_at(dplyr::all_of(group)) |> + dplyr::group_by(dplyr::across(dplyr::all_of(group))) |> dplyr::select(all_of(variable_to_count)) |> dplyr::distinct()|> tidyr::drop_na() |> diff --git a/fonctions/fct_time_series.R b/fonctions/fct_time_series.R index acc93f807cd63bf10a41bb0eaf094dc7ce5bdd09..6aadcd6bfd001b2d2f6fa793750f32c156c87d9a 100644 --- a/fonctions/fct_time_series.R +++ b/fonctions/fct_time_series.R @@ -21,34 +21,49 @@ #' barplot_time_series(df = df_test, x = "x1", y = "y1", fill = "fill1", #' fix_ratio = TRUE, c_fix = 0.5) #' -barplot_time_series <- function(df, x = "year", y = NA, color_bar = NA, +barplot_time_series <- function(df, x = "session_year", y = NA, color_bar = NA, fill = "column_for_colour", lab_title = "lab_for_colour", xlab = "Years", ylab = "y_title", position = "stack", - scale_fill = c('#009ef8', '#76ea02'), + values_fill = c(), breaks_fill = c(), modif_x_axis = FALSE, fix_ratio = FALSE, c_fix = 0.15){ + # Graphique initial en barplot if (is.na(y)) { + ## Avec comptage automatique pour l'axe y selon l'axe x gg <- ggplot(df, aes(x = !!sym(x), fill = !!sym(fill))) stat = "count" }else{ + ## Avec renseignement de l'axe y en plus de l'axe x gg <- ggplot(df, aes(x = !!sym(x), y = !!sym(y), fill = !!sym(fill))) stat = "identity" } + # Création du geom_bar avec titres sur les axes x, y et la légende gg <- gg + geom_bar(stat = stat, position = position, color = color_bar)+ - scale_fill_manual(values=scale_fill)+ ylab(ylab) + xlab(xlab) + labs(fill = lab_title) + theme_cowplot() + # Modifications esthétiques du graphique + + ## Modification des couleurs et de l'ordre d'attribution des couleurs + if (length(values_fill) > 0) { + if (length(breaks_fill) > 0) { + gg <- gg + scale_fill_manual(values = values_fill, breaks = breaks_fill) + }else{ + gg <- gg + scale_fill_manual(values = values_fill) + } + } + ## Ajustement du texte de l'axe x if (modif_x_axis) { gg <- gg + theme(axis.text.x = element_text(angle = 45, size = 10, hjust = 1)) } + ## Réajustement du ratio hauteur / longueur du graphique if (fix_ratio) { gg <- gg + coord_fixed(ratio = c_fix) diff --git a/fonctions/library.R b/fonctions/library.R new file mode 100644 index 0000000000000000000000000000000000000000..874f0fc584cdb8b759593e7a867540525c5c9ff3 --- /dev/null +++ b/fonctions/library.R @@ -0,0 +1,6 @@ +suppressWarnings(suppressMessages(library(cowplot))) +suppressWarnings(suppressMessages(library(dplyr))) +suppressWarnings(suppressMessages(library(ggplot2))) +suppressWarnings(suppressMessages(library(RMySQL))) +suppressWarnings(suppressMessages(library(RPostgreSQL))) +suppressWarnings(suppressMessages(library(stringr))) \ No newline at end of file diff --git a/sample_pipeline.R b/sample_pipeline.R index 1dad83fc4f2ea0f9a8e7ae7fdaf42d86125cc4a7..9378f49322f8bfa353b2c4acbd9692659761b033 100644 --- a/sample_pipeline.R +++ b/sample_pipeline.R @@ -1,5 +1,65 @@ -# Petite indications à droite à gauche -On va tester des conflits +# Example of an easy summary pipeline +# get a graph of the evolution participation +# get a graph of the species frequence -# Lalalalalala \ No newline at end of file + +source("fonctions/library.R") + +source("fonctions/fct_check_columns.R") +source("fonctions/fct_ftp.R") +source("fonctions/fct_participation.R") +source("fonctions/fct_time_series.R") +source("fonctions/fct_description.R") + +# First step: Import data ---- +download_from_ftp(file_folder_server = "/Vigie-Nature/",file_to_download = "export_vne_vdt.csv", destination_folder = "data/") + + +# load data +data_export <- data.table::fread("data/export_vne_vdt.csv") + + +# Second step: get participation information ---- +stats_globales(data_export, count = "sessions", selectAnnee = "table") + +# Third step : graph +## - Verification of required columns +required_columns = c("session_id", "session_date") +## - the graph is displayed only if the columns are present +if (check_column(df = data_export, vec_colnames = required_columns)) { + ## - Data frame modification : + ## -> creation of session_year + ## -> creation of session_season + ## -> keeping only columns of interest + ## -> delete duplicated lines + data_graphic = data_export %>% + dplyr::mutate(session_year = strftime(session_date, "%Y"), + session_month = strftime(session_date, "%m"), + session_season = dplyr::case_when( + session_month %in% c("01", "02", "03") ~ "Winter", + session_month %in% c("04", "05", "06") ~ "Spring", + session_month %in% c("07", "08", "09") ~ "Summer", + session_month %in% c("10", "11", "12") ~ "Autumn")) %>% + select(session_id, session_year, session_season) %>% + unique() + + ## - Display graph + barplot_time_series(df = data_graphic, x = "session_year", fill = "session_season", + lab_title = "Saison", ylab = "Nombre de sessions", + xlab = "Années", modif_x_axis = T, + values_fill = c("#DA7422", "#B8D4E3", "#9BC53D", "#FDE74C"), + breaks_fill = c("Autumn", "Winter", "Spring", "Summer") ) +} + + + + +# Third step: get species repartitions ---- +species_repartition <- calculate_species_occurences(data_export) + + + +# remove the file from disk because it was a sample +file.remove("data/export_spipoll.csv") +