Lander Analytics
ChatGPT
You are an experienced R programmer. Write a function whose input is a path to a folder and the output is a count of each of the different file types in the folder.
# Count the files in a folder by file extension.
#
# dirPath: path to an existing directory; an error is raised otherwise.
# Returns a `table` (dimension name `fileExt`) whose names are the file
# extensions found and whose values are the number of files with that
# extension. Files without an extension are counted under "".
fileTypeCounts <- function(dirPath) {
  # Ensure the directory exists
  if (!dir.exists(dirPath)) {
    stop("Directory not found")
  }
  # A non-recursive list.files() also returns sub-directories; drop them so
  # that only files contribute to the counts
  entries <- list.files(dirPath, full.names = TRUE)
  fileList <- entries[!dir.exists(entries)]
  # Tabulate the extensions
  fileExt <- tools::file_ext(fileList)
  table(fileExt)
}
ChatGPT
Can you rebuild it with an argument that lets us specify which filetypes to consider?
# Count the files in a folder by file extension, optionally restricted to a
# set of extensions.
#
# dirPath:   path to an existing directory; an error is raised otherwise.
# fileTypes: optional character vector of extensions (without the dot) to
#            keep; NULL (the default) keeps every extension.
# Returns a `table` (dimension name `fileExt`) of counts per extension.
fileTypeCounts <- function(dirPath, fileTypes = NULL) {
  # Ensure the directory exists
  if (!dir.exists(dirPath)) {
    stop("Directory not found")
  }
  # A non-recursive list.files() also returns sub-directories; drop them so
  # that only files contribute to the counts
  entries <- list.files(dirPath, full.names = TRUE)
  fileList <- entries[!dir.exists(entries)]
  fileExt <- tools::file_ext(fileList)
  # If specific file types are provided, only keep those
  if (!is.null(fileTypes)) {
    fileExt <- fileExt[fileExt %in% fileTypes]
  }
  table(fileExt)
}
ChatGPT
Can you rewrite it to use the tidyverse?
library(purrr)
library(dplyr)
library(fs)
# Count the files in a folder by file extension (tidyverse version).
#
# dirPath:   path to an existing directory; an error is raised otherwise.
# fileTypes: optional character vector of extensions (without the dot) to
#            keep; NULL (the default) keeps every extension.
# Returns a tibble with columns File_Type and Count, most frequent first.
fileTypeCounts <- function(dirPath, fileTypes = NULL) {
  # Ensure the directory exists
  if (!dir.exists(dirPath)) {
    stop("Directory not found")
  }
  # type = "file" keeps sub-directories out of the listing, so they are not
  # counted as files with an empty extension
  fileList <- fs::dir_ls(dirPath, type = "file")
  # fs::path_ext() is vectorised, so no per-element mapping is needed
  fileExt <- as.character(fs::path_ext(fileList))
  # If specific file types are provided, only keep those
  if (!is.null(fileTypes)) {
    fileExt <- fileExt[fileExt %in% fileTypes]
  }
  # One row per extension, sorted by descending frequency
  extTable <- dplyr::tibble(File_Type = fileExt)
  dplyr::count(extTable, File_Type, sort = TRUE, name = "Count")
}
ChatGPT
If you are going to use package::function() then you don’t need to load the packages at the beginning.
# Count the files in a folder by file extension, using only fully
# qualified package::function() calls so that nothing has to be attached.
#
# dirPath:   path to an existing directory; an error is raised otherwise.
# fileTypes: optional character vector of extensions (without the dot) to
#            keep; NULL (the default) keeps every extension.
# Returns a tibble with columns File_Type and Count, most frequent first.
fileTypeCounts <- function(dirPath, fileTypes = NULL) {
  # Ensure the directory exists
  if (!fs::dir_exists(dirPath)) {
    stop("Directory not found")
  }
  # type = "file" keeps sub-directories out of the listing
  fileList <- fs::dir_ls(dirPath, type = "file")
  # fs::path_ext() is vectorised, so no per-element mapping is needed
  fileExt <- as.character(fs::path_ext(fileList))
  # If specific file types are provided, only keep those
  if (!is.null(fileTypes)) {
    fileExt <- fileExt[fileExt %in% fileTypes]
  }
  # Plain function calls instead of `%>%`: the pipe operator is only
  # available when a package that provides it has been attached
  extTable <- tibble::tibble(File_Type = fileExt)
  dplyr::count(extTable, File_Type, sort = TRUE, name = "Count")
}
You're absolutely correct. If we're specifying the package name each time we call a function from it (as in package::function()), we don't actually need to load the packages with library(). This can be especially useful if we want to avoid namespace conflicts between packages.
This version has the advantage of being more explicit about where each function comes from, which can be helpful in understanding the code and avoiding conflicts.
ChatGPT
I noticed you are listing all files then only keeping the filetypes we want, if we specified any. Can you change it so that when listing files we only search for the specified filetypes, if any?
# Count the files in a folder by file extension, listing only the requested
# file types (one glob pattern per type).
#
# dirPath:   path to an existing directory; an error is raised otherwise.
# fileTypes: optional character vector of extensions (without the dot) to
#            search for; NULL (the default) lists every file.
# Returns a tibble with columns File_Type and Count, most frequent first.
fileTypeCounts <- function(dirPath, fileTypes = NULL) {
  # Ensure the directory exists
  if (!fs::dir_exists(dirPath)) {
    stop("Directory not found")
  }
  # One "*.ext" pattern per requested type, or "*" for everything
  if (is.null(fileTypes)) {
    glob_pattern <- "*"
  } else {
    glob_pattern <- paste0("*.", fileTypes)
  }
  # List the files for each pattern; type = "file" keeps sub-directories out
  perPattern <- purrr::map(
    glob_pattern,
    function(pattern) fs::dir_ls(dirPath, type = "file", glob = pattern)
  )
  # as.character() turns the NULL produced by unlist() on an empty list
  # into an empty character vector
  fileList <- as.character(unlist(perPattern))
  fileExt <- as.character(fs::path_ext(fileList))
  # Plain function calls instead of `%>%`: the pipe operator is only
  # available when a package that provides it has been attached
  extTable <- tibble::tibble(File_Type = fileExt)
  dplyr::count(extTable, File_Type, sort = TRUE, name = "Count")
}
ChatGPT
Rather than using purrr::map() to apply fs::dir_ls() to each element in glob_pattern, make a proper regex for the filetypes to provide to glob_pattern.
# Count the files in a directory by extension, building ONE glob pattern of
# the form "*.{ext1,ext2}" from fileTypes and passing it to fs::dir_ls().
#
# dirPath:   path to the directory; stops with "Directory not found" when
#            fs::dir_exists() is FALSE.
# fileTypes: optional character vector of extensions; NULL lists everything.
# Returns a tibble with columns File_Type and Count, sorted by count.
#
# NOTE(review): the follow-up message in this transcript reports that this
# version returns 0 results when fileTypes is given; presumably the
# "{a,b}" alternation is not expanded by the glob matching used by
# fs::dir_ls() -- confirm against the fs documentation. The later versions
# switch to the `regexp` argument instead.
# NOTE(review): `%>%` is used below although no package is attached in this
# version; it presumably relies on an earlier library() call in the session.
fileTypeCounts <- function(dirPath, fileTypes = NULL) {
# Ensure the directory exists
if(!fs::dir_exists(dirPath)) {
stop("Directory not found")
}
# If specific file types are provided, modify the glob pattern accordingly
# (comma-separated extensions wrapped in braces after "*.")
if (!is.null(fileTypes)) {
glob_pattern <- paste0("*.{", paste(fileTypes, collapse = ","), "}")
} else {
glob_pattern <- "*"
}
# List files in the directory according to the glob pattern
fileList <- fs::dir_ls(dirPath, glob = glob_pattern)
# Get file extensions (one fs::path_ext() call per listed path)
fileExt <- purrr::map_chr(fileList, ~ fs::path_ext(.x))
# Create a frequency count of file extensions: one row per extension,
# sorted by descending count, with columns renamed to File_Type and Count
fileTypeCounts <- fileExt %>%
tibble::as_tibble() %>%
dplyr::group_by(value) %>%
dplyr::tally(sort = TRUE) %>%
dplyr::rename(File_Type = value, Count = n)
return(fileTypeCounts)
}You're right, creating a regex pattern to match the specified file types can be a more elegant solution. The function fs::dir_ls() can accept a regex pattern in its glob argument.
ChatGPT
That returns 0 results. I think the glob.pattern is wrong and that you should use the regex argument, not the glob argument.
# Count the files in a directory by extension, restricting the listing to
# the requested file types with a regular expression.
#
# dirPath:   path to an existing directory; an error is raised otherwise.
# fileTypes: optional character vector of extensions (without the dot);
#            they are inserted into the regular expression as given.
#            NULL (the default) lists every file.
# Returns a tibble with columns File_Type and Count, most frequent first.
fileTypeCounts <- function(dirPath, fileTypes = NULL) {
  # Ensure the directory exists
  if (!fs::dir_exists(dirPath)) {
    stop("Directory not found")
  }
  # Match paths ending in ".<one of fileTypes>", or any path at all
  if (is.null(fileTypes)) {
    regex_pattern <- ".*"
  } else {
    regex_pattern <- paste0(".*\\.(", paste(fileTypes, collapse = "|"), ")$")
  }
  # type = "file" keeps sub-directories out of the counts
  fileList <- fs::dir_ls(dirPath, type = "file", regexp = regex_pattern)
  # fs::path_ext() is vectorised, so no per-element mapping is needed
  fileExt <- as.character(fs::path_ext(fileList))
  # Plain function calls instead of `%>%`: the pipe operator is only
  # available when a package that provides it has been attached
  extTable <- tibble::tibble(File_Type = fileExt)
  dplyr::count(extTable, File_Type, sort = TRUE, name = "Count")
}
ChatGPT
Can you write the roxygen documentation for this function?
#' Count File Types in Directory
#'
#' Counts the number of files of each type in a directory. The file type is
#' the file extension. Sub-directories are not counted. If specific file
#' types are provided, only files of those types are listed and counted.
#'
#' @param dirPath Character string specifying the path to the directory.
#' @param fileTypes Optional character vector of file extensions (without
#'   the leading dot) to consider. The values are inserted into a regular
#'   expression as given. If NULL (the default), all file types are
#'   considered.
#'
#' @return A tibble with one row per file type and the columns `File_Type`
#'   and `Count`, sorted by descending count.
#'
#' @examples
#' \dontrun{
#' fileTypeCounts("/path/to/folder", c("txt", "csv"))
#' }
#'
#' @export
fileTypeCounts <- function(dirPath, fileTypes = NULL) {
  # Ensure the directory exists
  if (!fs::dir_exists(dirPath)) {
    stop("Directory not found")
  }
  # Match paths ending in ".<one of fileTypes>", or any path at all
  if (is.null(fileTypes)) {
    regex_pattern <- ".*"
  } else {
    regex_pattern <- paste0(".*\\.(", paste(fileTypes, collapse = "|"), ")$")
  }
  # type = "file" keeps sub-directories out of the counts
  fileList <- fs::dir_ls(dirPath, type = "file", regexp = regex_pattern)
  # fs::path_ext() is vectorised, so no per-element mapping is needed
  fileExt <- as.character(fs::path_ext(fileList))
  # Plain function calls instead of `%>%`: the pipe operator is only
  # available when a package that provides it has been attached
  extTable <- tibble::tibble(File_Type = fileExt)
  dplyr::count(extTable, File_Type, sort = TRUE, name = "Count")
}
The \dontrun{} around the example is used because the example includes a directory path that won't exist on the user's system. This prevents R from trying to run the example when checking the package.
ChatGPT
Make a similar function that calculates the number of lines in each file and returns the average for each file type. For file types where the number of lines cannot be computed, it should return NA. It should have the same arguments as this function and allow us to specify certain file types. It should use the tidyverse and package::function() notation.
# Average number of lines per file, by file type.
#
# dirPath:   path to an existing directory; an error is raised otherwise.
# fileTypes: optional character vector of extensions (without the dot);
#            they are inserted into the regular expression as given.
#            NULL (the default) considers every file.
# Returns a tibble with columns File_Type and Avg_Lines (one row per file
# type). Avg_Lines is NA for a type when none of its files could be read.
avgLinesPerFile <- function(dirPath, fileTypes = NULL) {
  # Ensure the directory exists
  if (!fs::dir_exists(dirPath)) {
    stop("Directory not found")
  }
  # Match paths ending in ".<one of fileTypes>", or any path at all
  if (is.null(fileTypes)) {
    regex_pattern <- ".*"
  } else {
    regex_pattern <- paste0(".*\\.(", paste(fileTypes, collapse = "|"), ")$")
  }
  # type = "file" keeps sub-directories out, so readLines() is never
  # pointed at a directory
  fileList <- fs::dir_ls(dirPath, type = "file", regexp = regex_pattern)
  fileExt <- as.character(fs::path_ext(fileList))
  # Line count per file; a file that cannot be read contributes NA.
  # Both branches return a double so that map_dbl() is type-stable.
  numLines <- purrr::map_dbl(fileList, function(path) {
    tryCatch(
      as.numeric(length(readLines(path))),
      error = function(e) NA_real_
    )
  })
  fileData <- tibble::tibble(File_Type = fileExt, Num_Lines = unname(numLines))
  # Plain function calls instead of `%>%`: the pipe operator is only
  # available when a package that provides it has been attached
  avgLines <- dplyr::summarise(
    dplyr::group_by(fileData, File_Type),
    Avg_Lines = mean(Num_Lines, na.rm = TRUE),
    .groups = "drop"
  )
  # mean(na.rm = TRUE) over nothing but NA yields NaN; report NA instead
  avgLines$Avg_Lines[is.nan(avgLines$Avg_Lines)] <- NA_real_
  avgLines
}
ChatGPT
Now write a function that uses this function in order to generate a bar plot of the average number of lines for each file type.
#' Bar Plot of Average Lines Per File by File Type
#'
#' Draws one bar per file type whose height is the average number of lines
#' of the files of that type in a directory. The averages come from
#' `avgLinesPerFile()`, which is called with the same two arguments.
#'
#' @param dirPath Path to the directory, passed on to `avgLinesPerFile()`.
#' @param fileTypes Optional file types, passed on to `avgLinesPerFile()`.
#' @return A ggplot object.
#'
#' @export
plotAvgLinesPerFile <- function(dirPath, fileTypes = NULL) {
  # Per-type averages to plot
  lineSummary <- avgLinesPerFile(dirPath, fileTypes)

  # Assemble the chart step by step: bars, theme, tilted labels, titles
  chart <- ggplot2::ggplot(
    lineSummary,
    ggplot2::aes(x = File_Type, y = Avg_Lines)
  )
  chart <- chart + ggplot2::geom_bar(stat = "identity", fill = "skyblue")
  chart <- chart + ggplot2::theme_minimal()
  chart <- chart + ggplot2::theme(
    axis.text.x = ggplot2::element_text(angle = 45, hjust = 1)
  )
  chart <- chart + ggplot2::labs(
    x = "File Type",
    y = "Average Number of Lines",
    title = "Average Number of Lines per File by File Type"
  )
  return(chart)
}
ChatGPT
Modify the function to allow users to choose either ggplot or echarts and then make whichever version the user requested.
# Bar plot of the average number of lines per file type, drawn with either
# ggplot2 or an echarts binding, selected through plotLibrary.
#
# dirPath, fileTypes: passed unchanged to avgLinesPerFile().
# plotLibrary: "ggplot" (default) or "echarts"; any other value stops with
#              an error.
# Returns the plot object built by the selected branch.
#
# NOTE(review): the else branch calls functions through the `echartr::`
# namespace; the follow-up message in this transcript points out that
# echarts4r was intended -- confirm that such a package and these functions
# exist before relying on this version.
# NOTE(review): `%>%` is used although no package is attached here;
# presumably an earlier library() call in the session provides it.
plotAvgLinesPerFile <- function(dirPath, fileTypes = NULL, plotLibrary = "ggplot") {
# Calculate average lines per file by file type
avgLines <- avgLinesPerFile(dirPath, fileTypes)
# Check the plot library argument (runs after the directory has been read)
if (!plotLibrary %in% c("ggplot", "echarts")) {
stop("Invalid plot library. Choose either 'ggplot' or 'echarts'.")
}
# Generate the bar plot with the specified library
if (plotLibrary == "ggplot") {
# Generate the ggplot bar plot: sky-blue bars, minimal theme, x labels
# rotated 45 degrees, axis labels and title
plot <- ggplot2::ggplot(avgLines, ggplot2::aes(x = File_Type, y = Avg_Lines)) +
ggplot2::geom_bar(stat = "identity", fill = "skyblue") +
ggplot2::theme_minimal() +
ggplot2::theme(axis.text.x = ggplot2::element_text(angle = 45, hjust = 1)) +
ggplot2::labs(x = "File Type", y = "Average Number of Lines",
title = "Average Number of Lines per File by File Type")
} else {
# Generate the echarts bar plot
plot <- echartr::echartr(avgLines, File_Type, Avg_Lines, "bar") %>%
echartr::setToolbox(show = TRUE) %>%
echartr::setTitle("Average Number of Lines per File by File Type") %>%
echartr::setxAxis(name = "File Type") %>%
echartr::setyAxis(name = "Average Number of Lines")
}
return(plot)
}ChatGPT
It looks like you’re trying to use echartr rather than echarts4r.
# Bar plot of the average number of lines per file type, drawn with either
# ggplot2 or echarts4r.
#
# dirPath, fileTypes: passed unchanged to avgLinesPerFile().
# plotLibrary: "ggplot" (default) or "echarts4r"; any other value stops
#              with an error.
# Returns a ggplot object or an echarts4r chart, depending on plotLibrary.
plotAvgLinesPerFile <- function(dirPath, fileTypes = NULL, plotLibrary = "ggplot") {
  # Validate the argument first so a bad value fails before any file is read
  if (!plotLibrary %in% c("ggplot", "echarts4r")) {
    stop("Invalid plot library. Choose either 'ggplot' or 'echarts4r'.")
  }
  # Calculate average lines per file by file type
  avgLines <- avgLinesPerFile(dirPath, fileTypes)
  if (plotLibrary == "ggplot") {
    # ggplot2 bar plot
    plot <- ggplot2::ggplot(avgLines, ggplot2::aes(x = File_Type, y = Avg_Lines)) +
      ggplot2::geom_bar(stat = "identity", fill = "skyblue") +
      ggplot2::theme_minimal() +
      ggplot2::theme(axis.text.x = ggplot2::element_text(angle = 45, hjust = 1)) +
      ggplot2::labs(x = "File Type", y = "Average Number of Lines",
                    title = "Average Number of Lines per File by File Type")
  } else {
    # echarts4r bar plot, built with plain calls: `%>%` is only available
    # when a package that provides it has been attached
    chart <- echarts4r::e_charts(avgLines, File_Type)
    chart <- echarts4r::e_bar(chart, Avg_Lines)
    plot <- echarts4r::e_title(chart, "Average Number of Lines per File by File Type")
  }
  plot
}
ChatGPT
Instead of checking for allowed plot libraries in an if statement, make the default to the argument a vector of allowable types then check with match.arg().
# Bar plot of the average number of lines per file type, drawn with either
# ggplot2 or echarts4r.
#
# dirPath, fileTypes: passed unchanged to avgLinesPerFile().
# plotLibrary: one of "ggplot" or "echarts4r"; when not supplied the first
#              choice ("ggplot") is used. Checked with match.arg().
# Returns a ggplot object or an echarts4r chart, depending on plotLibrary.
plotAvgLinesPerFile <- function(dirPath, fileTypes = NULL, plotLibrary = c("ggplot", "echarts4r")) {
  # Resolve the choice first so a bad value fails before any file is read
  plotLibrary <- match.arg(plotLibrary)
  # Calculate average lines per file by file type
  avgLines <- avgLinesPerFile(dirPath, fileTypes)
  if (plotLibrary == "ggplot") {
    # ggplot2 bar plot
    plot <- ggplot2::ggplot(avgLines, ggplot2::aes(x = File_Type, y = Avg_Lines)) +
      ggplot2::geom_bar(stat = "identity", fill = "skyblue") +
      ggplot2::theme_minimal() +
      ggplot2::theme(axis.text.x = ggplot2::element_text(angle = 45, hjust = 1)) +
      ggplot2::labs(x = "File Type", y = "Average Number of Lines",
                    title = "Average Number of Lines per File by File Type")
  } else {
    # echarts4r bar plot, built with plain calls: `%>%` is only available
    # when a package that provides it has been attached
    chart <- echarts4r::e_charts(avgLines, File_Type)
    chart <- echarts4r::e_bar(chart, Avg_Lines)
    plot <- echarts4r::e_title(chart, "Average Number of Lines per File by File Type")
  }
  plot
}
ChatGPT
Rather than put all the plotting code in this function, put each type of plot in its own helper function, then have this function call one of those based on what the user specified.
#' Generate ggplot Bar Plot
#'
#' Builds a sky-blue bar chart of `Avg_Lines` against `File_Type` with a
#' minimal theme and x-axis labels rotated by 45 degrees.
#'
#' @param data Data frame with the columns `File_Type` and `Avg_Lines`.
#' @return ggplot object.
genGgplot <- function(data) {
  # Start from the mapping, then add bars, theme, label rotation and titles
  chart <- ggplot2::ggplot(data, ggplot2::aes(x = File_Type, y = Avg_Lines))
  chart <- chart + ggplot2::geom_bar(stat = "identity", fill = "skyblue")
  chart <- chart + ggplot2::theme_minimal()
  chart <- chart + ggplot2::theme(
    axis.text.x = ggplot2::element_text(angle = 45, hjust = 1)
  )
  chart <- chart + ggplot2::labs(
    x = "File Type",
    y = "Average Number of Lines",
    title = "Average Number of Lines per File by File Type"
  )
  return(chart)
}
#' Generate echarts4r Bar Plot
#'
#' @param data Dataframe containing the average number of lines and file types
#'   in the columns `Avg_Lines` and `File_Type`.
#' @return echarts4r plot object.
genEcharts <- function(data) {
  # Plain function calls instead of `%>%`: the pipe operator is only
  # available when a package that provides it has been attached
  chart <- echarts4r::e_charts(data, File_Type)
  chart <- echarts4r::e_bar(chart, Avg_Lines)
  echarts4r::e_title(chart, "Average Number of Lines per File by File Type")
}

# Bar plot of the average number of lines per file type, delegating the
# drawing to genGgplot() or genEcharts().
#
# dirPath, fileTypes: passed unchanged to avgLinesPerFile().
# plotLibrary: one of "ggplot" or "echarts4r"; when not supplied the first
#              choice ("ggplot") is used. Checked with match.arg().
# Returns whatever the selected helper returns.
plotAvgLinesPerFile <- function(dirPath, fileTypes = NULL, plotLibrary = c("ggplot", "echarts4r")) {
  # Resolve the choice first so a bad value fails before any file is read
  plotLibrary <- match.arg(plotLibrary)
  # Calculate average lines per file by file type
  avgLines <- avgLinesPerFile(dirPath, fileTypes)
  # Generate the bar plot with the specified library
  if (plotLibrary == "ggplot") {
    plot <- genGgplot(avgLines)
  } else {
    plot <- genEcharts(avgLines)
  }
  plot
}
ChatGPT
Almost got it, but use match.arg() here too.
# Bar plot of the average number of lines per file type, dispatching to the
# helper that matches plotLibrary.
#
# dirPath, fileTypes: passed unchanged to avgLinesPerFile().
# plotLibrary: one of "ggplot" or "echarts4r"; when not supplied the first
#              choice ("ggplot") is used. Checked with match.arg().
# Returns whatever the selected helper returns.
plotAvgLinesPerFile <- function(dirPath, fileTypes = NULL, plotLibrary = c("ggplot", "echarts4r")) {
  # Resolve the choice first so a bad value fails before any file is read
  plotLibrary <- match.arg(plotLibrary)
  # Calculate average lines per file by file type
  avgLines <- avgLinesPerFile(dirPath, fileTypes)
  # Store the helper FUNCTIONS, not the results of calling them: only the
  # selected backend is run, so the other one need not even be installed
  plotFunc <- list(
    ggplot = genGgplot,
    echarts4r = genEcharts
  )
  plot <- plotFunc[[plotLibrary]](avgLines)
  plot
}
ChatGPT
Add options for a base R plot and lattice.
#' Generate Base R Bar Plot
#'
#' Draws the bar plot on the current graphics device.
#'
#' @param data Dataframe containing the average number of lines and file types
#'   in the columns `Avg_Lines` and `File_Type`.
#' @return Invisibly, the value returned by `graphics::barplot()` (the bar
#'   midpoints). The plot itself is drawn directly as a side effect.
genBaseRPlot <- function(data) {
  # Namespace-qualified like every other call in this file
  midpoints <- graphics::barplot(
    data$Avg_Lines,
    names.arg = data$File_Type,
    main = "Average Number of Lines per File by File Type",
    xlab = "File Type",
    ylab = "Average Number of Lines",
    col = "skyblue"
  )
  invisible(midpoints)
}
#' Generate Lattice Bar Plot
#'
#' Builds a sky-blue lattice bar chart of `Avg_Lines` by `File_Type`.
#'
#' @param data Data frame with the columns `File_Type` and `Avg_Lines`.
#' @return lattice plot object.
genLatticePlot <- function(data) {
  # Return the chart straight from the call; nothing is drawn until the
  # returned object is printed
  lattice::barchart(
    Avg_Lines ~ File_Type,
    data = data,
    main = "Average Number of Lines per File by File Type",
    xlab = "File Type",
    ylab = "Average Number of Lines",
    col = "skyblue"
  )
}
#' Bar Plot of Average Lines Per File by File Type
#'
#' Computes the average number of lines per file type with
#' `avgLinesPerFile()` and draws it with the requested plotting library.
#' Only the helper for the requested library is run.
#'
#' @param dirPath Character string specifying the path to the directory.
#' @param fileTypes Optional character vector specifying which file types to consider.
#' If NULL (the default), all file types are considered.
#' @param plotLibrary Character string specifying which plotting library to use.
#' Options are "ggplot", "echarts4r", "base", or "lattice". When not
#' supplied, "ggplot" (the first option) is used.
#' @return A plot object for ggplot, echarts4r, and lattice. Base R plots are
#' output directly and the value of `genBaseRPlot()` is returned.
#'
#' @examples
#' \dontrun{
#' plotAvgLinesPerFile("/path/to/folder", c("txt", "csv", "R"), "lattice")
#' }
#'
#' @export
plotAvgLinesPerFile <- function(dirPath, fileTypes = NULL, plotLibrary = c("ggplot", "echarts4r", "base", "lattice")) {
  # Resolve the choice first so a bad value fails before any file is read
  plotLibrary <- match.arg(plotLibrary)
  # Calculate average lines per file by file type
  avgLines <- avgLinesPerFile(dirPath, fileTypes)
  # Store the helper FUNCTIONS, not the results of calling them. Calling all
  # of them up front would draw the base R plot on every call and require
  # every plotting package to be installed, whatever the user asked for.
  plotFunc <- list(
    ggplot = genGgplot,
    echarts4r = genEcharts,
    base = genBaseRPlot,
    lattice = genLatticePlot
  )
  # Generate the bar plot with the specified library only
  plot <- plotFunc[[plotLibrary]](avgLines)
  plot
}
## [,1]
## [1,] 0.7
## [2,] 1.9
## [3,] 3.1