Building an R Package with LLMs

Jared P. Lander

Lander Analytics

What are we Doing?

ChatGPT

You are an experienced R programmer. Write a function whose input is a path to a folder and the output is a count of each of the different file types in the folder.

# Count the files in a folder by file extension.
#   dirPath - path to the folder to inspect
# Returns a table of counts named by extension (as reported by
# tools::file_ext()). Stops with "Directory not found" when dirPath is not an
# existing directory.
fileTypeCounts <- function(dirPath) {
  # Ensure the directory exists
  if (!dir.exists(dirPath)) {
    stop("Directory not found")
  }

  # list.files() also returns sub-directories; drop them so that only real
  # files contribute to the counts
  entries <- list.files(dirPath, full.names = TRUE)
  fileList <- entries[!dir.exists(entries)]

  # Get file extensions
  fileExt <- tools::file_ext(fileList)

  # Frequency count of the extensions (the variable name becomes the table's
  # header when printed)
  table(fileExt)
}

fileTypeCounts('example_files')
## fileExt
##  csv json  txt 
##    3    1    2

ChatGPT

Can you rebuild it with an argument that lets us specify which filetypes to consider?

# Count the files in a folder by file extension, optionally restricted to a
# chosen set of extensions.
#   dirPath   - path to the folder to inspect
#   fileTypes - optional character vector of extensions (without the dot) to
#               keep; NULL (the default) keeps every extension
# Returns a table of counts named by extension. Stops with "Directory not
# found" when dirPath is not an existing directory.
fileTypeCounts <- function(dirPath, fileTypes = NULL) {
  # Ensure the directory exists
  if (!dir.exists(dirPath)) {
    stop("Directory not found")
  }

  # list.files() also returns sub-directories; drop them so that a folder
  # named e.g. "old.csv" is not counted as a csv file
  entries <- list.files(dirPath, full.names = TRUE)
  fileList <- entries[!dir.exists(entries)]

  # Get file extensions
  fileExt <- tools::file_ext(fileList)

  # If specific file types are provided, only keep those
  if (!is.null(fileTypes)) {
    fileExt <- fileExt[fileExt %in% fileTypes]
  }

  # Frequency count of the extensions (the variable name becomes the table's
  # header when printed)
  table(fileExt)
}

fileTypeCounts('example_files')
## fileExt
##  csv json  txt 
##    3    1    2
fileTypeCounts('example_files', fileTypes=c('csv', 'json'))
## fileExt
##  csv json 
##    3    1

ChatGPT

Can you rewrite it to use the tidyverse?

library(purrr)
library(dplyr)
library(fs)

# Tidyverse flavour of the extension counter.
#   dirPath   - path to the folder to inspect
#   fileTypes - optional character vector of extensions to keep; NULL keeps all
# Returns a tibble with columns File_Type and Count, most frequent first.
fileTypeCounts <- function(dirPath, fileTypes = NULL) {
  # Bail out early when the folder is missing
  if (!dir.exists(dirPath)) stop("Directory not found")

  # Every entry in the folder, and the extension of each one
  paths <- fs::dir_ls(dirPath, glob = "*")
  extensions <- purrr::map_chr(paths, fs::path_ext)

  # Narrow down to the requested types, when any were given
  if (!is.null(fileTypes)) {
    extensions <- extensions[extensions %in% fileTypes]
  }

  # One row per extension with its frequency, then friendlier column names
  extensions %>%
    dplyr::as_tibble() %>%
    dplyr::count(value, sort = TRUE) %>%
    dplyr::rename(File_Type = value, Count = n)
}

fileTypeCounts('example_files', fileTypes=c('csv', 'json'))
## # A tibble: 2 × 2
##   File_Type Count
##   <chr>     <int>
## 1 csv           3
## 2 json          1

ChatGPT

If you are going to use package::function() then you don’t need to load the packages at the beginning.

# Extension counter written entirely with package::function() calls, so it
# works without attaching any package.
#   dirPath   - path to the folder to inspect
#   fileTypes - optional character vector of extensions to keep; NULL keeps all
# Returns a tibble with columns File_Type and Count, most frequent first.
fileTypeCounts <- function(dirPath, fileTypes = NULL) {
  # Ensure the directory exists
  if (!fs::dir_exists(dirPath)) {
    stop("Directory not found")
  }

  # List all entries in the directory
  fileList <- fs::dir_ls(dirPath, glob = "*")

  # Get file extensions
  fileExt <- purrr::map_chr(fileList, \(path) fs::path_ext(path))

  # If specific file types are provided, only keep those
  if (!is.null(fileTypes)) {
    fileExt <- fileExt[fileExt %in% fileTypes]
  }

  # Frequency count of the extensions. The native pipe is used because
  # `%>%` is only available once magrittr/dplyr has been attached, which
  # this version is meant to avoid.
  fileExt |>
    tibble::as_tibble() |>
    dplyr::group_by(value) |>
    dplyr::tally(sort = TRUE) |>
    dplyr::rename(File_Type = value, Count = n)
}

You're absolutely correct. If we're specifying the package name each time we call a function from it (as in package::function()), we don't actually need to load the packages with library(). This can be especially useful if we want to avoid namespace conflicts between packages.
This version has the advantage of being more explicit about where each function comes from, which can be helpful in understanding the code and avoiding conflicts.

ChatGPT

I noticed you are listing all files then only keeping the filetypes we want, if we specified any. Can you change it so that when listing files we only search for the specified filetypes, if any?

# Extension counter that only lists the requested file types, issuing one
# directory listing per type.
#   dirPath   - path to the folder to inspect
#   fileTypes - optional character vector of extensions to list; NULL lists all
# Returns a tibble with columns File_Type and Count, most frequent first.
fileTypeCounts <- function(dirPath, fileTypes = NULL) {
  if (!fs::dir_exists(dirPath)) stop("Directory not found")

  # One glob per requested type, or a single catch-all glob
  glob_pattern <- if (is.null(fileTypes)) "*" else paste0("*.", fileTypes)

  # Run one listing per glob, then flatten the per-glob results
  listings <- purrr::map(glob_pattern, function(g) fs::dir_ls(dirPath, glob = g))
  paths <- unlist(listings)

  # Extension of every path found
  extensions <- purrr::map_chr(paths, fs::path_ext)

  # One row per extension with its frequency, then friendlier column names
  extensions %>%
    tibble::as_tibble() %>%
    dplyr::count(value, sort = TRUE) %>%
    dplyr::rename(File_Type = value, Count = n)
}

ChatGPT

Instead of using purrr::map() to apply fs::dir_ls() to each element in glob_pattern, make a proper regex for the filetypes to provide to glob_pattern.

# Extension counter that folds all requested types into a single brace-style
# glob and performs one directory listing.
#   dirPath   - path to the folder to inspect
#   fileTypes - optional character vector of extensions to list; NULL lists all
# Returns a tibble with columns File_Type and Count, most frequent first.
fileTypeCounts <- function(dirPath, fileTypes = NULL) {
  if (!fs::dir_exists(dirPath)) stop("Directory not found")

  # Catch-all glob unless specific types were requested
  glob_pattern <- "*"
  if (!is.null(fileTypes)) {
    joined <- paste(fileTypes, collapse = ",")
    glob_pattern <- paste0("*.{", joined, "}")
  }

  # Single listing driven by the glob built above
  paths <- fs::dir_ls(dirPath, glob = glob_pattern)

  # Extension of every path found
  extensions <- purrr::map_chr(paths, fs::path_ext)

  # One row per extension with its frequency, then friendlier column names
  extensions %>%
    tibble::as_tibble() %>%
    dplyr::count(value, sort = TRUE) %>%
    dplyr::rename(File_Type = value, Count = n)
}

You're right, creating a regex pattern to match the specified file types can be a more elegant solution. The function fs::dir_ls() can accept a regex pattern in its glob argument.

fileTypeCounts('example_files', fileTypes=c('csv', 'json'))
## # A tibble: 0 × 2
## # ℹ 2 variables: File_Type <chr>, Count <int>

ChatGPT

That returns 0 results. I think the glob_pattern is wrong and that you should use the regexp argument, not the glob argument.

# Extension counter that selects the requested file types with a regular
# expression passed to fs::dir_ls(regexp = ).
#   dirPath   - path to the folder to inspect
#   fileTypes - optional character vector of extensions to list; NULL lists all
# Returns a tibble with columns File_Type and Count, most frequent first.
fileTypeCounts <- function(dirPath, fileTypes = NULL) {
  if (!fs::dir_exists(dirPath)) stop("Directory not found")

  # Match everything by default; otherwise match names that end in a dot
  # followed by one of the requested extensions
  regex_pattern <- ".*"
  if (!is.null(fileTypes)) {
    alternatives <- paste(fileTypes, collapse = "|")
    regex_pattern <- paste0(".*\\.(", alternatives, ")$")
  }

  # Single listing filtered by the pattern built above
  paths <- fs::dir_ls(dirPath, regexp = regex_pattern)

  # Extension of every path found
  extensions <- purrr::map_chr(paths, fs::path_ext)

  # One row per extension with its frequency, then friendlier column names
  extensions %>%
    tibble::as_tibble() %>%
    dplyr::count(value, sort = TRUE) %>%
    dplyr::rename(File_Type = value, Count = n)
}

fileTypeCounts('example_files', fileTypes=c('csv', 'json'))
## # A tibble: 2 × 2
##   File_Type Count
##   <chr>     <int>
## 1 csv           3
## 2 json          1

ChatGPT

Can you write the roxygen documentation for this function?

#' Count File Types in Directory
#'
#' Counts the entries of a directory by file extension. When specific file
#' types are supplied, only entries whose names end in a dot followed by one
#' of those extensions are listed and counted.
#'
#' @param dirPath Character string specifying the path to the directory.
#' @param fileTypes Optional character vector specifying which file types
#'   (extensions without the leading dot) to consider. The values are pasted
#'   into a regular expression as alternatives. If NULL (the default), all
#'   file types are considered.
#'
#' @return A tibble with one row for each file type and the columns
#'   `File_Type` and `Count`. The counts are sorted in descending order.
#'
#' @examples
#' \dontrun{
#' fileTypeCounts("/path/to/folder", c("txt", "csv"))
#' }
#'
#' @export
fileTypeCounts <- function(dirPath, fileTypes = NULL) {
  # Ensure the directory exists
  if (!fs::dir_exists(dirPath)) {
    stop("Directory not found")
  }

  # If specific file types are provided, modify the regex pattern accordingly
  if (!is.null(fileTypes)) {
    regex_pattern <- paste0(".*\\.(", paste(fileTypes, collapse = "|"), ")$")
  } else {
    regex_pattern <- ".*"  # Matches any file
  }

  # List files in the directory according to the regex pattern
  fileList <- fs::dir_ls(dirPath, regexp = regex_pattern)

  # Get file extensions
  fileExt <- purrr::map_chr(fileList, \(path) fs::path_ext(path))

  # Frequency count of the extensions. The native pipe is used so the
  # function does not depend on `%>%` having been attached or imported.
  fileExt |>
    tibble::as_tibble() |>
    dplyr::group_by(value) |>
    dplyr::tally(sort = TRUE) |>
    dplyr::rename(File_Type = value, Count = n)
}

The \dontrun{} around the example is used because the example includes a directory path that won't exist on the user's system. This prevents R from trying to run the example when checking the package.

ChatGPT

Make a similar function, that calculates the number of lines of each file and returns the average for each file type. For file types where the number of lines cannot be computed, that should return NA. It should have the same arguments as this function and allow us to specify certain file types. It should use the tidyverse and package::function() notation.

#' Average Number of Lines per File by File Type
#'
#' Lists the entries of a directory, counts the lines of each one with
#' `readLines()`, and averages the counts within each file extension.
#'
#' @param dirPath Character string specifying the path to the directory.
#' @param fileTypes Optional character vector specifying which file types
#'   (extensions without the leading dot) to consider. If NULL (the default),
#'   all file types are considered.
#'
#' @return A tibble with one row per file type and the columns `File_Type`
#'   and `Avg_Lines`. Entries for which `readLines()` raises an error are
#'   ignored in the average; a file type for which no entry could be read
#'   gets `NA`.
avgLinesPerFile <- function(dirPath, fileTypes = NULL) {
  # Ensure the directory exists
  if (!fs::dir_exists(dirPath)) {
    stop("Directory not found")
  }

  # If specific file types are provided, modify the regex pattern accordingly
  if (!is.null(fileTypes)) {
    regex_pattern <- paste0(".*\\.(", paste(fileTypes, collapse = "|"), ")$")
  } else {
    regex_pattern <- ".*"  # Matches any file
  }

  # List files in the directory according to the regex pattern
  fileList <- fs::dir_ls(dirPath, regexp = regex_pattern)

  # Get file extensions
  fileExt <- purrr::map_chr(fileList, \(path) fs::path_ext(path))

  # Number of lines per entry; anything readLines() cannot open (for
  # example a sub-directory) yields a typed missing value
  countLines <- function(path) {
    tryCatch(
      as.numeric(length(readLines(path))),
      error = function(e) NA_real_
    )
  }
  numLines <- purrr::map_dbl(fileList, countLines)

  # Combine file extensions and number of lines into a tibble
  fileData <- tibble::tibble(File_Type = fileExt, Num_Lines = numLines)

  # mean(x, na.rm = TRUE) is NaN when every value is missing; report NA
  # instead so "could not be computed" is distinguishable from a number
  meanOrNA <- function(x) {
    if (all(is.na(x))) NA_real_ else mean(x, na.rm = TRUE)
  }

  # Average number of lines for each file type
  fileData |>
    dplyr::group_by(File_Type) |>
    dplyr::summarise(Avg_Lines = meanOrNA(Num_Lines), .groups = "drop")
}

avgLinesPerFile('example_files')
## # A tibble: 3 × 2
##   File_Type Avg_Lines
##   <chr>         <dbl>
## 1 csv            6.67
## 2 json          42   
## 3 txt            8
avgLinesPerFile('example_files', fileTypes=c('csv', 'json'))
## # A tibble: 2 × 2
##   File_Type Avg_Lines
##   <chr>         <dbl>
## 1 csv            6.67
## 2 json          42

ChatGPT

Now write a function that uses this function in order to generate a bar plot of the average number of lines for each file type.

#' Bar Plot of Average Lines Per File by File Type
#'
#' Computes the per-file-type averages with `avgLinesPerFile()` and draws
#' them as a ggplot2 bar chart, one bar per file type.
#'
#' @param dirPath Passed on to `avgLinesPerFile()` as the directory to scan.
#' @param fileTypes Passed on to `avgLinesPerFile()`; NULL by default.
#'
#' @return A ggplot object.
#'
#' @export
plotAvgLinesPerFile <- function(dirPath, fileTypes = NULL) {
  # Summary table that feeds the chart
  lineStats <- avgLinesPerFile(dirPath, fileTypes)

  # Individual chart pieces, assembled below in the usual ggplot order
  bars <- ggplot2::geom_bar(stat = "identity", fill = "skyblue")
  tiltedLabels <- ggplot2::theme(
    axis.text.x = ggplot2::element_text(angle = 45, hjust = 1)
  )
  captions <- ggplot2::labs(
    x = "File Type",
    y = "Average Number of Lines",
    title = "Average Number of Lines per File by File Type"
  )

  ggplot2::ggplot(lineStats, ggplot2::aes(x = File_Type, y = Avg_Lines)) +
    bars +
    ggplot2::theme_minimal() +
    tiltedLabels +
    captions
}

plotAvgLinesPerFile('example_files') |> print()

ChatGPT

Modify the function to allow users to choose either ggplot or echarts and then make whichever version the user requested.

# Bar chart of the average number of lines per file type, drawn with either
# ggplot2 or the echartr package.
#   dirPath, fileTypes - forwarded to avgLinesPerFile()
#   plotLibrary        - "ggplot" (default) or "echarts"
# Returns the plot object built by the chosen library.
plotAvgLinesPerFile <- function(dirPath, fileTypes = NULL, plotLibrary = "ggplot") {
  # Summary table that feeds the chart
  lineStats <- avgLinesPerFile(dirPath, fileTypes)

  # Reject anything other than the two supported back ends
  supported <- c("ggplot", "echarts")
  if (!plotLibrary %in% supported) {
    stop("Invalid plot library. Choose either 'ggplot' or 'echarts'.")
  }

  if (plotLibrary == "ggplot") {
    # ggplot2 version
    chart <- ggplot2::ggplot(lineStats, ggplot2::aes(x = File_Type, y = Avg_Lines)) +
      ggplot2::geom_bar(stat = "identity", fill = "skyblue") +
      ggplot2::theme_minimal() +
      ggplot2::theme(axis.text.x = ggplot2::element_text(angle = 45, hjust = 1)) +
      ggplot2::labs(
        x = "File Type",
        y = "Average Number of Lines",
        title = "Average Number of Lines per File by File Type"
      )
    return(chart)
  }

  # echartr version
  echartr::echartr(lineStats, File_Type, Avg_Lines, "bar") %>%
    echartr::setToolbox(show = TRUE) %>%
    echartr::setTitle("Average Number of Lines per File by File Type") %>%
    echartr::setxAxis(name = "File Type") %>%
    echartr::setyAxis(name = "Average Number of Lines")
}

ChatGPT

It looks like you’re trying to use echartr rather than echarts4r.

# Bar chart of the average number of lines per file type, drawn with either
# ggplot2 or echarts4r.
#   dirPath, fileTypes - forwarded to avgLinesPerFile()
#   plotLibrary        - "ggplot" (default) or "echarts4r"
# Returns the plot object built by the chosen library.
plotAvgLinesPerFile <- function(dirPath, fileTypes = NULL, plotLibrary = "ggplot") {
  # Summary table that feeds the chart
  lineStats <- avgLinesPerFile(dirPath, fileTypes)

  # Reject anything other than the two supported back ends
  supported <- c("ggplot", "echarts4r")
  if (!plotLibrary %in% supported) {
    stop("Invalid plot library. Choose either 'ggplot' or 'echarts4r'.")
  }

  if (plotLibrary == "ggplot") {
    # ggplot2 version
    chart <- ggplot2::ggplot(lineStats, ggplot2::aes(x = File_Type, y = Avg_Lines)) +
      ggplot2::geom_bar(stat = "identity", fill = "skyblue") +
      ggplot2::theme_minimal() +
      ggplot2::theme(axis.text.x = ggplot2::element_text(angle = 45, hjust = 1)) +
      ggplot2::labs(
        x = "File Type",
        y = "Average Number of Lines",
        title = "Average Number of Lines per File by File Type"
      )
    return(chart)
  }

  # echarts4r version
  lineStats %>%
    echarts4r::e_charts(File_Type) %>%
    echarts4r::e_bar(Avg_Lines) %>%
    echarts4r::e_title("Average Number of Lines per File by File Type")
}

plotAvgLinesPerFile('example_files', plotLibrary='echarts4r')

ChatGPT

Instead of checking for allowed plot libraries in an if statement, make the default to the argument a vector of allowable types then check with match.arg().

# Bar chart of the average number of lines per file type. The allowed
# plotting libraries are the values of the plotLibrary default and are
# resolved with match.arg(), so the first one ("ggplot") is used when the
# argument is omitted.
#   dirPath, fileTypes - forwarded to avgLinesPerFile()
# Returns the plot object built by the chosen library.
plotAvgLinesPerFile <- function(dirPath, fileTypes = NULL, plotLibrary = c("ggplot", "echarts4r")) {
  # Summary table that feeds the chart
  lineStats <- avgLinesPerFile(dirPath, fileTypes)

  # Resolve the requested library against the allowed values
  plotLibrary <- match.arg(plotLibrary)

  # Build the chart with whichever library was selected
  switch(
    plotLibrary,
    ggplot = {
      ggplot2::ggplot(lineStats, ggplot2::aes(x = File_Type, y = Avg_Lines)) +
        ggplot2::geom_bar(stat = "identity", fill = "skyblue") +
        ggplot2::theme_minimal() +
        ggplot2::theme(axis.text.x = ggplot2::element_text(angle = 45, hjust = 1)) +
        ggplot2::labs(
          x = "File Type",
          y = "Average Number of Lines",
          title = "Average Number of Lines per File by File Type"
        )
    },
    echarts4r = {
      lineStats %>%
        echarts4r::e_charts(File_Type) %>%
        echarts4r::e_bar(Avg_Lines) %>%
        echarts4r::e_title("Average Number of Lines per File by File Type")
    }
  )
}

ChatGPT

Rather than put all the plotting code in this function, put each type of plot in its own helper function, then have this function call one of those based on what the user specified.

#' Generate ggplot Bar Plot
#'
#' Draws one sky-blue bar per file type, with tilted x-axis labels.
#'
#' @param data Dataframe with the columns `File_Type` and `Avg_Lines`.
#' @return ggplot object.
genGgplot <- function(data) {
  # Individual chart pieces, assembled below in the usual ggplot order
  bars <- ggplot2::geom_bar(stat = "identity", fill = "skyblue")
  tiltedLabels <- ggplot2::theme(
    axis.text.x = ggplot2::element_text(angle = 45, hjust = 1)
  )
  captions <- ggplot2::labs(
    x = "File Type",
    y = "Average Number of Lines",
    title = "Average Number of Lines per File by File Type"
  )

  ggplot2::ggplot(data, ggplot2::aes(x = File_Type, y = Avg_Lines)) +
    bars +
    ggplot2::theme_minimal() +
    tiltedLabels +
    captions
}

#' Generate echarts4r Bar Plot
#'
#' Draws one bar per file type and adds a chart title.
#'
#' @param data Dataframe with the columns `File_Type` and `Avg_Lines`.
#' @return echarts4r plot object.
genEcharts <- function(data) {
  # File types on the x axis, averages as bars, then the title
  data %>%
    echarts4r::e_charts(File_Type) %>%
    echarts4r::e_bar(Avg_Lines) %>%
    echarts4r::e_title("Average Number of Lines per File by File Type")
}

# Bar chart of the average number of lines per file type. The drawing itself
# is delegated to genGgplot() or genEcharts(), depending on plotLibrary.
#   dirPath, fileTypes - forwarded to avgLinesPerFile()
#   plotLibrary        - one of the default's values, resolved with match.arg()
# Returns whatever the selected helper returns.
plotAvgLinesPerFile <- function(dirPath, fileTypes = NULL, plotLibrary = c("ggplot", "echarts4r")) {
  # Summary table that feeds the chart
  lineStats <- avgLinesPerFile(dirPath, fileTypes)

  # Resolve the requested library against the allowed values
  plotLibrary <- match.arg(plotLibrary)

  # Pick the helper for the selected library and let it draw
  generator <- if (plotLibrary == "ggplot") genGgplot else genEcharts
  chart <- generator(lineStats)

  chart
}

ChatGPT

Almost got it, but use match.arg() here too.

# Bar chart of the average number of lines per file type. The plotting
# library is resolved with match.arg() and used as a key into a list of
# helper functions.
#   dirPath, fileTypes - forwarded to avgLinesPerFile()
#   plotLibrary        - one of the default's values; the first is the default
# Returns whatever the selected helper returns.
plotAvgLinesPerFile <- function(dirPath, fileTypes = NULL, plotLibrary = c("ggplot", "echarts4r")) {
  # Validate the requested library before doing any file work
  plotLibrary <- match.arg(plotLibrary)

  # Calculate average lines per file by file type
  avgLines <- avgLinesPerFile(dirPath, fileTypes)

  # Map each library name to its helper. The list stores the functions
  # themselves, not their results: list(ggplot = genGgplot(avgLines), ...)
  # would build every plot on every call, whichever one was requested.
  plotFunc <- list(
    ggplot = genGgplot,
    echarts4r = genEcharts
  )

  # Build only the requested plot
  plot <- plotFunc[[plotLibrary]](avgLines)

  plot
}

ChatGPT

Add options for a base R plot and lattice.

#' Generate Base R Bar Plot
#'
#' Draws the bar plot on the current graphics device.
#'
#' @param data Dataframe containing the average number of lines
#'   (`Avg_Lines`) and file types (`File_Type`).
#' @return The value of `graphics::barplot()` (the bar midpoints), returned
#'   invisibly. The plot itself is drawn as a side effect.
genBaseRPlot <- function(data) {
  # Namespace-qualified like every other call in this file, so the helper
  # does not rely on the graphics package being attached
  graphics::barplot(
    data$Avg_Lines,
    names.arg = data$File_Type,
    main = "Average Number of Lines per File by File Type",
    xlab = "File Type",
    ylab = "Average Number of Lines",
    col = "skyblue"
  )
}

#' Generate Lattice Bar Plot
#'
#' Builds a lattice bar chart of `Avg_Lines` against `File_Type`.
#'
#' @param data Dataframe with the columns `File_Type` and `Avg_Lines`.
#' @return lattice plot object.
genLatticePlot <- function(data) {
  # Averages on the y axis, one sky-blue bar per file type
  lattice::barchart(
    Avg_Lines ~ File_Type,
    data = data,
    main = "Average Number of Lines per File by File Type",
    xlab = "File Type",
    ylab = "Average Number of Lines",
    col = "skyblue"
  )
}

#' Bar Plot of Average Lines Per File by File Type
#'
#' Computes the per-file-type averages with `avgLinesPerFile()` and hands
#' them to the helper that draws with the requested plotting library.
#'
#' @param dirPath Character string specifying the path to the directory.
#' @param fileTypes Optional character vector specifying which file types to consider.
#'   If NULL (the default), all file types are considered.
#' @param plotLibrary Character string specifying which plotting library to use.
#'   One of "ggplot", "echarts4r", "base", or "lattice". When omitted, the
#'   first option ("ggplot") is used.
#' @return A plot object for ggplot, echarts4r, and lattice. Base R plots are
#'   drawn directly on the current device and the value of the base helper
#'   is returned.
#'
#' @examples
#' \dontrun{
#' plotAvgLinesPerFile("/path/to/folder", c("txt", "csv", "R"), "lattice")
#' }
#'
#' @export
plotAvgLinesPerFile <- function(dirPath, fileTypes = NULL, plotLibrary = c("ggplot", "echarts4r", "base", "lattice")) {
  # Validate the requested library before doing any file work
  plotLibrary <- match.arg(plotLibrary)

  # Calculate average lines per file by file type
  avgLines <- avgLinesPerFile(dirPath, fileTypes)

  # Map each library name to its helper. The list stores the functions
  # themselves, not their results: calling all of them up front would draw
  # the base R plot on the device even when another library was requested,
  # and would need every plotting package to be installed.
  plotFunc <- list(
    ggplot = genGgplot,
    echarts4r = genEcharts,
    base = genBaseRPlot,
    lattice = genLatticePlot
  )

  # Build only the requested plot
  plot <- plotFunc[[plotLibrary]](avgLines)

  plot
}

plotAvgLinesPerFile('example_files', plotLibrary='base') |> print()
##      [,1]
## [1,]  0.7
## [2,]  1.9
## [3,]  3.1

plotAvgLinesPerFile('example_files', plotLibrary='lattice') |> print()