From 95243c4c714d0b70cde906529860184e6e2cdc96 Mon Sep 17 00:00:00 2001 From: Abhirupa Ghosh <100681585+AbhirupaGhosh@users.noreply.github.com> Date: Thu, 19 Feb 2026 08:54:16 -0700 Subject: [PATCH 1/4] Split cleanData and add Parquet exports Rename the original cleanData to cleanMetaData and add roxygen skeleton. Introduce a writeCompressedParquet helper and export cleaned metadata, AMR phenotype, genome data and original metadata to compressed Parquet files, then create a separate DuckDB (parquet-backed) with views for metadata, amr_phenotype, genome_data and original_metadata. Reintroduce a new cleanData function focused on feature matrices (genes/proteins/domains/etc.) that writes feature tables to Parquet and creates corresponding views; remove duplicated metadata parquet exports from the feature-matrix flow. Minor whitespace and path-handling adjustments to normalize paths and ensure output directories exist. --- R/data_processing.R | 112 +++++++++++++++++++++++++++++++++----------- 1 file changed, 85 insertions(+), 27 deletions(-) diff --git a/R/data_processing.R b/R/data_processing.R index 78ab404..c417e52 100644 --- a/R/data_processing.R +++ b/R/data_processing.R @@ -1223,7 +1223,16 @@ domainFromIPR <- function(duckdb_path, } # Clean BV-BRC metadata, then save as Parquet files -cleanData <- function(duckdb_path, path, ref_file_path = "data_raw/"){ +#' +#' @param duckdb_path +#' @param path +#' @param ref_file_path +#' +#' @returns +#' @export +#' +#' @examples +cleanMetaData <- function(duckdb_path, path, ref_file_path = "data_raw/"){ duckdb_path <- normalizePath(duckdb_path) # If no explicit path is provided (or a generic one), choose results// when # the DuckDB lives under data//, or else fall back to the DuckDB directory. 
@@ -1237,14 +1246,14 @@ cleanData <- function(duckdb_path, path, ref_file_path = "data_raw/"){ ) path <- if (!identical(mapped_results, bug_dir)) mapped_results else bug_dir } - + path <- normalizePath(path, mustWork = FALSE) if (!dir.exists(path)) dir.create(path, recursive = TRUE) - + con <- DBI::dbConnect(duckdb::duckdb(), duckdb_path) on.exit(try(DBI::dbDisconnect(con, shutdown = FALSE), silent = TRUE), add = TRUE) ref_file_path <- normalizePath(ref_file_path) - + clean_drug <- readr::read_tsv(file.path(ref_file_path, "clean_drug.tsv")) drug_class <- readr::read_tsv(file.path(ref_file_path, "drug_class.tsv")) drug_abbr <- readr::read_tsv(file.path(ref_file_path, "drug_abbr.tsv")) @@ -1252,7 +1261,7 @@ cleanData <- function(duckdb_path, path, ref_file_path = "data_raw/"){ clean_countries <- readr::read_tsv(file.path(ref_file_path, "cleaned_bvbrc_countries.tsv")) |> dplyr::select("raw_entry", "clean_name", "short_name")|> dplyr::distinct() - + dplyr::tbl(con, "filtered") |> tibble::as_tibble() |> dplyr::select("genome_drug.genome_id", "genome_drug.antibiotic", @@ -1267,7 +1276,7 @@ cleanData <- function(duckdb_path, path, ref_file_path = "data_raw/"){ dplyr::left_join(drug_abbr, by = c("cleaned_drug" = "drug")) |> dplyr::left_join(class_abbr, by = "drug_class") |> DBI::dbWriteTable(conn = con, name = "filtered", overwrite = TRUE) - + resistance_summary <- dplyr::tbl(con, "filtered") |> tibble::as_tibble() |> dplyr::filter(genome_drug.resistant_phenotype == "Resistant") |> @@ -1276,8 +1285,9 @@ cleanData <- function(duckdb_path, path, ref_file_path = "data_raw/"){ num_resistant_classes = dplyr::n_distinct(drug_class), resistant_classes = paste(unique(class_abbr), collapse = "_") ) - + year_breaks <- seq(1980, 2023, by = 5) + dplyr::tbl(con, "filtered") |> tibble::as_tibble() |> dplyr::mutate(genome_drug.antibiotic = cleaned_drug) |> @@ -1301,6 +1311,74 @@ cleanData <- function(duckdb_path, path, ref_file_path = "data_raw/"){ labels = 
paste(year_breaks[-length(year_breaks)], year_breaks[-1] - 1, sep = "-"))) |> DBI::dbWriteTable(conn = con, name = "cleaned_metadata", overwrite = TRUE) + + # Parquet output path + metadata_parquet <- file.path(path, "metadata.parquet") # cleaned_metadata exported as 'metadata' + + # Also export AMR/genome/original metadata + amr_phenotype_parquet <- file.path(path, "amr_phenotype.parquet") + genome_data_parquet <- file.path(path, "genome_data.parquet") + original_metadata_parquet <- file.path(path, "original_metadata.parquet") + + writeCompressedParquet <- function(df, path) { + arrow::write_parquet( + df, + path, + compression = "zstd", + compression_level = 9, + use_dictionary = TRUE + ) + } + + db_name <- duckdb_path |> stringr::str_split_i(".duckdb", i = 1) |> paste0("_parquet.duckdb") + con_new <- DBI::dbConnect(duckdb::duckdb(), db_name) + on.exit(try(DBI::dbDisconnect(con_new, shutdown = FALSE), silent = TRUE), add = TRUE) + + # cleaned_metadata -> parquet + view (as metadata) + DBI::dbReadTable(con, "cleaned_metadata") |> writeCompressedParquet(metadata_parquet) + DBI::dbExecute(con_new, sprintf("CREATE OR REPLACE VIEW metadata AS SELECT * FROM read_parquet('%s')", metadata_parquet)) + + # debug/complete views: amr_phenotype, genome_data, original_metadata + DBI::dbReadTable(con, "amr_phenotype") |> writeCompressedParquet(amr_phenotype_parquet) + DBI::dbReadTable(con, "genome_data") |> writeCompressedParquet(genome_data_parquet) + DBI::dbReadTable(con, "metadata") |> writeCompressedParquet(original_metadata_parquet) + + DBI::dbExecute(con_new, sprintf("CREATE OR REPLACE VIEW amr_phenotype AS SELECT * FROM read_parquet('%s')", amr_phenotype_parquet)) + DBI::dbExecute(con_new, sprintf("CREATE OR REPLACE VIEW genome_data AS SELECT * FROM read_parquet('%s')", genome_data_parquet)) + DBI::dbExecute(con_new, sprintf("CREATE OR REPLACE VIEW original_metadata AS SELECT * FROM read_parquet('%s')", original_metadata_parquet)) + + invisible(TRUE) +} + +# Clean 
feature matrices, then save as Parquet files +#' +#' @param duckdb_path +#' @param path +#' +#' @returns +#' @export +#' +#' @examples +cleanData <- function(duckdb_path, path){ + duckdb_path <- normalizePath(duckdb_path) + # If no explicit path is provided (or a generic one), choose results// when + # the DuckDB lives under data//, or else fall back to the DuckDB directory. + if (missing(path) || path %in% c(".", "results", "results/")) { + bug_dir <- dirname(duckdb_path) + mapped_results <- sub( + paste0(.Platform$file.sep, "data", .Platform$file.sep), + paste0(.Platform$file.sep, "results", .Platform$file.sep), + bug_dir, + fixed = TRUE + ) + path <- if (!identical(mapped_results, bug_dir)) mapped_results else bug_dir + } + + path <- normalizePath(path, mustWork = FALSE) + if (!dir.exists(path)) dir.create(path, recursive = TRUE) + + con <- DBI::dbConnect(duckdb::duckdb(), duckdb_path) + on.exit(try(DBI::dbDisconnect(con, shutdown = FALSE), silent = TRUE), add = TRUE) # Parquet output paths genes_parquet <- file.path(path, "gene_count.parquet") @@ -1312,18 +1390,11 @@ cleanData <- function(duckdb_path, path, ref_file_path = "data_raw/"){ proteins_parquet <- file.path(path, "protein_count.parquet") domains_parquet <- file.path(path, "domain_count.parquet") - metadata_parquet <- file.path(path, "metadata.parquet") # cleaned_metadata exported as 'metadata' - domain_names_parquet <- file.path(path, "domain_names.parquet") protein_names_parquet <- file.path(path, "protein_names.parquet") protein_cluster_seq_parquet <- file.path(path, "protein_seqs.parquet") - # Also export AMR/genome/original metadata - amr_phenotype_parquet <- file.path(path, "amr_phenotype.parquet") - genome_data_parquet <- file.path(path, "genome_data.parquet") - original_metadata_parquet <- file.path(path, "original_metadata.parquet") - writeCompressedParquet <- function(df, path) { arrow::write_parquet( df, @@ -1370,10 +1441,6 @@ cleanData <- function(duckdb_path, path, ref_file_path = 
"data_raw/"){ writeCompressedParquet(struct_parquet) DBI::dbExecute(con_new, sprintf("CREATE OR REPLACE VIEW struct AS SELECT * FROM read_parquet('%s')", struct_parquet)) - # cleaned_metadata -> parquet + view (as metadata) - DBI::dbReadTable(con, "cleaned_metadata") |> writeCompressedParquet(metadata_parquet) - DBI::dbExecute(con_new, sprintf("CREATE OR REPLACE VIEW metadata AS SELECT * FROM read_parquet('%s')", metadata_parquet)) - # names/seq tables -> parquet + views DBI::dbReadTable(con, "gene_names") |> writeCompressedParquet(gene_names_parquet) DBI::dbExecute(con_new, sprintf("CREATE OR REPLACE VIEW gene_names AS SELECT * FROM read_parquet('%s')", gene_names_parquet)) @@ -1397,15 +1464,6 @@ cleanData <- function(duckdb_path, path, ref_file_path = "data_raw/"){ DBI::dbReadTable(con, "genome_gene_protein") |> writeCompressedParquet(genome_gene_protein_parquet) DBI::dbExecute(con_new, sprintf("CREATE OR REPLACE VIEW genome_gene_protein AS SELECT * FROM read_parquet('%s')", genome_gene_protein_parquet)) - # debug/complete views: amr_phenotype, genome_data, original_metadata - DBI::dbReadTable(con, "amr_phenotype") |> writeCompressedParquet(amr_phenotype_parquet) - DBI::dbReadTable(con, "genome_data") |> writeCompressedParquet(genome_data_parquet) - DBI::dbReadTable(con, "metadata") |> writeCompressedParquet(original_metadata_parquet) - - DBI::dbExecute(con_new, sprintf("CREATE OR REPLACE VIEW amr_phenotype AS SELECT * FROM read_parquet('%s')", amr_phenotype_parquet)) - DBI::dbExecute(con_new, sprintf("CREATE OR REPLACE VIEW genome_data AS SELECT * FROM read_parquet('%s')", genome_data_parquet)) - DBI::dbExecute(con_new, sprintf("CREATE OR REPLACE VIEW original_metadata AS SELECT * FROM read_parquet('%s')", original_metadata_parquet)) - invisible(TRUE) } From c52aaa78a50e4ae956aaa541b6c988554c18f506 Mon Sep 17 00:00:00 2001 From: AbhirupaGhosh Date: Thu, 19 Feb 2026 15:56:27 +0000 Subject: [PATCH 2/4] Style code (GHA) --- R/data_curation.R | 428 
++++++++++++++++++++++++++------------------ R/data_processing.R | 418 +++++++++++++++++++++++------------------- vignettes/intro.Rmd | 123 +++++++------ 3 files changed, 549 insertions(+), 420 deletions(-) diff --git a/R/data_curation.R b/R/data_curation.R index b1f1af5..0d19113 100644 --- a/R/data_curation.R +++ b/R/data_curation.R @@ -45,8 +45,10 @@ } # Parse - df <- utils::read.table(text = raw_data, sep = "\t", header = TRUE, fill = TRUE, - quote = "", check.names = FALSE, comment.char = "") + df <- utils::read.table( + text = raw_data, sep = "\t", header = TRUE, fill = TRUE, + quote = "", check.names = FALSE, comment.char = "" + ) df <- tibble::as_tibble(df) |> dplyr::mutate(dplyr::across(dplyr::everything(), ~ iconv(.x, from = "", to = "UTF-8", sub = ""))) @@ -61,7 +63,7 @@ # Make sure the BV-BRC metadata live where they're supposed to .ensure_bvbrc_cache <- function(base_dir = ".", verbose = TRUE, - cache_rel = file.path("data", "bvbrc", "bvbrcData.duckdb"), + cache_rel = file.path("data", "bvbrc", "bvbrcData.duckdb"), cache_table = "bvbrc_bac_data") { base_dir <- normalizePath(base_dir, mustWork = FALSE) cache_db <- file.path(base_dir, cache_rel) @@ -115,12 +117,12 @@ base_dir <- normalizePath(base_dir, mustWork = FALSE) data_dir <- file.path(base_dir, "data") bvbrc_dir <- file.path(data_dir, "bvbrc") - logs_dir <- file.path(data_dir, "logs") + logs_dir <- file.path(data_dir, "logs") dir.create(bvbrc_dir, recursive = TRUE, showWarnings = FALSE) - dir.create(logs_dir, recursive = TRUE, showWarnings = FALSE) + dir.create(logs_dir, recursive = TRUE, showWarnings = FALSE) - db_path <- file.path(bvbrc_dir, "bvbrcData.duckdb") + db_path <- file.path(bvbrc_dir, "bvbrcData.duckdb") table_name <- "bvbrc_bac_data" meta_table <- "__meta" @@ -128,10 +130,10 @@ DBI::dbExecute( con, - glue::glue('CREATE TABLE IF NOT EXISTS {meta_table} ( + glue::glue("CREATE TABLE IF NOT EXISTS {meta_table} ( table_name TEXT PRIMARY KEY, last_updated TIMESTAMP - )') + )") ) # Tiny 
update check helper @@ -139,12 +141,14 @@ res <- tryCatch( DBI::dbGetQuery( con, - glue::glue('SELECT last_updated FROM {meta_table} - WHERE table_name = {DBI::dbQuoteString(con, table_name)}') + glue::glue("SELECT last_updated FROM {meta_table} + WHERE table_name = {DBI::dbQuoteString(con, table_name)}") ), error = function(e) NULL ) - if (is.null(res) || nrow(res) == 0L) return(NA) + if (is.null(res) || nrow(res) == 0L) { + return(NA) + } as.POSIXct(res$last_updated[[1]], origin = "1970-01-01", tz = "UTC") } @@ -268,7 +272,6 @@ } - #' Resolve `query value` from `user_bacs` for [.getGenomeIDs()] #' #' If query_value is NULL, derive it from user_bacs based on query_type. @@ -277,7 +280,9 @@ #' If nothing suitable is found, throw a fit and an error. #' @keywords internal .resolveQueryValue <- function(query_type, query_value, user_bacs) { - if (!is.null(query_value) && nzchar(query_value)) return(query_value) + if (!is.null(query_value) && nzchar(query_value)) { + return(query_value) + } if (missing(user_bacs) || length(user_bacs) == 0) { stop("Provide query_value or user_bacs for the selected query_type.") } @@ -359,7 +364,7 @@ stringr::str_replace_all("\\s+", "_") |> stringr::str_replace_all("[^A-Za-z0-9._-]", "") - db_dir <- file.path(data_dir, bug_dirname) + db_dir <- file.path(data_dir, bug_dirname) dir.create(db_dir, recursive = TRUE, showWarnings = FALSE) db_file <- paste0(.generateDBname(user_bacs), ".duckdb") @@ -397,14 +402,13 @@ overwrite = FALSE, image = "danylmb/bvbrc:5.3", verbose = TRUE) { - - query_type <- match.arg(query_type) + query_type <- match.arg(query_type) query_value <- .resolveQueryValue(query_type, query_value, user_bacs) - + if (isTRUE(verbose)) { message("Querying BV-BRC: ", query_type, " == ", query_value) } - + # Count count_cmd <- paste0( "docker run --rm ", image, @@ -414,10 +418,10 @@ " --in genome_status,WGS,Complete", " --count" ) - count_lines <- tryCatch(system(count_cmd, intern = TRUE), error = function(e) character()) + 
count_lines <- tryCatch(system(count_cmd, intern = TRUE), error = function(e) character()) count_result <- suppressWarnings(as.integer(if (length(count_lines) >= 2) count_lines[2] else NA_integer_)) if (isTRUE(verbose) && !is.na(count_result)) message("Count returned: ", count_result) - + # Details data_cmd <- paste0( "docker run --rm ", image, @@ -429,7 +433,7 @@ ) data_raw <- tryCatch(system(data_cmd, intern = TRUE), error = function(e) character()) if (length(data_raw) == 0L) stop("BV-BRC returned no data for: ", query_type, " = ", query_value) - + data_result <- tibble::as_tibble( utils::read.table( text = data_raw, sep = "\t", header = TRUE, fill = TRUE, @@ -443,16 +447,16 @@ `genome.species` = as.character(`genome.species`), `genome.strain` = as.character(`genome.strain`) ) - + # Per-bug DB path - paths <- .buildDBpath(base_dir = base_dir, user_bacs = user_bacs, overwrite = overwrite) + paths <- .buildDBpath(base_dir = base_dir, user_bacs = user_bacs, overwrite = overwrite) db_path <- paths$db_path - + con <- DBI::dbConnect(duckdb::duckdb(), dbdir = db_path) DBI::dbWriteTable(con, "bac_data", data_result, overwrite = TRUE) - + if (isTRUE(verbose)) message("Wrote table 'bac_data' to: ", db_path) - + list(count_result = count_result, duckdbConnection = con, table_name = "bac_data") } @@ -474,28 +478,28 @@ overwrite = FALSE, verbose = TRUE) { base_dir <- normalizePath(base_dir, mustWork = FALSE) - + if (isTRUE(verbose)) message("Resolving input taxa.") bac_input_data <- .retrieveCustomQuery(base_dir = base_dir, user_bacs = user_bacs) - + if (is.null(bac_input_data) || nrow(bac_input_data) == 0) { message("No valid input provided or no matches found.") return(NULL) } - + # Resolve per-bug DB path - paths <- .buildDBpath(base_dir = base_dir, user_bacs = user_bacs, overwrite = overwrite) + paths <- .buildDBpath(base_dir = base_dir, user_bacs = user_bacs, overwrite = overwrite) db_path <- paths$db_path - - bac_data <- tibble::tibble() + + bac_data <- 
tibble::tibble() taxon_ids <- unique(bac_input_data$genome.taxon_id) - + if (isTRUE(verbose)) message("Querying BV-BRC for ", length(taxon_ids), " taxon IDs.") - + for (i in seq_along(taxon_ids)) { tax <- taxon_ids[i] if (isTRUE(verbose)) message("Taxon ", i, "/", length(taxon_ids), ": ", tax) - + res <- .getGenomeIDs( base_dir = base_dir, query_type = "taxon_id", @@ -504,22 +508,22 @@ overwrite = TRUE, verbose = verbose ) - + con <- res$duckdbConnection tbl <- res$table_name each_bac_data <- tibble::as_tibble(DBI::dbReadTable(con, tbl)) bac_data <- dplyr::bind_rows(bac_data, each_bac_data) - + # Close per-iteration connection to avoid open handles piling up try(DBI::dbDisconnect(con, shutdown = TRUE), silent = TRUE) } - + if (nrow(bac_data) > 0) { genome_ids <- bac_data |> dplyr::distinct(`genome.genome_id`) |> dplyr::pull(`genome.genome_id`) genome_ids <- genome_ids[!is.na(genome_ids)] - + if (length(genome_ids) > 0) { if (isTRUE(verbose)) message("Collected ", length(genome_ids), " distinct genome IDs.") return(genome_ids) @@ -578,18 +582,18 @@ verbose = TRUE) { base_dir <- normalizePath(base_dir, mustWork = FALSE) data_dir <- file.path(base_dir, "data") - tmp_dir <- file.path(data_dir, "tmp") + tmp_dir <- file.path(data_dir, "tmp") dir.create(tmp_dir, recursive = TRUE, showWarnings = FALSE) - + if (isTRUE(verbose)) { message("Preparing AMR query input for ", length(batch_genome_IDs), " genomes.") } - + docker_path <- Sys.which("docker") if (!nzchar(docker_path)) { stop("Docker is not available on your PATH but is required.") } - + # Generate genome list with p3-echo echo_args <- c( "run", "--rm", @@ -599,12 +603,12 @@ genome_ids_output <- suppressWarnings( system2("docker", args = echo_args, stdout = TRUE, stderr = TRUE) ) - + # Write a temporary file in data/tmp/ tmp_in <- tempfile(tmpdir = tmp_dir, pattern = "genome_drug_ids_", fileext = ".tsv") writeLines(genome_ids_output, con = tmp_in) tmp_in_mounted <- file.path("/data", "tmp", basename(tmp_in)) - + # 
Allow abx_filter to be a single string with spaces OR a vector of args abx_args <- if (length(abx_filter) == 1L) { @@ -612,7 +616,7 @@ } else { abx_filter } - + # Query drug data drug_args <- c( "run", "--rm", @@ -622,15 +626,15 @@ abx_args, "--attr", drug_fields ) - + if (isTRUE(verbose)) { message("Running AMR query.") } drug_data <- suppressWarnings(system2("docker", args = drug_args, stdout = TRUE, stderr = TRUE)) - + # Clean up after yourself try(unlink(tmp_in), silent = TRUE) - + return(drug_data) } @@ -655,18 +659,18 @@ verbose = TRUE) { base_dir <- normalizePath(base_dir, mustWork = FALSE) data_dir <- file.path(base_dir, "data") - tmp_dir <- file.path(data_dir, "tmp") + tmp_dir <- file.path(data_dir, "tmp") dir.create(tmp_dir, recursive = TRUE, showWarnings = FALSE) - + if (isTRUE(verbose)) { message("Preparing genome metadata input for ", length(batch_genome_IDs), " genomes.") } - + docker_path <- Sys.which("docker") if (!nzchar(docker_path)) { stop("Docker is not available on your PATH but is required.") } - + # Generate genome list with p3-echo echo_args <- c( "run", "--rm", @@ -676,14 +680,14 @@ genome_ids_output <- suppressWarnings( system2("docker", args = echo_args, stdout = TRUE, stderr = TRUE) ) - + tmp_in <- tempfile(tmpdir = tmp_dir, pattern = "genome_ids_", fileext = ".tsv") writeLines(genome_ids_output, con = tmp_in) tmp_in_mounted <- file.path("/data", "tmp", basename(tmp_in)) - + # Choose attributes (AMR for this pipeline) chosen_fields <- if (identical(filter_type, "AMR")) amr_fields else microtrait_fields - + get_args <- c( "run", "--rm", "-v", paste0(data_dir, ":/data"), @@ -691,15 +695,15 @@ "--input", tmp_in_mounted, "--attr", chosen_fields ) - + if (isTRUE(verbose)) { message("Running genome metadata query.") } genome_data <- suppressWarnings(system2("docker", args = get_args, stdout = TRUE, stderr = TRUE)) - + # Cleaning up try(unlink(tmp_in), silent = TRUE) - + return(genome_data) } @@ -735,8 +739,10 @@ retrieveMetadata <- 
function(user_bacs, base_dir <- normalizePath(base_dir, mustWork = FALSE) if (isTRUE(verbose)) message("Resolving genome IDs for user inputs.") - genome_ids <- .retrieveQueryIDs(base_dir = base_dir, user_bacs = user_bacs, - overwrite = overwrite, verbose = verbose) + genome_ids <- .retrieveQueryIDs( + base_dir = base_dir, user_bacs = user_bacs, + overwrite = overwrite, verbose = verbose + ) if (length(genome_ids) == 0) { message("No genome IDs available for the specified inputs.") return(NULL) @@ -753,8 +759,11 @@ retrieveMetadata <- function(user_bacs, "pmid,resistant_phenotype,", "source,taxon_id,testing_standard" ) - abx_filter <- if (identical(abx, "All")) "--required antibiotic" - else paste0("--in antibiotic,", paste(abx, collapse = ",")) + abx_filter <- if (identical(abx, "All")) { + "--required antibiotic" + } else { + paste0("--in antibiotic,", paste(abx, collapse = ",")) + } amr_fields <- paste0( "assembly_accession,assembly_method,", "bioproject_accession,biosample_accession,", @@ -808,7 +817,10 @@ retrieveMetadata <- function(user_bacs, ), envir = environment() ) - parallel::clusterEvalQ(cluster, { library(tibble); library(dplyr) }) + parallel::clusterEvalQ(cluster, { + library(tibble) + library(dplyr) + }) if (isTRUE(verbose)) message("Retrieving AMR phenotype data in batches.") batch_drug_data <- parallel::parLapply(cluster, genome_batches, function(batch) { @@ -822,7 +834,10 @@ retrieveMetadata <- function(user_bacs, ) }) combined_drug_data <- unlist(batch_drug_data, use.names = FALSE) - if (length(combined_drug_data) == 0) { message("No drug data returned."); return(NULL) } + if (length(combined_drug_data) == 0) { + message("No drug data returned.") + return(NULL) + } combined_drug_data_tbl <- tibble::as_tibble( utils::read.table( @@ -847,7 +862,10 @@ retrieveMetadata <- function(user_bacs, ) }) combined_genome_data <- unlist(batch_genome_data, use.names = FALSE) - if (length(combined_genome_data) == 0) { message("No genome data returned."); 
return(NULL) } + if (length(combined_genome_data) == 0) { + message("No genome data returned.") + return(NULL) + } combined_genome_data_tbl <- tibble::as_tibble( utils::read.table( @@ -860,13 +878,14 @@ retrieveMetadata <- function(user_bacs, dplyr::mutate(`genome.genome_id` = as.character(`genome.genome_id`)) # Per-bug DB path - paths <- .buildDBpath(base_dir = base_dir, user_bacs = user_bacs, overwrite = overwrite) + paths <- .buildDBpath(base_dir = base_dir, user_bacs = user_bacs, overwrite = overwrite) db_path <- paths$db_path logs_dir <- file.path(base_dir, "data", "logs") dir.create(logs_dir, recursive = TRUE, showWarnings = FALSE) cat(sprintf("[%s] Writing metadata DuckDB: %s\n", Sys.time(), db_path), - file = file.path(logs_dir, "bvbrc.log"), append = TRUE) + file = file.path(logs_dir, "bvbrc.log"), append = TRUE + ) con <- DBI::dbConnect(duckdb::duckdb(), dbdir = db_path) DBI::dbWriteTable(con, "amr_phenotype", combined_drug_data_tbl, overwrite = TRUE) @@ -892,10 +911,14 @@ retrieveMetadata <- function(user_bacs, # FASTA sanitizer to ensure Panaroo compatibility with BV-BRC CLI downloads .strip_fasta_preamble <- function(fna_path) { - if (!file.exists(fna_path)) return(invisible(FALSE)) + if (!file.exists(fna_path)) { + return(invisible(FALSE)) + } txt <- readLines(fna_path, warn = FALSE) first <- which(grepl("^\\s*>", txt))[1] - if (is.na(first)) return(invisible(FALSE)) + if (is.na(first)) { + return(invisible(FALSE)) + } if (first > 1L) { txt <- txt[first:length(txt)] txt[1] <- sub("^\\ufeff", "", txt[1]) @@ -907,14 +930,20 @@ retrieveMetadata <- function(user_bacs, # GFF sanitizer to ensure Panaroo compatibility with BV-BRC CLI downloads .sanitize_gff <- function(gff_path) { - if (!file.exists(gff_path)) return(invisible(FALSE)) + if (!file.exists(gff_path)) { + return(invisible(FALSE)) + } lines <- readLines(gff_path, warn = FALSE) - if (length(lines) == 0L) return(invisible(FALSE)) + if (length(lines) == 0L) { + return(invisible(FALSE)) + } if 
(!grepl("^##gff-version\\s*3", lines[1])) { lines <- c("##gff-version 3", lines) } out <- vapply(lines, function(line) { - if (grepl("^#", line)) return(line) + if (grepl("^#", line)) { + return(line) + } parts <- strsplit(line, "[\t ]", perl = TRUE)[[1]] if (length(parts) >= 9) { paste(c(parts[1:8], paste(parts[9:length(parts)], collapse = " ")), collapse = "\t") @@ -943,16 +972,19 @@ retrieveMetadata <- function(user_bacs, verbose = TRUE, fallback_to_bvbrc_cache = TRUE) { base_dir <- normalizePath(base_dir, mustWork = FALSE) - paths <- .buildDBpath(base_dir = base_dir, user_bacs = user_bacs) - db_path <- paths$db_path - + paths <- .buildDBpath(base_dir = base_dir, user_bacs = user_bacs) + db_path <- paths$db_path + con <- DBI::dbConnect(duckdb::duckdb(), dbdir = db_path) - - on.exit({ - # Keep connection open for caller - NULL - }, add = TRUE) - + + on.exit( + { + # Keep connection open for caller + NULL + }, + add = TRUE + ) + # Good metadata present? Apply AMR filters if ("metadata" %in% DBI::dbListTables(con)) { if (isTRUE(verbose)) message("Loading metadata for filtering.") @@ -962,7 +994,7 @@ retrieveMetadata <- function(user_bacs, message("No data available in 'metadata'.") return(NULL) } - + # Normalize evidence labels initial_metadata <- tibble::as_tibble(initial_metadata) |> dplyr::mutate( @@ -973,13 +1005,13 @@ retrieveMetadata <- function(user_bacs, TRUE ~ `genome_drug.evidence` ) ) - + # AMR and quality filtering filtered_metadata <- initial_metadata |> dplyr::filter(`genome_drug.evidence` == "Laboratory Method") |> dplyr::filter(`genome.genome_quality` == "Good") |> dplyr::filter(`genome_drug.resistant_phenotype` %in% c("Resistant", "Susceptible", "Intermediate")) - + DBI::dbWriteTable(con, "filtered", filtered_metadata, overwrite = TRUE) if (isTRUE(verbose)) { message("Post-filter rows: ", nrow(filtered_metadata)) @@ -987,42 +1019,42 @@ retrieveMetadata <- function(user_bacs, } return(list(duckdbConnection = con, table_name = "filtered")) } - + # 
Attempt BV-BRC cache location if (!isTRUE(fallback_to_bvbrc_cache)) { DBI::dbDisconnect(con, shutdown = TRUE) stop("No 'metadata' table found in ", db_path, ". Run retrieveMetadata() first.") } - + if (isTRUE(verbose)) { message("No 'metadata' in per-selection DB. Falling back to BV-BRC cache at data/bvbrc/.") } - + cache_db <- file.path(base_dir, "data", "bvbrc", "bvbrcData.duckdb") if (!file.exists(cache_db)) { DBI::dbDisconnect(con, shutdown = TRUE) stop("BV-BRC cache not found at: ", cache_db, ". Run .updateBVBRCdata() first.") } - + con_cache <- DBI::dbConnect(duckdb::duckdb(), dbdir = cache_db) on.exit(try(DBI::dbDisconnect(con_cache, shutdown = TRUE), silent = TRUE), add = TRUE) - + if (!"bvbrc_bac_data" %in% DBI::dbListTables(con_cache)) { DBI::dbDisconnect(con, shutdown = TRUE) stop("Table 'bvbrc_bac_data' not found in BV-BRC cache: ", cache_db) } - + bv <- tibble::as_tibble(DBI::dbReadTable(con_cache, "bvbrc_bac_data")) - - + + # Derive matches from user_bacs (taxon IDs or species) sel <- tibble::tibble(`genome.genome_id` = character()) - + for (v in user_bacs) { if (suppressWarnings(!is.na(as.numeric(v)))) { # numeric taxon_id match matches <- bv[bv$`genome.taxon_id` == v | - bv$`genome.taxon_id` == as.numeric(v), , drop = FALSE] + bv$`genome.taxon_id` == as.numeric(v), , drop = FALSE] } else { # species substring (case-insensitive) matches <- bv[stringr::str_detect( @@ -1030,7 +1062,7 @@ retrieveMetadata <- function(user_bacs, stringr::fixed(v, ignore_case = TRUE) ), , drop = FALSE] } - + if (nrow(matches)) { sel <- dplyr::bind_rows( sel, @@ -1041,22 +1073,22 @@ retrieveMetadata <- function(user_bacs, } } sel <- dplyr::distinct(sel) - + if (nrow(sel) == 0L) { DBI::dbDisconnect(con, shutdown = TRUE) stop("No genomes matched user_bacs in BV-BRC cache.") } - + # Minimal 'filtered' for downstream steps (downloaders & genomeList) minimal_filtered <- sel # Ensure expected column name used downstream: # retrieveGenomes() reads "filtered" and expects 
`genome.genome_id` to exist. DBI::dbWriteTable(con, "filtered", minimal_filtered, overwrite = TRUE) - + if (isTRUE(verbose)) { message("Wrote table 'filtered' to: ", db_path) } - + list(duckdbConnection = con, table_name = "filtered") } @@ -1068,9 +1100,12 @@ retrieveMetadata <- function(user_bacs, # Prefer bash .pick_shell <- function(image) { chk <- suppressWarnings(system2("docker", - c("run", "--rm", image, "sh", "-lc", - "command -v bash >/dev/null || echo NOBASH"), - stdout = TRUE, stderr = TRUE)) + c( + "run", "--rm", image, "sh", "-lc", + "command -v bash >/dev/null || echo NOBASH" + ), + stdout = TRUE, stderr = TRUE + )) if (length(chk) && any(grepl("NOBASH", chk))) "sh" else "bash" } @@ -1117,21 +1152,28 @@ retrieveMetadata <- function(user_bacs, .ftp_download_one <- function(genomeID, out_dir, tries = 3L, min_bytes = 100) { files <- c(".fna", ".PATRIC.faa", ".PATRIC.gff") dests <- file.path(out_dir, paste0(genomeID, files)) - if (.is_complete_set(out_dir, genomeID, min_bytes)) return(TRUE) + if (.is_complete_set(out_dir, genomeID, min_bytes)) { + return(TRUE) + } get_one <- function(url, dest) { if (nzchar(Sys.which("wget"))) { res <- suppressWarnings(system2("wget", - c("-q", "-O", shQuote(dest), shQuote(url)), - stdout = TRUE, stderr = TRUE)) - st <- attr(res, "status"); if (is.null(st)) st <- 0L + c("-q", "-O", shQuote(dest), shQuote(url)), + stdout = TRUE, stderr = TRUE + )) + st <- attr(res, "status") + if (is.null(st)) st <- 0L return(st == 0L && file.exists(dest) && file.info(dest)$size > min_bytes) } else if (nzchar(Sys.which("curl"))) { - curl_args <- if (startsWith(url, "ftps://")) - c("--ssl-reqd", "-L", "-o", shQuote(dest), shQuote(url)) else - c("-L", "-o", shQuote(dest), shQuote(url)) + curl_args <- if (startsWith(url, "ftps://")) { + c("--ssl-reqd", "-L", "-o", shQuote(dest), shQuote(url)) + } else { + c("-L", "-o", shQuote(dest), shQuote(url)) + } res <- suppressWarnings(system2("curl", curl_args, stdout = TRUE, stderr = TRUE)) - st <- 
attr(res, "status"); if (is.null(st)) st <- 0L + st <- attr(res, "status") + if (is.null(st)) st <- 0L return(st == 0L && file.exists(dest) && file.info(dest)$size > min_bytes) } FALSE @@ -1140,13 +1182,27 @@ retrieveMetadata <- function(user_bacs, for (ext_i in seq_along(files)) { dest <- dests[ext_i] if (file.exists(dest) && file.info(dest)$size > min_bytes) next - ftps <- sprintf("ftps://ftp.bv-brc.org/genomes/%s/%s%s", genomeID, genomeID, files[ext_i]) + ftps <- sprintf("ftps://ftp.bv-brc.org/genomes/%s/%s%s", genomeID, genomeID, files[ext_i]) https <- sprintf("https://ftp.bv-brc.org/genomes/%s/%s%s", genomeID, genomeID, files[ext_i]) ok <- FALSE - for (k in 1:tries) { if (get_one(ftps, dest)) { ok <- TRUE; break } } - if (!ok) for (k in 1:2) { if (get_one(https, dest)) { ok <- TRUE; break } } - if (!ok) return(FALSE) + for (k in 1:tries) { + if (get_one(ftps, dest)) { + ok <- TRUE + break + } + } + if (!ok) { + for (k in 1:2) { + if (get_one(https, dest)) { + ok <- TRUE + break + } + } + } + if (!ok) { + return(FALSE) + } } .is_complete_set(out_dir, genomeID, min_bytes) } @@ -1155,7 +1211,7 @@ retrieveMetadata <- function(user_bacs, # p3-dump-genomes to fetch FASTA and .gto files .cli_dump_fastas_gto_chunk <- function(image, out_dir, genome_ids, tag, tries = 3L) { - out_dir <- normalizePath(out_dir, mustWork = FALSE) + out_dir <- normalizePath(out_dir, mustWork = FALSE) dir.create(out_dir, recursive = TRUE, showWarnings = FALSE) ids_file <- file.path(out_dir, paste0("ids_", tag, ".txt")) writeLines(genome_ids, ids_file) @@ -1164,12 +1220,15 @@ retrieveMetadata <- function(user_bacs, mount <- .docker_path(out_dir) # Safety against Windows-specific CRLF lines before `p3-dump-genomes` - sh_cmd <- sprintf('tr -d "\\r" < /out/%s | p3-dump-genomes --outDir /out --fasta --prot --gto -', - basename(ids_file)) + sh_cmd <- sprintf( + 'tr -d "\\r" < /out/%s | p3-dump-genomes --outDir /out --fasta --prot --gto -', + basename(ids_file) + ) - args <- c("run", "--rm", "-v", 
paste0(mount, ":/out"), image, shell, "-lc", shQuote(sh_cmd)) + args <- c("run", "--rm", "-v", paste0(mount, ":/out"), image, shell, "-lc", shQuote(sh_cmd)) res <- suppressWarnings(system2("docker", args = args, stdout = TRUE, stderr = TRUE)) - st <- attr(res, "status"); if (is.null(st)) st <- 0L + st <- attr(res, "status") + if (is.null(st)) st <- 0L if (st != 0L && tries > 1L) { Sys.sleep(1) @@ -1197,57 +1256,57 @@ retrieveMetadata <- function(user_bacs, # Export GFF from GTO per genomes in each chunk .cli_export_gff_chunk <- function(image, out_dir, genome_ids, tag, tries = 3L) { - out_dir <- normalizePath(out_dir, mustWork = FALSE) + out_dir <- normalizePath(out_dir, mustWork = FALSE) ids_file <- file.path(out_dir, paste0("ids_", tag, ".txt")) writeLines(genome_ids, ids_file) - exporter <- "/usr/bin/rast-export-genome" - shell <- .pick_shell(image) - mount <- .docker_path(out_dir) + exporter <- "/usr/bin/rast-export-genome" + shell <- .pick_shell(image) + mount <- .docker_path(out_dir) stderr_file <- file.path(out_dir, paste0("gff_chunk_", tag, ".stderr.txt")) # GFFs from BV-BRC .gto export are not directly compatible in Panaroo # This block reformats the contig IDs and ensures .gff/.fna pairs work together sh_cmd <- paste0( - 'set -euo pipefail; ', - 'fail_n=0; : > /out/', basename(stderr_file), '; ', + "set -euo pipefail; ", + "fail_n=0; : > /out/", basename(stderr_file), "; ", 'while IFS= read -r b || [ -n "$b" ]; do ', ' b=${b%$\'\\r\'}; [ -n "$b" ] || continue; ', ' gto="/out/${b}.gto"; gff="/out/${b}.PATRIC.gff"; map="/out/${b}.orig2id.tsv"; ', - ' if [ ! -s "$gto" ]; then echo "MISSING_GTO $b" >>/out/', basename(stderr_file), '; continue; fi; ', + ' if [ ! 
-s "$gto" ]; then echo "MISSING_GTO $b" >>/out/', basename(stderr_file), "; continue; fi; ", # Export GFF ' [ -s "$gff" ] || ', exporter, ' -i "$gto" -o "$gff" gff ', - ' || { echo "EXPORT_FAIL_GFF $b" >>/out/', basename(stderr_file), '; fail_n=$((fail_n+1)); continue; }; ', + ' || { echo "EXPORT_FAIL_GFF $b" >>/out/', basename(stderr_file), "; fail_n=$((fail_n+1)); continue; }; ", # Build mapping original_id -> id, and default to id if original_id missing - ' if command -v jq >/dev/null 2>&1; then ', + " if command -v jq >/dev/null 2>&1; then ", ' jq -r \'.contigs[] | [(.original_id // .id), .id] | @tsv\' "$gto" > "$map"; ', - ' else ', + " else ", ' python3 - "$gto" > "$map" <<\'PY\'\n', - 'import sys, json\n', - 'g = json.load(open(sys.argv[1]))\n', + "import sys, json\n", + "g = json.load(open(sys.argv[1]))\n", 'for c in g.get("contigs", []):\n', ' o = c.get("original_id") or c.get("id")\n', ' i = c.get("id")\n', - ' if o and i:\n', + " if o and i:\n", ' print(f"{o}\\t{i}")\n', - 'PY\n', - ' fi; ', + "PY\n", + " fi; ", # Relabel GFF sequence IDs: original_id -> id - ' awk \'FNR==NR{m[$1]=$2; next} ', - ' /^##sequence-region/ { if ($2 in m) {$2=m[$2]} print; next } ', - ' /^#/ { print; next } ', + " awk 'FNR==NR{m[$1]=$2; next} ", + " /^##sequence-region/ { if ($2 in m) {$2=m[$2]} print; next } ", + " /^#/ { print; next } ", ' { if ($1 in m) $1=m[$1]; print }\' "$map" "$gff" > "${gff}.tmp" && mv "${gff}.tmp" "$gff"; ', - - 'done < /out/', basename(ids_file), '; ', - 'exit 0' + "done < /out/", basename(ids_file), "; ", + "exit 0" ) args <- c("run", "--rm", "-v", paste0(mount, ":/out"), image, shell, "-lc", shQuote(sh_cmd)) - res <- suppressWarnings(system2("docker", args = args, stdout = TRUE, stderr = TRUE)) - st <- attr(res, "status"); if (is.null(st)) st <- 0L + res <- suppressWarnings(system2("docker", args = args, stdout = TRUE, stderr = TRUE)) + st <- attr(res, "status") + if (is.null(st)) st <- 0L if (st != 0L && tries > 1L) { Sys.sleep(1) @@ -1290,14 
+1349,15 @@ retrieveGenomes <- function(base_dir = ".", cli_gff_workers = 4L, chunk_size = 50L, verbose = TRUE) { - - method <- match.arg(method) + method <- match.arg(method) base_dir <- normalizePath(base_dir, mustWork = FALSE) # IDs from .filterGenomes() if (isTRUE(verbose)) message("Filtering genomes before download.") f_out <- .filterGenomes(base_dir = base_dir, user_bacs = user_bacs, verbose = verbose) - if (is.null(f_out)) return(character()) + if (is.null(f_out)) { + return(character()) + } con <- f_out$duckdbConnection tbl <- f_out$table_name @@ -1306,12 +1366,12 @@ retrieveGenomes <- function(base_dir = ".", dplyr::pull(`genome.genome_id`) # Set up the paths and such - paths <- .buildDBpath(base_dir = base_dir, user_bacs = user_bacs) - bug_dir <- dirname(paths$db_path) + paths <- .buildDBpath(base_dir = base_dir, user_bacs = user_bacs) + bug_dir <- dirname(paths$db_path) genome_path <- file.path(bug_dir, "genomes") - logs_dir <- file.path(base_dir, "data", "logs") + logs_dir <- file.path(base_dir, "data", "logs") dir.create(genome_path, recursive = TRUE, showWarnings = FALSE) - dir.create(logs_dir, recursive = TRUE, showWarnings = FALSE) + dir.create(logs_dir, recursive = TRUE, showWarnings = FALSE) # Adding support for resuming a download if (isTRUE(skip_existing)) { @@ -1322,10 +1382,12 @@ retrieveGenomes <- function(base_dir = ".", if (length(ids) == 0L) { if (isTRUE(verbose)) message("All genomes already complete.") - all_complete <- .list_complete(genome_path, - tibble::as_tibble(DBI::dbReadTable(con, tbl)) |> - dplyr::distinct(`genome.genome_id`) |> - dplyr::pull(`genome.genome_id`)) + all_complete <- .list_complete( + genome_path, + tibble::as_tibble(DBI::dbReadTable(con, tbl)) |> + dplyr::distinct(`genome.genome_id`) |> + dplyr::pull(`genome.genome_id`) + ) return(all_complete) } @@ -1334,18 +1396,23 @@ retrieveGenomes <- function(base_dir = ".", if (isTRUE(verbose)) message("Trying FTPS download. 
Workers=", ftp_workers) future::plan(future::multisession, workers = max(1, ftp_workers)) ft_ok <- future.apply::future_lapply(ids, function(gid) .ftp_download_one(gid, genome_path), - future.seed = TRUE) + future.seed = TRUE + ) ok_ids <- ids[unlist(ft_ok)] if (isTRUE(verbose)) message("Complete file sets for ", length(ok_ids), " genomes (FTP).") return(c(ok_ids, .list_complete(genome_path, setdiff(ids, ok_ids)))) } # CLI for FASTA, FAA, and GTO, then GFF from GTO - chunks <- split(ids, ceiling(seq_along(ids)/chunk_size)) + chunks <- split(ids, ceiling(seq_along(ids) / chunk_size)) # Parallel chunk containers - if (isTRUE(verbose)) message("CLI being run in parallel for ", length(chunks), - " data chunks.") + if (isTRUE(verbose)) { + message( + "CLI being run in parallel for ", length(chunks), + " data chunks." + ) + } future::plan(future::multisession, workers = max(1, cli_fasta_workers)) fa_res <- future.apply::future_mapply( FUN = function(vec, tag) .cli_dump_fastas_gto_chunk(image, genome_path, vec, tag), @@ -1355,8 +1422,12 @@ retrieveGenomes <- function(base_dir = ".", if (!all(fa_res) && isTRUE(verbose)) warning(sum(!fa_res), " data chunks failed.") # GFF extraction in parallel containers - if (isTRUE(verbose)) message("GFF extraction being run in parallel for ", - length(chunks), " data chunks.") + if (isTRUE(verbose)) { + message( + "GFF extraction being run in parallel for ", + length(chunks), " data chunks." 
+ ) + } future::plan(future::multisession, workers = max(1, cli_gff_workers)) g_res <- future.apply::future_mapply( FUN = function(vec, tag) .cli_export_gff_chunk(image, genome_path, vec, tag), @@ -1367,8 +1438,12 @@ retrieveGenomes <- function(base_dir = ".", # Success set: .fna + .PATRIC.faa + .PATRIC.gff all present per isolate ok_ids <- ids[vapply(ids, .is_complete_set, logical(1), dir = genome_path)] - if (isTRUE(verbose)) message("Complete file sets downloaded for ", - length(ok_ids), " genomes.") + if (isTRUE(verbose)) { + message( + "Complete file sets downloaded for ", + length(ok_ids), " genomes." + ) + } ok_ids } @@ -1387,15 +1462,14 @@ retrieveGenomes <- function(base_dir = ".", genomeList <- function(base_dir = ".", user_bacs, verbose = TRUE) { - base_dir <- normalizePath(base_dir, mustWork = FALSE) - paths <- .buildDBpath(base_dir = base_dir, user_bacs = user_bacs) - db_path <- paths$db_path - bug_dir <- dirname(db_path) + paths <- .buildDBpath(base_dir = base_dir, user_bacs = user_bacs) + db_path <- paths$db_path + bug_dir <- dirname(db_path) genome_path <- file.path(bug_dir, "genomes") - files_all <- list.files(genome_path, full.names = TRUE) - files_all <- files_all[file.info(files_all)$size > 100] + files_all <- list.files(genome_path, full.names = TRUE) + files_all <- files_all[file.info(files_all)$size > 100] # Separate by type gff_files <- files_all[grepl("\\.PATRIC\\.gff$", files_all)] @@ -1414,14 +1488,18 @@ genomeList <- function(base_dir = ".", faa_path <- file.path(genome_path, paste0(genomeID, ".PATRIC.faa")) data.frame( - genome_id = genomeID, - gff_path = if (file.exists(gff_path) && file.info(gff_path)$size > 100) gff_path else NA, - fna_path = if (file.exists(fna_path) && file.info(fna_path)$size > 100) fna_path else NA, - faa_path = if (file.exists(faa_path) && file.info(faa_path)$size > 100) faa_path else NA, + genome_id = genomeID, + gff_path = if (file.exists(gff_path) && file.info(gff_path)$size > 100) gff_path else NA, + 
fna_path = if (file.exists(fna_path) && file.info(fna_path)$size > 100) fna_path else NA, + faa_path = if (file.exists(faa_path) && file.info(faa_path)$size > 100) faa_path else NA, panaroo_input = if ( file.exists(gff_path) && file.exists(fna_path) && file.exists(faa_path) && - file.info(gff_path)$size > 100 && file.info(fna_path)$size > 100 && file.info(faa_path)$size > 100 - ) paste(gff_path, fna_path) else NA, + file.info(gff_path)$size > 100 && file.info(fna_path)$size > 100 && file.info(faa_path)$size > 100 + ) { + paste(gff_path, fna_path) + } else { + NA + }, stringsAsFactors = FALSE ) }) @@ -1480,7 +1558,7 @@ prepareGenomes <- function(user_bacs, method = c("ftp", "cli"), overwrite = FALSE, verbose = TRUE) { - method <- match.arg(method) + method <- match.arg(method) base_dir <- normalizePath(base_dir, mustWork = FALSE) # Ensure the BV-BRC metadata cache exists, fetch if missing diff --git a/R/data_processing.R b/R/data_processing.R index c417e52..01b47b8 100644 --- a/R/data_processing.R +++ b/R/data_processing.R @@ -81,7 +81,7 @@ # Convert to container-visible paths genome_filepath_cont <- .to_container(genome_filepath_host, host_root = mount_host, container_root = mount_cont) - output_dir_cont <- .to_container(output_dir_host, host_root = mount_host, container_root = mount_cont) + output_dir_cont <- .to_container(output_dir_host, host_root = mount_host, container_root = mount_cont) # Run Panaroo in Docker cmd_args <- c( @@ -99,14 +99,17 @@ "--remove-invalid-genes", "--core_threshold", as.character(core_threshold), "--len_dif_percent", as.character(len_dif_percent), - "--threshold", as.character(cluster_threshold), - "-f", as.character(family_seq_identity), - "-t", as.character(panaroo_threads_per_job) + "--threshold", as.character(cluster_threshold), + "-f", as.character(family_seq_identity), + "-t", as.character(panaroo_threads_per_job) ) - res <- tryCatch({ - system2("docker", args = cmd_args, stdout = TRUE, stderr = TRUE) - }, error = function(e) e) + 
res <- tryCatch( + { + system2("docker", args = cmd_args, stdout = TRUE, stderr = TRUE) + }, + error = function(e) e + ) if (inherits(res, "error")) { stop(sprintf("Docker/Panaroo failed to launch: %s", res$message)) @@ -179,7 +182,6 @@ family_seq_identity = 0.5, threads = 8, split_jobs = FALSE) { - duckdb_path <- normalizePath(duckdb_path) con <- DBI::dbConnect(duckdb::duckdb(), duckdb_path) on.exit(try(DBI::dbDisconnect(con, shutdown = FALSE), silent = TRUE), add = TRUE) @@ -213,7 +215,7 @@ filtered_panaroo_input <- sapply(split_files[unlist(valid_entries)], paste, collapse = " ") total_lines <- length(filtered_panaroo_input) - batch_size <- if (isTRUE(split_jobs)) ceiling(total_lines / 5) else total_lines + batch_size <- if (isTRUE(split_jobs)) ceiling(total_lines / 5) else total_lines panaroo_batches <- split(filtered_panaroo_input, ceiling(seq_along(filtered_panaroo_input) / batch_size)) n_jobs <- length(panaroo_batches) @@ -265,8 +267,7 @@ len_dif_percent = 0.95, cluster_threshold = 0.95, family_seq_identity = 0.5, - threads = 8){ - + threads = 8) { input_path <- .docker_path(input_path) # Fail fast if Docker is missing @@ -302,9 +303,9 @@ "--merge_paralogs", "--core_threshold", as.character(core_threshold), "--len_dif_percent", as.character(len_dif_percent), - "--threshold", as.character(cluster_threshold), - "-f", as.character(family_seq_identity), - "-t", as.character(threads) + "--threshold", as.character(cluster_threshold), + "-f", as.character(family_seq_identity), + "-t", as.character(threads) ) system2("docker", args = cmd_args, stdout = TRUE, stderr = TRUE) @@ -314,7 +315,6 @@ } - #' Load Panaroo gene presence/absence table into DuckDB #' #' Reads `gene_presence_absence.csv` and constructs a genome-by-gene count @@ -326,13 +326,13 @@ #' @return A tibble containing the gene count matrix. 
#' #' @keywords internal -.panaroo2geneTable <- function(panaroo_output_path, duckdb_path){ +.panaroo2geneTable <- function(panaroo_output_path, duckdb_path) { filepath <- file.path(normalizePath(panaroo_output_path), "gene_presence_absence.csv") duckdb_path <- normalizePath(duckdb_path) con <- DBI::dbConnect(duckdb::duckdb(), duckdb_path) on.exit(try(DBI::dbDisconnect(con, shutdown = FALSE), silent = TRUE), add = TRUE) - gene_count <- read.table(filepath, sep=",", header=TRUE, fill=TRUE, quote="") |> + gene_count <- read.table(filepath, sep = ",", header = TRUE, fill = TRUE, quote = "") |> tibble::as_tibble() |> dplyr::select(-c(Non.unique.Gene.name, Annotation)) |> tidyr::pivot_longer(cols = -1) |> @@ -356,13 +356,13 @@ #' @return A tibble with `Gene` and `Annotation` columns. #' #' @keywords internal -.panaroo2geneNames <- function(panaroo_output_path, duckdb_path){ +.panaroo2geneNames <- function(panaroo_output_path, duckdb_path) { filepath <- file.path(normalizePath(panaroo_output_path), "gene_presence_absence.csv") duckdb_path <- normalizePath(duckdb_path) con <- DBI::dbConnect(duckdb::duckdb(), duckdb_path) on.exit(try(DBI::dbDisconnect(con, shutdown = FALSE), silent = TRUE), add = TRUE) - gene_names <- read.table(filepath, sep=",", header=TRUE, fill=TRUE, quote="") |> + gene_names <- read.table(filepath, sep = ",", header = TRUE, fill = TRUE, quote = "") |> tibble::as_tibble() |> dplyr::select(c(Gene, Annotation)) @@ -381,15 +381,15 @@ #' @return A tibble containing the struct matrix. 
#' #' @keywords internal -.panaroo2StructTable <- function(panaroo_output_path, duckdb_path){ +.panaroo2StructTable <- function(panaroo_output_path, duckdb_path) { struct_filepath <- file.path(normalizePath(panaroo_output_path), "struct_presence_absence.Rtab") duckdb_path <- normalizePath(duckdb_path) con <- DBI::dbConnect(duckdb::duckdb(), duckdb_path) on.exit(try(DBI::dbDisconnect(con, shutdown = FALSE), silent = TRUE), add = TRUE) - gene_struct <- read.table(struct_filepath, sep="\t", header=TRUE, fill=TRUE, quote="") |> + gene_struct <- read.table(struct_filepath, sep = "\t", header = TRUE, fill = TRUE, quote = "") |> tibble::as_tibble() |> - tidyr::pivot_longer(cols= -1) |> + tidyr::pivot_longer(cols = -1) |> tidyr::pivot_wider(names_from = Gene, values_from = value) |> dplyr::rename("genome_id" = "name") |> dplyr::mutate(genome_id = stringr::str_replace_all(genome_id, c("^X" = "", "\\.PATRIC$" = ""))) @@ -409,7 +409,7 @@ #' @return Invisibly returns TRUE. #' #' @keywords internal -.panaroo2OtherTables <- function(panaroo_output_path, duckdb_path){ +.panaroo2OtherTables <- function(panaroo_output_path, duckdb_path) { panaroo_output_path <- normalizePath(panaroo_output_path) duckdb_path <- normalizePath(duckdb_path) fasta_filepath <- file.path(panaroo_output_path, "pan_genome_reference.fa") @@ -418,15 +418,19 @@ gene_fasta <- Biostrings::readDNAStringSet(filepath = fasta_filepath) DBI::dbWriteTable(con, "gene_ref_seq", - tibble::tibble(name = names(gene_fasta), - sequence = as.character(gene_fasta)), - overwrite = TRUE) + tibble::tibble( + name = names(gene_fasta), + sequence = as.character(gene_fasta) + ), + overwrite = TRUE + ) readr::read_csv(file.path(panaroo_output_path, "gene_presence_absence.csv")) |> dplyr::select(-`Non-unique Gene name`) |> tidyr::pivot_longer(-c("Gene", "Annotation"), - names_to = "genome_ids", - values_to = "protein_ids") |> + names_to = "genome_ids", + values_to = "protein_ids" + ) |> dplyr::mutate(genome_ids = gsub(".PATRIC", "", 
genome_ids)) |> dplyr::select(genome_ids, Gene, protein_ids) |> dplyr::distinct() |> @@ -447,9 +451,9 @@ #' @return Invisibly returns TRUE. #' #' @keywords internal -.panaroo2duckdb <- function(panaroo_output_path, duckdb_path){ +.panaroo2duckdb <- function(panaroo_output_path, duckdb_path) { panaroo_output_path <- normalizePath(panaroo_output_path) - duckdb_path <- normalizePath(duckdb_path) + duckdb_path <- normalizePath(duckdb_path) .panaroo2geneTable(panaroo_output_path, duckdb_path) .panaroo2geneNames(panaroo_output_path, duckdb_path) @@ -484,7 +488,6 @@ threads = 0, memory = 0, extra_args = c("-g", "1")) { - # Fail fast if Docker is missing if (!nzchar(Sys.which("docker"))) { stop("Docker is not available on your PATH but is required to run CD-HIT.") @@ -530,7 +533,7 @@ "weizhongli1987/cdhit:4.8.1", "cd-hit", "-i", .to_container(cdhit_input_faa, mount_host, mount_cont), - "-o", .to_container(clustered_faa, mount_host, mount_cont), + "-o", .to_container(clustered_faa, mount_host, mount_cont), "-c", as.character(identity), "-n", as.character(word_length), "-T", as.character(threads), @@ -540,19 +543,24 @@ ) message("Running cd-hit via Docker...") - output <- tryCatch({ - system2("docker", args = cmd_args, stdout = TRUE, stderr = TRUE) - }, error = function(e) { - stop("cd-hit execution failed: ", e$message) - }) + output <- tryCatch( + { + system2("docker", args = cmd_args, stdout = TRUE, stderr = TRUE) + }, + error = function(e) { + stop("cd-hit execution failed: ", e$message) + } + ) if (!file.exists(clustered_faa)) { stop("cd-hit failed: output file not found. 
Check stderr:\n", paste(output, collapse = "\n")) } # Ensure .clstr exists (used downstream) if (!file.exists(paste0(clustered_faa, ".clstr"))) { - stop("cd-hit did not produce the expected .clstr file at: ", paste0(clustered_faa, ".clstr"), - "\nFull output:\n", paste(output, collapse = "\n")) + stop( + "cd-hit did not produce the expected .clstr file at: ", paste0(clustered_faa, ".clstr"), + "\nFull output:\n", paste(output, collapse = "\n") + ) } message("cd-hit completed successfully.") @@ -672,14 +680,14 @@ runPanaroo2Duckdb <- function(duckdb_path, if (isTRUE(verbose)) message("Launching Panaroo.") .runPanaroo( - duckdb_path = duckdb_path, - output_path = out_dir, - core_threshold = core_threshold, - len_dif_percent = len_dif_percent, + duckdb_path = duckdb_path, + output_path = out_dir, + core_threshold = core_threshold, + len_dif_percent = len_dif_percent, cluster_threshold = cluster_threshold, family_seq_identity = family_seq_identity, - threads = threads, - split_jobs = split_jobs + threads = threads, + split_jobs = split_jobs ) # Identify Panaroo outputs that contain a final_graph.gml file @@ -730,8 +738,10 @@ runPanaroo2Duckdb <- function(duckdb_path, .parseProteinClusters <- function(clustered_faa) { clstr <- paste0(clustered_faa, ".clstr") if (!file.exists(clstr)) { - stop("CD-HIT cluster file not found: ", clstr, - "\nEnsure .runCDHIT() completed successfully and produced the .clstr file.") + stop( + "CD-HIT cluster file not found: ", clstr, + "\nEnsure .runCDHIT() completed successfully and produced the .clstr file." 
+ ) } lines <- data.table::fread(clstr, sep = "\n", header = FALSE)$V1 @@ -740,7 +750,7 @@ runPanaroo2Duckdb <- function(duckdb_path, for (i in seq_along(cluster_ids)) { start <- cluster_ids[i] + 1 - end <- if (i < length(cluster_ids)) cluster_ids[i + 1] - 1 else length(lines) + end <- if (i < length(cluster_ids)) cluster_ids[i + 1] - 1 else length(lines) cluster_lines <- lines[start:end] # This finds the reference cluster ID and names the cluster with it @@ -752,8 +762,10 @@ runPanaroo2Duckdb <- function(duckdb_path, } # Pull genome IDs - genome_matches <- stringr::str_match(cluster_lines, - "fig\\|([0-9]+\\.[0-9]+)\\.peg\\.[0-9]+")[, 2] + genome_matches <- stringr::str_match( + cluster_lines, + "fig\\|([0-9]+\\.[0-9]+)\\.peg\\.[0-9]+" + )[, 2] genome_matches <- genome_matches[!is.na(genome_matches)] if (length(genome_matches) > 0) { @@ -810,9 +822,9 @@ buildMatrices <- function(cluster_map) .buildProtMatrices(cluster_map) names_faa <- names(cdhit_output_faa) |> tibble::as_tibble() |> dplyr::mutate( - proteinID = stringr::str_extract(value, "^fig\\|[0-9]+\\.[0-9]+\\.peg\\.[0-9]+"), - locus_tag = stringr::str_match(value, "peg\\.[0-9]+\\|([^\\s]+)")[, 2], - proteinName= stringr::str_trim(stringr::str_match(value, "\\|[^\\s]+\\s+(.*?)\\s+\\[")[, 2]) + proteinID = stringr::str_extract(value, "^fig\\|[0-9]+\\.[0-9]+\\.peg\\.[0-9]+"), + locus_tag = stringr::str_match(value, "peg\\.[0-9]+\\|([^\\s]+)")[, 2], + proteinName = stringr::str_trim(stringr::str_match(value, "\\|[^\\s]+\\s+(.*?)\\s+\\[")[, 2]) ) |> dplyr::select(-value) @@ -828,47 +840,47 @@ CDHIT2duckdb <- function(duckdb_path, word_length = 5, threads = 0, memory = 0, - extra_args = c("-g", "1")){ - + extra_args = c("-g", "1")) { duckdb_path <- normalizePath(duckdb_path) con <- DBI::dbConnect(duckdb::duckdb(), duckdb_path) on.exit(try(DBI::dbDisconnect(con, shutdown = FALSE), silent = TRUE), add = TRUE) if (missing(output_path) || output_path %in% c(".", "results", "results/")) { - output_path <- 
dirname(duckdb_path) # e.g., ./results/ + output_path <- dirname(duckdb_path) # e.g., ./results/ } output_path <- normalizePath(output_path) cdhit_outputs <- .runCDHIT(duckdb_path, - output_path, - output_prefix = output_prefix, - identity = identity, - word_length = word_length, - threads = threads, - memory = memory, - extra_args = extra_args) - - cluster_map <- .parseProteinClusters(cdhit_outputs$clustered_faa) + output_path, + output_prefix = output_prefix, + identity = identity, + word_length = word_length, + threads = threads, + memory = memory, + extra_args = extra_args + ) + + cluster_map <- .parseProteinClusters(cdhit_outputs$clustered_faa) cluster_count <- .buildProtMatrices(cluster_map) DBI::dbWriteTable(con, "protein_count", cluster_count, overwrite = TRUE) cluster_fasta <- cdhit_outputs$cdhit_input_faa - cluster_name <- .clusterNames(cluster_map, cluster_fasta) + cluster_name <- .clusterNames(cluster_map, cluster_fasta) DBI::dbWriteTable(con, "protein_names", cluster_name, overwrite = TRUE) clustered_faa <- Biostrings::readAAStringSet(cdhit_outputs$clustered_faa) DBI::dbWriteTable(con, "protein_cluster_seq", - tibble::tibble( - name = names(clustered_faa) |> stringr::str_extract("fig\\|[0-9]+\\.[0-9]+\\.peg\\.[0-9]+"), - sequence = as.character(clustered_faa) - ), - overwrite = TRUE) + tibble::tibble( + name = names(clustered_faa) |> stringr::str_extract("fig\\|[0-9]+\\.[0-9]+\\.peg\\.[0-9]+"), + sequence = as.character(clustered_faa) + ), + overwrite = TRUE + ) invisible(TRUE) } - #' Check or install InterProScan data bundle #' #' Ensures that the InterProScan data directory exists locally, downloading @@ -885,12 +897,12 @@ CDHIT2duckdb <- function(duckdb_path, #' #' @keywords internal .checkInterProData <- function( - version = "5.76-107.0", - dest_dir = "inst/extdata/interpro", - docker_image = sprintf("interpro/interproscan:%s", version), - platform = "linux/amd64", - curl_bin = "curl", - verbose = TRUE + version = "5.76-107.0", + dest_dir = 
"inst/extdata/interpro", + docker_image = sprintf("interpro/interproscan:%s", version), + platform = "linux/amd64", + curl_bin = "curl", + verbose = TRUE ) { msg <- function(...) if (verbose) message(sprintf(...)) @@ -907,25 +919,29 @@ CDHIT2duckdb <- function(duckdb_path, } # Download bundle if needed - tar_url <- sprintf("http://ftp.ebi.ac.uk/pub/software/unix/iprscan/5/%s/alt/interproscan-data-%s.tar.gz", - version, version) - md5_url <- paste0(tar_url, ".md5") + tar_url <- sprintf( + "http://ftp.ebi.ac.uk/pub/software/unix/iprscan/5/%s/alt/interproscan-data-%s.tar.gz", + version, version + ) + md5_url <- paste0(tar_url, ".md5") tar_path <- file.path(dest_dir, basename(tar_url)) md5_path <- paste0(tar_path, ".md5") if (!file.exists(tar_path)) { msg("Downloading InterProScan data bundle.") - status_tar <- system2(curl_bin, c("-L", "-o", tar_path, tar_url)) - status_md5 <- system2(curl_bin, c("-L", "-o", md5_path, md5_url)) - if (status_tar != 0 || status_md5 != 0) + status_tar <- system2(curl_bin, c("-L", "-o", tar_path, tar_url)) + status_md5 <- system2(curl_bin, c("-L", "-o", md5_path, md5_url)) + if (status_tar != 0 || status_md5 != 0) { stop("Failed to download InterProScan data bundle.") + } } msg("Verifying MD5 checksum.") md5_expected <- sub("\\s+.*$", "", readLines(md5_path)[1]) - md5_actual <- tools::md5sum(tar_path)[[1]] - if (!identical(tolower(md5_expected), tolower(md5_actual))) + md5_actual <- tools::md5sum(tar_path)[[1]] + if (!identical(tolower(md5_expected), tolower(md5_actual))) { stop("MD5 checksum mismatch for InterProScan data bundle.") + } msg("Extracting InterProScan data bundle.") utils::untar(tar_path, exdir = dest_dir, tar = "internal") @@ -946,9 +962,11 @@ CDHIT2duckdb <- function(duckdb_path, #' #' @keywords internal .getDfIPRColNames <- function() { - c("AccNum", "SeqMD5Digest", "SLength", "Analysis", + c( + "AccNum", "SeqMD5Digest", "SLength", "Analysis", "DB.ID", "SignDesc", "StartLoc", "StopLoc", "Score", - "Status", "RunDate", 
"IPRAcc", "IPRDesc", "placeholder") + "Status", "RunDate", "IPRAcc", "IPRDesc", "placeholder" + ) } #' Internal helpers for reading InterProScan TSV outputs @@ -993,8 +1011,9 @@ CDHIT2duckdb <- function(duckdb_path, #' @keywords internal .readIPRscanTsv <- function(filepath) { readr::read_tsv(filepath, - col_types = .getDfIPRColTypes(), - col_names = .getDfIPRColNames()) + col_types = .getDfIPRColTypes(), + col_names = .getDfIPRColNames() + ) } @@ -1026,7 +1045,7 @@ CDHIT2duckdb <- function(duckdb_path, file_format, docker_image = sprintf("interpro/interproscan:%s", "5.76-107.0")) { # Normalize and mount paths - path <- .docker_path(path) + path <- .docker_path(path) bind_data <- .docker_path(ipr_data_path) dir.create(file.path(path, "tmp", "iprscan"), recursive = TRUE, showWarnings = FALSE) @@ -1050,7 +1069,7 @@ CDHIT2duckdb <- function(duckdb_path, "-v", paste0(bind_data, ":/opt/interproscan/data"), "-w", "/work", docker_image, - "--input", .to_container(temp_fasta_file, path, "/work"), + "--input", .to_container(temp_fasta_file, path, "/work"), "--cpu", as.character(threads), "-f", file_format, "--appl", appl_str, @@ -1058,31 +1077,34 @@ CDHIT2duckdb <- function(duckdb_path, ) - status <- tryCatch({ - system2( - "docker", - args = c( - "run", - "--rm", - "--platform", "linux/amd64", # force amd64 for ARM hosts - "-v", paste0(path, ":", "/work"), - "-v", paste0(bind_data, ":/opt/interproscan/data"), - "-w", "/work", - docker_image, - "--input", .to_container(temp_fasta_file, path, "/work"), - "--cpu", as.character(threads), - "-f", file_format, - "--appl", appl_str, - "-b", chunk_out_file_base_cont - ), - stdout = TRUE, - stderr = TRUE - ) - }, error = function(e) { - stop(sprintf("InterProScan execution failed for chunk %d: %s", chunk_id, e$message)) - }) + status <- tryCatch( + { + system2( + "docker", + args = c( + "run", + "--rm", + "--platform", "linux/amd64", # force amd64 for ARM hosts + "-v", paste0(path, ":", "/work"), + "-v", paste0(bind_data, 
":/opt/interproscan/data"), + "-w", "/work", + docker_image, + "--input", .to_container(temp_fasta_file, path, "/work"), + "--cpu", as.character(threads), + "-f", file_format, + "--appl", appl_str, + "-b", chunk_out_file_base_cont + ), + stdout = TRUE, + stderr = TRUE + ) + }, + error = function(e) { + stop(sprintf("InterProScan execution failed for chunk %d: %s", chunk_id, e$message)) + } + ) - out_tsv <- paste0(chunk_out_file_base_host, ".tsv") + out_tsv <- paste0(chunk_out_file_base_host, ".tsv") out_tsvgz <- paste0(chunk_out_file_base_host, ".tsv.gz") if (file.exists(out_tsv)) { @@ -1090,8 +1112,10 @@ CDHIT2duckdb <- function(duckdb_path, } else if (file.exists(out_tsvgz)) { return(out_tsvgz) } else { - stop(sprintf("InterProScan produced no output for chunk %d. Checked: %s and %s.\nLast message:\n%s", - chunk_id, out_tsv, out_tsvgz, paste(status, collapse = "\n"))) + stop(sprintf( + "InterProScan produced no output for chunk %d. Checked: %s and %s.\nLast message:\n%s", + chunk_id, out_tsv, out_tsvgz, paste(status, collapse = "\n") + )) } } @@ -1100,14 +1124,13 @@ domainFromIPR <- function(duckdb_path, path, out_file_base = "iprscan", appl = c("Pfam"), - ipr_version = "5.76-107.0", + ipr_version = "5.76-107.0", ipr_dest_dir = "inst/extdata/interpro", ipr_platform = "linux/amd64", auto_prepare_data = TRUE, threads = 8, file_format = "TSV", docker_repo = "interpro/interproscan") { - duckdb_path <- normalizePath(duckdb_path) if (missing(path) || path %in% c(".", "results", "results/")) { path <- dirname(duckdb_path) @@ -1126,8 +1149,10 @@ domainFromIPR <- function(duckdb_path, verbose = TRUE ) } else { - list(data_dir = file.path(ipr_dest_dir, sprintf("interproscan-%s", ipr_version), "data"), - ready = NA) + list( + data_dir = file.path(ipr_dest_dir, sprintf("interproscan-%s", ipr_version), "data"), + ready = NA + ) } ipr_data_path <- ipr_info$data_dir @@ -1138,11 +1163,12 @@ domainFromIPR <- function(duckdb_path, on.exit(try(DBI::dbDisconnect(con, shutdown = 
FALSE), silent = TRUE), add = TRUE) sequences_df <- dplyr::tbl(con, "protein_cluster_seq") |> tibble::as_tibble() - if (nrow(sequences_df) == 0L) + if (nrow(sequences_df) == 0L) { stop("No sequences found in 'protein_cluster_seq'. Please run CDHIT2duckdb() first.") + } # Chunking for parallel (not currently implemented due to memory limits) - chunks <- list(sequences_df) # Force 1 chunk for RAM limits + chunks <- list(sequences_df) # Force 1 chunk for RAM limits # Forcing 1 container operation for RAM limits workers <- 1 @@ -1179,24 +1205,28 @@ domainFromIPR <- function(duckdb_path, # Combine results tsvs <- Filter(function(x) !is.null(x) && file.exists(x), results) - if (length(tsvs) == 0L) + if (length(tsvs) == 0L) { stop("InterProScan produced no usable outputs. Check Docker logs above.") + } df_iprscan <- do.call(rbind, lapply(tsvs, .readIPRscanTsv)) # Load processed tables (unchanged) DBI::dbWriteTable(con, "domain_names", - df_iprscan |> - dplyr::select(AccNum, DB.ID, SignDesc, IPRAcc, IPRDesc, StartLoc, StopLoc), - overwrite = TRUE) + df_iprscan |> + dplyr::select(AccNum, DB.ID, SignDesc, IPRAcc, IPRDesc, StartLoc, StopLoc), + overwrite = TRUE + ) df_protein_domain_pa <- df_iprscan |> dplyr::select(AccNum, DB.ID, IPRAcc, placeholder) |> dplyr::mutate(domain_ID = stringr::str_glue("{DB.ID}_{IPRAcc}")) |> dplyr::distinct() |> dplyr::mutate(placeholder = stringr::str_replace_all(placeholder, "-", "1")) |> - tidyr::pivot_wider(id_cols = AccNum, names_from = domain_ID, values_from = placeholder, - values_fill = "0") |> + tidyr::pivot_wider( + id_cols = AccNum, names_from = domain_ID, values_from = placeholder, + values_fill = "0" + ) |> dplyr::group_by(AccNum) |> dplyr::summarize(across(everything(), ~ ifelse(any(. 
== "1"), "1", "0")), .groups = "drop") |> dplyr::mutate(across(-AccNum, as.numeric)) @@ -1204,8 +1234,9 @@ domainFromIPR <- function(duckdb_path, protein_filter <- dplyr::tbl(con, "protein_count") |> tibble::as_tibble() accs <- unique(df_protein_domain_pa$AccNum) accs_in_matrix <- intersect(accs, colnames(protein_filter)) - if (length(accs_in_matrix) == 0L) + if (length(accs_in_matrix) == 0L) { stop("No InterPro accessions match protein_count columns.") + } protein_filter <- protein_filter |> dplyr::select(genome_id, dplyr::all_of(accs_in_matrix)) df_protein_domain_pa <- df_protein_domain_pa |> @@ -1224,16 +1255,16 @@ domainFromIPR <- function(duckdb_path, # Clean BV-BRC metadata, then save as Parquet files #' -#' @param duckdb_path -#' @param path -#' @param ref_file_path +#' @param duckdb_path +#' @param path +#' @param ref_file_path #' #' @returns #' @export #' #' @examples -cleanMetaData <- function(duckdb_path, path, ref_file_path = "data_raw/"){ - duckdb_path <- normalizePath(duckdb_path) +cleanMetaData <- function(duckdb_path, path, ref_file_path = "data_raw/") { + duckdb_path <- normalizePath(duckdb_path) # If no explicit path is provided (or a generic one), choose results// when # the DuckDB lives under data//, or else fall back to the DuckDB directory. 
if (missing(path) || path %in% c(".", "results", "results/")) { @@ -1246,54 +1277,56 @@ cleanMetaData <- function(duckdb_path, path, ref_file_path = "data_raw/"){ ) path <- if (!identical(mapped_results, bug_dir)) mapped_results else bug_dir } - + path <- normalizePath(path, mustWork = FALSE) if (!dir.exists(path)) dir.create(path, recursive = TRUE) - + con <- DBI::dbConnect(duckdb::duckdb(), duckdb_path) on.exit(try(DBI::dbDisconnect(con, shutdown = FALSE), silent = TRUE), add = TRUE) ref_file_path <- normalizePath(ref_file_path) - + clean_drug <- readr::read_tsv(file.path(ref_file_path, "clean_drug.tsv")) drug_class <- readr::read_tsv(file.path(ref_file_path, "drug_class.tsv")) - drug_abbr <- readr::read_tsv(file.path(ref_file_path, "drug_abbr.tsv")) + drug_abbr <- readr::read_tsv(file.path(ref_file_path, "drug_abbr.tsv")) class_abbr <- readr::read_tsv(file.path(ref_file_path, "class_abbr.tsv")) clean_countries <- readr::read_tsv(file.path(ref_file_path, "cleaned_bvbrc_countries.tsv")) |> - dplyr::select("raw_entry", "clean_name", "short_name")|> + dplyr::select("raw_entry", "clean_name", "short_name") |> dplyr::distinct() - + dplyr::tbl(con, "filtered") |> tibble::as_tibble() |> - dplyr::select("genome_drug.genome_id", "genome_drug.antibiotic", - "genome_drug.genome_name", "genome_drug.laboratory_typing_method", - "genome_drug.resistant_phenotype", "genome_drug.taxon_id", - "genome_drug.pmid", "genome.collection_year", - "genome.isolation_country", "genome.host_common_name", - "genome.isolation_source", "genome.species") |> + dplyr::select( + "genome_drug.genome_id", "genome_drug.antibiotic", + "genome_drug.genome_name", "genome_drug.laboratory_typing_method", + "genome_drug.resistant_phenotype", "genome_drug.taxon_id", + "genome_drug.pmid", "genome.collection_year", + "genome.isolation_country", "genome.host_common_name", + "genome.isolation_source", "genome.species" + ) |> dplyr::left_join(clean_drug, by = c("genome_drug.antibiotic" = "original_drug")) |> 
dplyr::filter(!is.na(cleaned_drug)) |> dplyr::left_join(drug_class, by = c("cleaned_drug" = "drug")) |> dplyr::left_join(drug_abbr, by = c("cleaned_drug" = "drug")) |> dplyr::left_join(class_abbr, by = "drug_class") |> DBI::dbWriteTable(conn = con, name = "filtered", overwrite = TRUE) - + resistance_summary <- dplyr::tbl(con, "filtered") |> - tibble::as_tibble() |> + tibble::as_tibble() |> dplyr::filter(genome_drug.resistant_phenotype == "Resistant") |> dplyr::group_by(genome_drug.genome_id) |> dplyr::summarise( num_resistant_classes = dplyr::n_distinct(drug_class), resistant_classes = paste(unique(class_abbr), collapse = "_") ) - + year_breaks <- seq(1980, 2023, by = 5) - + dplyr::tbl(con, "filtered") |> tibble::as_tibble() |> dplyr::mutate(genome_drug.antibiotic = cleaned_drug) |> dplyr::select(-cleaned_drug) |> dplyr::left_join(clean_countries, by = c("genome.isolation_country" = "raw_entry")) |> - dplyr::rename("cleaned_country"="clean_name", "country_abbr"="short_name") |> + dplyr::rename("cleaned_country" = "clean_name", "country_abbr" = "short_name") |> dplyr::mutate(genome.isolation_country = cleaned_country) |> dplyr::select(-cleaned_country) |> dplyr::left_join(resistance_summary, by = "genome_drug.genome_id") |> @@ -1301,25 +1334,29 @@ cleanMetaData <- function(duckdb_path, path, ref_file_path = "data_raw/"){ is.na(resistant_classes) ~ genome_drug.resistant_phenotype, TRUE ~ resistant_classes )) |> - dplyr::mutate(num_resistant_classes= dplyr::case_when( + dplyr::mutate(num_resistant_classes = dplyr::case_when( is.na(num_resistant_classes) ~ 0, TRUE ~ num_resistant_classes )) |> dplyr::mutate(genome.collection_year = as.numeric(genome.collection_year)) |> - dplyr::mutate(year_bin = cut(genome.collection_year, breaks = year_breaks, - right = FALSE, include.lowest = TRUE, - labels = paste(year_breaks[-length(year_breaks)], - year_breaks[-1] - 1, sep = "-"))) |> + dplyr::mutate(year_bin = cut(genome.collection_year, + breaks = year_breaks, + right = FALSE, 
include.lowest = TRUE, + labels = paste(year_breaks[-length(year_breaks)], + year_breaks[-1] - 1, + sep = "-" + ) + )) |> DBI::dbWriteTable(conn = con, name = "cleaned_metadata", overwrite = TRUE) - + # Parquet output path - metadata_parquet <- file.path(path, "metadata.parquet") # cleaned_metadata exported as 'metadata' + metadata_parquet <- file.path(path, "metadata.parquet") # cleaned_metadata exported as 'metadata' # Also export AMR/genome/original metadata - amr_phenotype_parquet <- file.path(path, "amr_phenotype.parquet") - genome_data_parquet <- file.path(path, "genome_data.parquet") - original_metadata_parquet <- file.path(path, "original_metadata.parquet") - + amr_phenotype_parquet <- file.path(path, "amr_phenotype.parquet") + genome_data_parquet <- file.path(path, "genome_data.parquet") + original_metadata_parquet <- file.path(path, "original_metadata.parquet") + writeCompressedParquet <- function(df, path) { arrow::write_parquet( df, @@ -1329,38 +1366,40 @@ cleanMetaData <- function(duckdb_path, path, ref_file_path = "data_raw/"){ use_dictionary = TRUE ) } - - db_name <- duckdb_path |> stringr::str_split_i(".duckdb", i = 1) |> paste0("_parquet.duckdb") + + db_name <- duckdb_path |> + stringr::str_split_i(".duckdb", i = 1) |> + paste0("_parquet.duckdb") con_new <- DBI::dbConnect(duckdb::duckdb(), db_name) on.exit(try(DBI::dbDisconnect(con_new, shutdown = FALSE), silent = TRUE), add = TRUE) - + # cleaned_metadata -> parquet + view (as metadata) DBI::dbReadTable(con, "cleaned_metadata") |> writeCompressedParquet(metadata_parquet) DBI::dbExecute(con_new, sprintf("CREATE OR REPLACE VIEW metadata AS SELECT * FROM read_parquet('%s')", metadata_parquet)) - + # debug/complete views: amr_phenotype, genome_data, original_metadata DBI::dbReadTable(con, "amr_phenotype") |> writeCompressedParquet(amr_phenotype_parquet) - DBI::dbReadTable(con, "genome_data") |> writeCompressedParquet(genome_data_parquet) - DBI::dbReadTable(con, "metadata") |> 
writeCompressedParquet(original_metadata_parquet) - + DBI::dbReadTable(con, "genome_data") |> writeCompressedParquet(genome_data_parquet) + DBI::dbReadTable(con, "metadata") |> writeCompressedParquet(original_metadata_parquet) + DBI::dbExecute(con_new, sprintf("CREATE OR REPLACE VIEW amr_phenotype AS SELECT * FROM read_parquet('%s')", amr_phenotype_parquet)) DBI::dbExecute(con_new, sprintf("CREATE OR REPLACE VIEW genome_data AS SELECT * FROM read_parquet('%s')", genome_data_parquet)) DBI::dbExecute(con_new, sprintf("CREATE OR REPLACE VIEW original_metadata AS SELECT * FROM read_parquet('%s')", original_metadata_parquet)) - + invisible(TRUE) } # Clean feature matrices, then save as Parquet files #' -#' @param duckdb_path -#' @param path +#' @param duckdb_path +#' @param path #' #' @returns #' @export #' #' @examples -cleanData <- function(duckdb_path, path){ - duckdb_path <- normalizePath(duckdb_path) +cleanData <- function(duckdb_path, path) { + duckdb_path <- normalizePath(duckdb_path) # If no explicit path is provided (or a generic one), choose results// when # the DuckDB lives under data//, or else fall back to the DuckDB directory. 
if (missing(path) || path %in% c(".", "results", "results/")) { @@ -1381,19 +1420,19 @@ cleanData <- function(duckdb_path, path){ on.exit(try(DBI::dbDisconnect(con, shutdown = FALSE), silent = TRUE), add = TRUE) # Parquet output paths - genes_parquet <- file.path(path, "gene_count.parquet") - gene_names_parquet <- file.path(path, "gene_names.parquet") - gene_ref_seq_parquet <- file.path(path, "gene_seqs.parquet") - genome_gene_protein_parquet <- file.path(path, "genome_gene_protein.parquet") - struct_parquet <- file.path(path, "struct.parquet") + genes_parquet <- file.path(path, "gene_count.parquet") + gene_names_parquet <- file.path(path, "gene_names.parquet") + gene_ref_seq_parquet <- file.path(path, "gene_seqs.parquet") + genome_gene_protein_parquet <- file.path(path, "genome_gene_protein.parquet") + struct_parquet <- file.path(path, "struct.parquet") - proteins_parquet <- file.path(path, "protein_count.parquet") - domains_parquet <- file.path(path, "domain_count.parquet") + proteins_parquet <- file.path(path, "protein_count.parquet") + domains_parquet <- file.path(path, "domain_count.parquet") - domain_names_parquet <- file.path(path, "domain_names.parquet") - protein_names_parquet <- file.path(path, "protein_names.parquet") + domain_names_parquet <- file.path(path, "domain_names.parquet") + protein_names_parquet <- file.path(path, "protein_names.parquet") - protein_cluster_seq_parquet <- file.path(path, "protein_seqs.parquet") + protein_cluster_seq_parquet <- file.path(path, "protein_seqs.parquet") writeCompressedParquet <- function(df, path) { arrow::write_parquet( @@ -1405,7 +1444,9 @@ cleanData <- function(duckdb_path, path){ ) } - db_name <- duckdb_path |> stringr::str_split_i(".duckdb", i = 1) |> paste0("_parquet.duckdb") + db_name <- duckdb_path |> + stringr::str_split_i(".duckdb", i = 1) |> + paste0("_parquet.duckdb") con_new <- DBI::dbConnect(duckdb::duckdb(), db_name) on.exit(try(DBI::dbDisconnect(con_new, shutdown = FALSE), silent = TRUE), add = 
TRUE) @@ -1623,7 +1664,7 @@ runDataProcessing <- function(duckdb_path, cdhit_identity = 0.9, cdhit_word_length = 5, cdhit_memory = 0, - cdhit_extra_args = c("-g","1"), + cdhit_extra_args = c("-g", "1"), cdhit_output_prefix = "cdhit_out", # InterPro ipr_appl = c("Pfam"), @@ -1709,4 +1750,3 @@ runDataProcessing <- function(duckdb_path, parquet_duckdb_path = normalizePath(parquet_duckdb_path) )) } - diff --git a/vignettes/intro.Rmd b/vignettes/intro.Rmd index 59ccbc5..9d9a36f 100644 --- a/vignettes/intro.Rmd +++ b/vignettes/intro.Rmd @@ -31,13 +31,15 @@ For metadata curation with `amRdata`, use `retrieveMetadata()` to create a DuckD ```{r} # Download all AMR data for a species from BV-BRC -retrieveMetadata(user_bacs = "Shigella flexneri", - filter_type = "AMR", - base_dir = "../data/", - abx = "All", - overwrite = FALSE, - image = "danylmb/bvbrc:5.3", - verbose = FALSE) +retrieveMetadata( + user_bacs = "Shigella flexneri", + filter_type = "AMR", + base_dir = "../data/", + abx = "All", + overwrite = FALSE, + image = "danylmb/bvbrc:5.3", + verbose = FALSE +) ``` This wrote tables 'amr_phenotype', 'genome_data', and 'metadata' to a DuckDB @@ -45,16 +47,18 @@ This wrote tables 'amr_phenotype', 'genome_data', and 'metadata' to a DuckDB For downloading genomes with paired AMR phenotype data, use `retrieveGenomes()` ```{r } -retrieveGenomes(base_dir = "../data/", - user_bacs = "Shigella flexneri", - method = c("cli"), - image = "danylmb/bvbrc:5.3", - skip_existing = TRUE, - ftp_workers = 8L, - cli_fasta_workers = 4L, - cli_gff_workers = 4L, - chunk_size = 50L, - verbose = TRUE) +retrieveGenomes( + base_dir = "../data/", + user_bacs = "Shigella flexneri", + method = c("cli"), + image = "danylmb/bvbrc:5.3", + skip_existing = TRUE, + ftp_workers = 8L, + cli_fasta_workers = 4L, + cli_gff_workers = 4L, + chunk_size = 50L, + verbose = TRUE +) ``` This returns character vector of genome IDs and wrote complete file sets on disk. 
@@ -62,10 +66,11 @@ This returns character vector of genome IDs and wrote complete file sets on disk To write a .txt file listing downloaded genome filepaths (.fna, .faa, .gff) ```{r } -genomeList(base_dir = "../data/", - user_bacs = "Shigella flexneri", - verbose = TRUE) - +genomeList( + base_dir = "../data/", + user_bacs = "Shigella flexneri", + verbose = TRUE +) ``` #### A wrapper for downloading genomes and listing the paths @@ -77,11 +82,13 @@ Internally runs: Allows users to input a species or taxon ID and automate all data downloading and curation steps. ```{r } -prepareGenomes(user_bacs = "Shigella flexneri", - base_dir = "../data/", - method = c("cli"), - overwrite = FALSE, - verbose = TRUE) +prepareGenomes( + user_bacs = "Shigella flexneri", + base_dir = "../data/", + method = c("cli"), + overwrite = FALSE, + verbose = TRUE +) ``` ## Data Processing @@ -95,32 +102,34 @@ Internally runs: 4. `cleanData()` -\> Clean metadata and export the feature tables to Parquet + Parquet-backed DuckDB ```{r} -runDataProcessing(duckdb_path = "../data/Shigella_flexneri/Sfl.duckdb", - output_path = NULL, - # unified threads for all tools - threads = 16, - # Panaroo - panaroo_split_jobs = FALSE, - panaroo_core_threshold = 0.90, - panaroo_len_dif_percent = 0.95, - panaroo_cluster_threshold = 0.95, - panaroo_family_seq_identity = 0.5, - # CD-HIT - cdhit_identity = 0.9, - cdhit_word_length = 5, - cdhit_memory = 0, - cdhit_extra_args = c("-g","1"), - cdhit_output_prefix = "cdhit_out", - # InterPro - ipr_appl = c("Pfam"), - ipr_threads_unused = NULL, - ipr_version = "5.76-107.0", - ipr_dest_dir = "inst/extdata/interpro", - ipr_platform = "linux/amd64", - auto_prepare_data = TRUE, - # Metadata cleaning - ref_file_path = "../data_raw/", - verbose = TRUE) +runDataProcessing( + duckdb_path = "../data/Shigella_flexneri/Sfl.duckdb", + output_path = NULL, + # unified threads for all tools + threads = 16, + # Panaroo + panaroo_split_jobs = FALSE, + panaroo_core_threshold = 0.90, + 
panaroo_len_dif_percent = 0.95, + panaroo_cluster_threshold = 0.95, + panaroo_family_seq_identity = 0.5, + # CD-HIT + cdhit_identity = 0.9, + cdhit_word_length = 5, + cdhit_memory = 0, + cdhit_extra_args = c("-g", "1"), + cdhit_output_prefix = "cdhit_out", + # InterPro + ipr_appl = c("Pfam"), + ipr_threads_unused = NULL, + ipr_version = "5.76-107.0", + ipr_dest_dir = "inst/extdata/interpro", + ipr_platform = "linux/amd64", + auto_prepare_data = TRUE, + # Metadata cleaning + ref_file_path = "../data_raw/", + verbose = TRUE +) ``` ## Plots @@ -128,8 +137,10 @@ runDataProcessing(duckdb_path = "../data/Shigella_flexneri/Sfl.duckdb", Simple stats and plots to explore metadata ```{r } -generateSummary("data/metadata_parquet", - out_path = "data/") -generatePlots("data/metadata_parquet", - out_path = "data/") +generateSummary("data/metadata_parquet", + out_path = "data/" +) +generatePlots("data/metadata_parquet", + out_path = "data/" +) ``` From 4edc964d748d09cb7cbf911fd2200db44865c230 Mon Sep 17 00:00:00 2001 From: Evan Pierce Brenner <108823789+epbrenner@users.noreply.github.com> Date: Wed, 25 Feb 2026 15:15:23 -0700 Subject: [PATCH 3/4] Updating download logic Fixed trailing zero bug, fixed FTP timeout bug (?), fixed empty files hanging downloads, fixed imbalanced genome data sets (e.g., no .fna, yes .faa, yes .gff) --- R/data_curation.R | 816 +++++++++++++++++++++++++++++----------------- 1 file changed, 511 insertions(+), 305 deletions(-) diff --git a/R/data_curation.R b/R/data_curation.R index 0d19113..ac5095b 100644 --- a/R/data_curation.R +++ b/R/data_curation.R @@ -1,3 +1,166 @@ +#' Helps ensure trailing 0s are retained in genome IDs for proper downloading +#' @keywords internal +.id_checker <- function(x) { + # Taxon IDs are just numbers, genome IDs have decimals, this tells them apart + grepl("^[0-9]+$", x) +} + +#' Helps tag genomes with their AMR evidence for parsing +#' @keywords internal +.create_amr_tagged_view <- function(con) { + lab_methods <- c("Disk 
diffusion", "MIC", "Broth dilution", "Agar dilution") + lab_list_sql <- paste(DBI::dbQuoteString(con, lab_methods), collapse = ", ") + comp_str_sql <- DBI::dbQuoteString(con, "Computational Method") + + DBI::dbExecute( + con, + sprintf( + "CREATE OR REPLACE VIEW amr_phenotype_tagged AS + SELECT + *, + CASE WHEN \"genome_drug.laboratory_typing_method\" IN (%s) THEN 1 ELSE 0 END AS is_lab_row, + CASE WHEN + (\"genome_drug.evidence\" = %s) + OR (COALESCE(\"genome_drug.computational_method\", '') <> '') + OR (\"genome_drug.laboratory_typing_method\" = 'Computational Prediction') + THEN 1 ELSE 0 END AS is_comp_row + FROM amr_phenotype", + lab_list_sql, comp_str_sql + ) + ) +} + +#' Helps appropriately interface with BV-BRC FTPS server, and avoids getting stuck +#' when malformed files can hang an FTPS connection by introducing safeguards +#' @keywords internal +.ftpes_download_one <- function(genomeID, out_dir, + connect_timeout = 10L, + max_time = 30L, + speed_time = 30L, # end if speed_time + speed_limit = 2048L, # B/s + min_bytes = 100L) { + + dir.create(out_dir, recursive = TRUE, showWarnings = FALSE) + + exts <- c(".fna", ".PATRIC.faa", ".PATRIC.gff") + for (ext in exts) { + dest <- file.path(out_dir, paste0(genomeID, ext)) + dest_tmp <- paste0(dest, ".tmp") + + # Skip if we already completed these + if (file.exists(dest) && file.info(dest)$size > min_bytes) next + if (file.exists(dest) && file.info(dest)$size == 0) try(unlink(dest), silent = TRUE) + if (file.exists(dest_tmp)) try(unlink(dest_tmp), silent = TRUE) + + url <- sprintf("ftp://ftp.bv-brc.org/genomes/%s/%s%s", genomeID, genomeID, ext) + + args <- c( + "--fail", "--silent", "--show-error", "-L", + "--connect-timeout", as.character(connect_timeout), + "--max-time", as.character(max_time), + "--speed-time", as.character(speed_time), + "--speed-limit", as.character(speed_limit), + "--ftp-ssl", # AUTH TLS on port 21 works in testing + "--ftp-pasv", "--disable-epsv", "--ipv4", + "--user", "anonymous:", + "-o", 
shQuote(dest_tmp), shQuote(url) + ) + + res <- suppressWarnings(system2("curl", args = args, stdout = TRUE, stderr = TRUE)) + status <- attr(res, "status"); if (is.null(status)) status <- 0L + + # Avoiding 0B files cluttering up results -- atomic rename on successful tmp DL + ok <- (status == 0L && file.exists(dest_tmp) && file.info(dest_tmp)$size > min_bytes) + if (ok) { + if (!file.rename(dest_tmp, dest)) { + # If rename fails for some reason, copy + unlink + file.copy(dest_tmp, dest, overwrite = TRUE) + unlink(dest_tmp) + } + } else { + try(unlink(dest_tmp), silent = TRUE) + return(FALSE) # Abort early, don't accept partial set + } + } + + # Do we have all 3 present? + .is_complete_set(out_dir, genomeID, min_bytes = min_bytes) +} + +#' Helps manage FTPS downloading from BV-BRC, trying a quick download first, and +#' if that fails, trying a longer timeout 2nd pass at the end in case it was a +#' hiccup. If 2nd pass fails, log and give up on that file. +#' @keywords internal +.ftpes_download_two_pass <- function(genome_ids, out_dir, + workers_first = 4L, + workers_second = 4L, + log_file = NULL) { + genome_ids <- unique(as.character(genome_ids)) + if (!length(genome_ids)) return(character(0)) + + if (!is.null(log_file)) { + dir.create(dirname(log_file), recursive = TRUE, showWarnings = FALSE) + cat(sprintf("[%s] FTPS run start: %d genomes\n", Sys.time(), length(genome_ids)), + file = log_file, append = TRUE) + } + + # Pass 1: 30s per-file cap + message("FTPS pass 1 (30s timeout)") + future::plan(future::multisession, workers = max(1, workers_first)) + res1 <- future.apply::future_lapply( + genome_ids, + function(gid) { + ok <- .ftpes_download_one(gid, out_dir, + connect_timeout = 10L, max_time = 30L, + speed_time = 30L, speed_limit = 2048L) + list(gid = gid, ok = ok) + }, + future.seed = TRUE + ) + ok1 <- vapply(res1, `[[`, logical(1), "ok") + ok_ids_1 <- genome_ids[ok1] + fail_ids <- genome_ids[!ok1] + message(sprintf("Pass 1: ok=%d, fail=%d", length(ok_ids_1), 
length(fail_ids))) + if (!is.null(log_file)) { + cat(sprintf("[%s] Pass1 ok=%d fail=%d\n", Sys.time(), length(ok_ids_1), length(fail_ids)), + file = log_file, append = TRUE) + } + + if (!length(fail_ids)) { + if (!is.null(log_file)) cat(sprintf("[%s] FTPS run end: all OK\n", Sys.time()), + file = log_file, append = TRUE) + return(ok_ids_1) + } + + # Pass 2: 60s per-file cap where we retry any failures + message("FTPS pass 2 (60s timeout) for failed genomes") + future::plan(future::multisession, workers = max(1, workers_second)) + res2 <- future.apply::future_lapply( + fail_ids, + function(gid) { + ok <- .ftpes_download_one(gid, out_dir, + connect_timeout = 10L, max_time = 60L, + speed_time = 30L, speed_limit = 2048L) + list(gid = gid, ok = ok) + }, + future.seed = TRUE + ) + ok2 <- vapply(res2, `[[`, logical(1), "ok") + ok_ids_2 <- fail_ids[ok2] + still_fail <- setdiff(fail_ids, ok_ids_2) + message(sprintf("Pass 2: ok=%d, still_fail=%d", length(ok_ids_2), length(still_fail))) + if (!is.null(log_file)) { + cat(sprintf("[%s] Pass2 ok=%d still_fail=%d\n", Sys.time(), length(ok_ids_2), length(still_fail)), + file = log_file, append = TRUE) + if (length(still_fail)) { + cat("Fail IDs (excluded): ", paste(head(still_fail, 50), collapse = ", "), "\n", + file = log_file, append = TRUE) + } + } + + unique(c(ok_ids_1, ok_ids_2)) +} + #' Fetch BV-BRC bacterial genome data #' #' Run the BV-BRC CLI (`p3-all-genomes`) inside a Docker image and return results as a tibble. @@ -26,7 +189,8 @@ "--in superkingdom,Bacteria ", "--eq genome_quality,Good ", "--in genome_status,WGS,Complete ", - "--attr genome_id,genome_name,taxon_id,species,strain,_version_,", + "--attr genome_id,genome_name,genome_quality,genome_status,", + "taxon_id,species,strain,_version_,", "collection_year,state_province,latitude,longitude,", "antimicrobial_resistance_evidence,assembly_accession,isolation_country,", "isolation_source,disease,host_common_name" @@ -44,10 +208,10 @@ stop("BV-BRC returned no data. 
The service may be unavailable.") } - # Parse + # Parse and coerce to character to avoid numeric interpretation losing trailing 0s df <- utils::read.table( text = raw_data, sep = "\t", header = TRUE, fill = TRUE, - quote = "", check.names = FALSE, comment.char = "" + quote = "", check.names = FALSE, comment.char = "", colClasses = "character" ) df <- tibble::as_tibble(df) |> dplyr::mutate(dplyr::across(dplyr::everything(), ~ iconv(.x, from = "", to = "UTF-8", sub = ""))) @@ -230,7 +394,7 @@ ) for (user_bac in user_bacs) { - if (suppressWarnings(!is.na(as.numeric(user_bac)))) { + if (.id_checker(user_bac)) { message("Numeric input detected: ", user_bac) if (user_bac %in% bvbrc_bacs$genome.taxon_id) { bac_name <- bvbrc_bacs$genome.species[bvbrc_bacs$genome.taxon_id == user_bac] @@ -294,10 +458,11 @@ return(cand) } if (query_type == "taxon_id") { - nums <- suppressWarnings(!is.na(as.numeric(user_bacs))) + nums <- .id_checker(user_bacs) if (!any(nums)) stop("Cannot infer taxon_id from user_bacs. 
Provide query_value.") return(as.character(user_bacs[which(nums)[1]])) } + stop("Unsupported query_type: ", query_type) } @@ -318,12 +483,12 @@ #' @examples #' .generateDBname(c("90371", "Bacillus subtilis")) #' .generateDBname(c("12345", "Escherichia coli", "Lactobacillus")) -#' +#' @keywords internal .generateDBname <- function(user_bacs) { db_parts <- c() for (user_bac in user_bacs) { - if (suppressWarnings(!is.na(as.numeric(user_bac)))) { + if (.id_checker(user_bac)) { db_parts <- c(db_parts, paste0("tid_", user_bac)) } else { parts <- stringr::str_split(user_bac, " ")[[1]] @@ -437,7 +602,7 @@ data_result <- tibble::as_tibble( utils::read.table( text = data_raw, sep = "\t", header = TRUE, fill = TRUE, - quote = "", check.names = FALSE, comment.char = "" + quote = "", check.names = FALSE, comment.char = "", colClasses = "character" ) ) |> dplyr::mutate( @@ -462,17 +627,11 @@ #' Retrieve genome IDs for each taxon via BV-BRC and DuckDB #' -#' Resolves user-provided taxa to taxon IDs, queries BV-BRC per unique taxon ID, -#' and returns distinct genome IDs. Uses a per-selection DuckDB located under: -#' /data//.duckdb -#' BV-BRC column names are preserved. -#' -#' @param base_dir Character. Project root directory. Default ".". -#' @param user_bacs Character vector. Mixed inputs of taxon IDs and/or species names. -#' @param overwrite Logical. If FALSE and the DuckDB already exists for this selection, abort. Default: FALSE. -#' @param verbose Logical. +#' Resolves user-provided taxa to taxon IDs using the local BV-BRC cache, then +#' returns distinct genome IDs (Good + WGS/Complete). Also writes a 'bac_data' +#' table to the per-selection DuckDB with a cache snapshot of core fields. #' -#' @return A numeric vector of distinct `genome.genome_id`, or NULL if none found. +#' @return A character vector of distinct `genome.genome_id`, or NULL if none found. 
.retrieveQueryIDs <- function(base_dir = ".", user_bacs, overwrite = FALSE, @@ -481,60 +640,68 @@ if (isTRUE(verbose)) message("Resolving input taxa.") bac_input_data <- .retrieveCustomQuery(base_dir = base_dir, user_bacs = user_bacs) - if (is.null(bac_input_data) || nrow(bac_input_data) == 0) { message("No valid input provided or no matches found.") return(NULL) } - # Resolve per-bug DB path - paths <- .buildDBpath(base_dir = base_dir, user_bacs = user_bacs, overwrite = overwrite) + # Per-selection DB path + paths <- .buildDBpath(base_dir = base_dir, user_bacs = user_bacs, overwrite = overwrite) db_path <- paths$db_path - bac_data <- tibble::tibble() - taxon_ids <- unique(bac_input_data$genome.taxon_id) - - if (isTRUE(verbose)) message("Querying BV-BRC for ", length(taxon_ids), " taxon IDs.") - - for (i in seq_along(taxon_ids)) { - tax <- taxon_ids[i] - if (isTRUE(verbose)) message("Taxon ", i, "/", length(taxon_ids), ": ", tax) - - res <- .getGenomeIDs( - base_dir = base_dir, - query_type = "taxon_id", - query_value = as.character(tax), - user_bacs = user_bacs, - overwrite = TRUE, - verbose = verbose - ) - - con <- res$duckdbConnection - tbl <- res$table_name - each_bac_data <- tibble::as_tibble(DBI::dbReadTable(con, tbl)) - bac_data <- dplyr::bind_rows(bac_data, each_bac_data) - - # Close per-iteration connection to avoid open handles piling up - try(DBI::dbDisconnect(con, shutdown = TRUE), silent = TRUE) + # Got a cache? Use that, it's fast + cache_db <- file.path(base_dir, "data", "bvbrc", "bvbrcData.duckdb") + if (!file.exists(cache_db)) { + stop("BV-BRC cache not found at: ", cache_db, ". 
Run .updateBVBRCdata() first.") } + con_cache <- DBI::dbConnect(duckdb::duckdb(), dbdir = cache_db, read_only = TRUE) + on.exit(try(DBI::dbDisconnect(con_cache, shutdown = TRUE), silent = TRUE), add = TRUE) - if (nrow(bac_data) > 0) { - genome_ids <- bac_data |> - dplyr::distinct(`genome.genome_id`) |> - dplyr::pull(`genome.genome_id`) - genome_ids <- genome_ids[!is.na(genome_ids)] + taxon_ids <- unique(bac_input_data$genome.taxon_id) + if (isTRUE(verbose)) message("Querying cache for ", length(taxon_ids), " taxon IDs.") + + taxon_list_sql <- paste(DBI::dbQuoteString(con_cache, taxon_ids), collapse = ", ") + sql_ids <- sprintf( + "SELECT DISTINCT + \"genome.genome_id\" AS gid, + \"genome.genome_name\" AS genome_name, + \"genome.taxon_id\" AS taxon_id, + \"genome.species\" AS species, + \"genome.strain\" AS strain + FROM bvbrc_bac_data + WHERE \"genome.taxon_id\" IN (%s) + AND \"genome.genome_quality\" = 'Good' + AND \"genome.genome_status\" IN ('WGS','Complete')", + taxon_list_sql + ) + cache_rows <- DBI::dbGetQuery(con_cache, sql_ids) + valid_id_re <- "^[0-9]+\\.[0-9]+$" + cache_rows <- cache_rows[grepl(valid_id_re, cache_rows$gid), , drop = FALSE] - if (length(genome_ids) > 0) { - if (isTRUE(verbose)) message("Collected ", length(genome_ids), " distinct genome IDs.") - return(genome_ids) - } else { - message("No valid genome IDs found.") - return(NULL) - } - } else { - message("No valid genome data found for any input.") + if (nrow(cache_rows) == 0L) { + message("No valid genome IDs found.") return(NULL) } + + genome_ids <- unique(cache_rows$gid) + if (isTRUE(verbose)) message("Collected ", length(genome_ids), " distinct genome IDs (cache).") + + # Write 'bac_data' from cache + con <- DBI::dbConnect(duckdb::duckdb(), dbdir = db_path) + on.exit(try(DBI::dbDisconnect(con, shutdown = TRUE), silent = TRUE), add = TRUE) + + bac_data_tbl <- data.frame( + `genome.genome_id` = cache_rows$gid, + `genome.genome_name` = cache_rows$genome_name, + `genome.taxon_id` = 
cache_rows$taxon_id, + `genome.species` = cache_rows$species, + `genome.strain` = cache_rows$strain, + stringsAsFactors = FALSE + ) + DBI::dbWriteTable(con, "bac_data", bac_data_tbl, overwrite = TRUE) + if (isTRUE(verbose)) message("Wrote table 'bac_data' to: ", db_path) + + genome_ids } #' Extract AMR Data Table @@ -685,7 +852,7 @@ writeLines(genome_ids_output, con = tmp_in) tmp_in_mounted <- file.path("/data", "tmp", basename(tmp_in)) - # Choose attributes (AMR for this pipeline) + # Choose attributes (AMR for this workflow) chosen_fields <- if (identical(filter_type, "AMR")) amr_fields else microtrait_fields get_args <- c( @@ -707,7 +874,6 @@ return(genome_data) } - #' Retrieve AMR or Microtrait metadata from BV-BRC and store in DuckDB #' #' Queries BV-BRC for AMR or microtrait metadata for genomes corresponding to user inputs. @@ -716,7 +882,7 @@ #' Tables written: #' - amr_phenotype #' - genome_data -#' - metadata (inner join on genome ID field names as returned by BV-BRC) +#' - metadata (join on genome IDs returned by BV-BRC) #' #' @param user_bacs Character vector. Mixed taxon IDs and/or species strings (used for naming). #' @param filter_type Character. "AMR" or "microTraits". Default "AMR". 
@@ -748,7 +914,7 @@ retrieveMetadata <- function(user_bacs, return(NULL) } - # Fields (unchanged BV-BRC names) + # Desired fields from BV-BRC drug_fields <- paste0( "antibiotic,computational_method,", "evidence,genome_name,id,", @@ -798,7 +964,7 @@ retrieveMetadata <- function(user_bacs, "trna" ) - # Batching + # Batching downloads and parallel implementation batch_size <- 500L genome_ids <- as.character(genome_ids) genome_batches <- split(genome_ids, ceiling(seq_along(genome_ids) / batch_size)) @@ -818,10 +984,10 @@ retrieveMetadata <- function(user_bacs, envir = environment() ) parallel::clusterEvalQ(cluster, { - library(tibble) - library(dplyr) + library(tibble); library(dplyr) }) + # Pull AMR metadata if (isTRUE(verbose)) message("Retrieving AMR phenotype data in batches.") batch_drug_data <- parallel::parLapply(cluster, genome_batches, function(batch) { .extractAMRtable( @@ -838,17 +1004,15 @@ retrieveMetadata <- function(user_bacs, message("No drug data returned.") return(NULL) } - - combined_drug_data_tbl <- tibble::as_tibble( - utils::read.table( - text = combined_drug_data, - sep = "\t", header = TRUE, fill = TRUE, - quote = "", check.names = FALSE, comment.char = "" - ) - ) |> + combined_drug_data_tbl <- tibble::as_tibble(utils::read.table( + text = combined_drug_data, + sep = "\t", header = TRUE, fill = TRUE, + quote = "", check.names = FALSE, comment.char = "", colClasses = "character" + )) |> dplyr::mutate(dplyr::across(dplyr::everything(), ~ iconv(.x, from = "", to = "UTF-8", sub = ""))) |> dplyr::mutate(`genome_drug.genome_id` = as.character(`genome_drug.genome_id`)) + # Pull genome metadata if (isTRUE(verbose)) message("Retrieving genome metadata in batches.") batch_genome_data <- parallel::parLapply(cluster, genome_batches, function(batch) { .extractGenomeData( @@ -866,49 +1030,81 @@ retrieveMetadata <- function(user_bacs, message("No genome data returned.") return(NULL) } - - combined_genome_data_tbl <- tibble::as_tibble( - utils::read.table( - 
text = combined_genome_data, - sep = "\t", header = TRUE, fill = TRUE, - quote = "", check.names = FALSE, comment.char = "" - ) - ) |> + combined_genome_data_tbl <- tibble::as_tibble(utils::read.table( + text = combined_genome_data, + sep = "\t", header = TRUE, fill = TRUE, + quote = "", check.names = FALSE, comment.char = "", colClasses = "character" + )) |> dplyr::mutate(dplyr::across(dplyr::everything(), ~ iconv(.x, from = "", to = "UTF-8", sub = ""))) |> dplyr::mutate(`genome.genome_id` = as.character(`genome.genome_id`)) - # Per-bug DB path - paths <- .buildDBpath(base_dir = base_dir, user_bacs = user_bacs, overwrite = overwrite) + # Write & join + paths <- .buildDBpath(base_dir = base_dir, user_bacs = user_bacs, overwrite = overwrite) db_path <- paths$db_path - logs_dir <- file.path(base_dir, "data", "logs") dir.create(logs_dir, recursive = TRUE, showWarnings = FALSE) cat(sprintf("[%s] Writing metadata DuckDB: %s\n", Sys.time(), db_path), - file = file.path(logs_dir, "bvbrc.log"), append = TRUE - ) + file = file.path(logs_dir, "bvbrc.log"), append = TRUE) con <- DBI::dbConnect(duckdb::duckdb(), dbdir = db_path) + on.exit(try(DBI::dbDisconnect(con, shutdown = TRUE), silent = TRUE), add = TRUE) + DBI::dbWriteTable(con, "amr_phenotype", combined_drug_data_tbl, overwrite = TRUE) - DBI::dbWriteTable(con, "genome_data", combined_genome_data_tbl, overwrite = TRUE) + DBI::dbWriteTable(con, "genome_data", combined_genome_data_tbl, overwrite = TRUE) if (isTRUE(verbose)) message("Joining AMR phenotype and genome metadata.") - initial_metadata <- tibble::as_tibble(DBI::dbGetQuery( + DBI::dbExecute(con, ' + CREATE OR REPLACE TABLE metadata AS + SELECT * + FROM amr_phenotype + INNER JOIN genome_data + ON amr_phenotype."genome_drug.genome_id" = genome_data."genome.genome_id" + ') + + # Debug summary after writes + n_targets <- length(genome_ids) + n_amr_ids <- DBI::dbGetQuery(con, 'SELECT COUNT(DISTINCT "genome_drug.genome_id") AS n FROM amr_phenotype')$n + n_gmeta_ids <- 
DBI::dbGetQuery(con, 'SELECT COUNT(DISTINCT "genome.genome_id") AS n FROM genome_data')$n + if (isTRUE(verbose)) { + message("Initial summary: targets=", n_targets, + " | AMR genomes=", n_amr_ids, + " | genome_data genomes=", n_gmeta_ids) + } + + # Tagged view + .create_amr_tagged_view(con) + + # Final debug summary + n_bac <- if ("bac_data" %in% DBI::dbListTables(con)) + DBI::dbGetQuery(con, 'SELECT COUNT(DISTINCT "genome.genome_id") AS n FROM bac_data')$n else NA_integer_ + n_amr_ids <- DBI::dbGetQuery(con, 'SELECT COUNT(DISTINCT "genome_drug.genome_id") AS n FROM amr_phenotype')$n + n_gmeta_ids <- DBI::dbGetQuery(con, 'SELECT COUNT(DISTINCT "genome.genome_id") AS n FROM genome_data')$n + + ids_zero_amr <- DBI::dbGetQuery( con, - 'SELECT * - FROM amr_phenotype - INNER JOIN genome_data - ON amr_phenotype."genome_drug.genome_id" = genome_data."genome.genome_id"' - )) - DBI::dbWriteTable(con, "metadata", initial_metadata, overwrite = TRUE) + 'WITH sel AS (SELECT DISTINCT "genome.genome_id" AS gid FROM genome_data), + got AS (SELECT DISTINCT "genome_drug.genome_id" AS gid FROM amr_phenotype) + SELECT sel.gid + FROM sel LEFT JOIN got USING (gid) + WHERE got.gid IS NULL + ORDER BY sel.gid' + )$gid if (isTRUE(verbose)) { - message("Wrote tables 'amr_phenotype', 'genome_data', and 'metadata' to: ", db_path) + message("Final summary:") + message(" targets : ", length(genome_ids)) + message(" bac_data : ", n_bac) + message(" genome_data : ", n_gmeta_ids) + message(" amr_phenotype: ", n_amr_ids) + message(" genomes with 0 AMR rows: ", length(ids_zero_amr)) + if (length(ids_zero_amr)) { + message(" e.g.: ", paste(utils::head(ids_zero_amr, 10), collapse = ", ")) + } } list(duckdbConnection = con, table_name = "metadata") } - # FASTA sanitizer to ensure Panaroo compatibility with BV-BRC CLI downloads .strip_fasta_preamble <- function(fna_path) { if (!file.exists(fna_path)) { @@ -966,37 +1162,70 @@ retrieveMetadata <- function(user_bacs, #' derive genome IDs from user_bacs 
(taxon IDs or species substring), and #' write a minimal "filtered" table (without AMR evidence filtering). #' +#' @param evidence_mode Character. Either "lab_only" (default), "lab_or_comp" (all), +#' "comp_only" (BV-BRC-predicted without lab labels), or "any" (no AMR data required). +#' @return A list with a DuckDB connection and table_name = "filtered" +#' Filter genomes by AMR phenotype and metadata, and store results in DuckDB +#' +#' Preferred path: use per-selection DB "metadata" table (from retrieveMetadata()) and +#' apply evidence & genome_quality filters. +#' +#' Fallback path: if "metadata" is missing and fallback_to_bvbrc_cache = TRUE, +#' read BV-BRC cache at /data/bvbrc/bvbrcData.duckdb ("bvbrc_bac_data"), +#' derive genome IDs from user_bacs (taxon IDs or species substring), and +#' write a minimal "filtered" table (without AMR evidence filtering). +#' +#' @param evidence_mode Character. One of: +#' "lab_only" (default) -> only laboratory evidence +#' "lab_or_comp" -> laboratory OR computational evidence +#' "comp_only" -> only computational evidence +#' "any" -> no AMR required (from genome_data; Good only) #' @return A list with a DuckDB connection and table_name = "filtered" .filterGenomes <- function(user_bacs, base_dir = ".", + evidence_mode = c("lab_only","lab_or_comp","comp_only","any"), verbose = TRUE, fallback_to_bvbrc_cache = TRUE) { + evidence_mode <- match.arg(evidence_mode) base_dir <- normalizePath(base_dir, mustWork = FALSE) - paths <- .buildDBpath(base_dir = base_dir, user_bacs = user_bacs) - db_path <- paths$db_path + paths <- .buildDBpath(base_dir = base_dir, user_bacs = user_bacs) + db_path <- paths$db_path con <- DBI::dbConnect(duckdb::duckdb(), dbdir = db_path) + on.exit({ NULL }, add = TRUE) # keep open for caller - on.exit( - { - # Keep connection open for caller - NULL - }, - add = TRUE - ) - - # Good metadata present? 
Apply AMR filters + # The convenient "Metadata Exists" path if ("metadata" %in% DBI::dbListTables(con)) { if (isTRUE(verbose)) message("Loading metadata for filtering.") - initial_metadata <- DBI::dbReadTable(con, "metadata") - if (is.null(initial_metadata) || nrow(initial_metadata) == 0) { + + # If "any", AMR data not needed, just fetch qualifying genome_data + if (evidence_mode == "any") { + gd <- DBI::dbReadTable(con, "genome_data") + if (is.null(gd) || nrow(gd) == 0) { + DBI::dbDisconnect(con, shutdown = TRUE) + stop("No data available in 'genome_data'.") + } + gd <- tibble::as_tibble(gd) |> + dplyr::filter(`genome.genome_quality` == "Good") |> + dplyr::distinct(`genome.genome_id`) + # Keep BV-BRC column name + DBI::dbWriteTable(con, "filtered", gd, overwrite = TRUE) + if (isTRUE(verbose)) { + message("Post-filter distinct genomes (any): ", nrow(gd)) + message("Wrote table 'filtered' to: ", db_path) + } + return(list(duckdbConnection = con, table_name = "filtered")) + } + + # Otherwise, open up the metadata and start interrogating the AMR data + md <- DBI::dbReadTable(con, "metadata") + if (is.null(md) || nrow(md) == 0) { DBI::dbDisconnect(con, shutdown = TRUE) message("No data available in 'metadata'.") return(NULL) } - # Normalize evidence labels - initial_metadata <- tibble::as_tibble(initial_metadata) |> + md <- tibble::as_tibble(md) |> dplyr::mutate( `genome_drug.evidence` = dplyr::case_when( `genome_drug.laboratory_typing_method` %in% @@ -1006,29 +1235,39 @@ retrieveMetadata <- function(user_bacs, ) ) - # AMR and quality filtering - filtered_metadata <- initial_metadata |> - dplyr::filter(`genome_drug.evidence` == "Laboratory Method") |> + if (evidence_mode == "lab_only") { + md <- dplyr::filter(md, `genome_drug.evidence` == "Laboratory Method") + } else if (evidence_mode == "comp_only") { + md <- dplyr::filter( + md, + `genome_drug.evidence` == "Computational Method" | + (!is.na(`genome_drug.computational_method`) & 
nzchar(`genome_drug.computational_method`)) | + (`genome_drug.laboratory_typing_method` == "Computational Prediction") + ) + } else { + # Lab_or_comp: Doesn't matter, keep any predictions + md <- md + } + + md <- md |> dplyr::filter(`genome.genome_quality` == "Good") |> - dplyr::filter(`genome_drug.resistant_phenotype` %in% c("Resistant", "Susceptible", "Intermediate")) + dplyr::filter(`genome_drug.resistant_phenotype` %in% c("Resistant","Susceptible","Intermediate")) |> + dplyr::distinct(`genome.genome_id`) # keep BV-BRC column name - DBI::dbWriteTable(con, "filtered", filtered_metadata, overwrite = TRUE) + DBI::dbWriteTable(con, "filtered", md, overwrite = TRUE) if (isTRUE(verbose)) { - message("Post-filter rows: ", nrow(filtered_metadata)) + message("Post-filter distinct genomes: ", nrow(md)) message("Wrote table 'filtered' to: ", db_path) } return(list(duckdbConnection = con, table_name = "filtered")) } - # Attempt BV-BRC cache location + # No metadata fallback if (!isTRUE(fallback_to_bvbrc_cache)) { DBI::dbDisconnect(con, shutdown = TRUE) stop("No 'metadata' table found in ", db_path, ". Run retrieveMetadata() first.") } - - if (isTRUE(verbose)) { - message("No 'metadata' in per-selection DB. Falling back to BV-BRC cache at data/bvbrc/.") - } + if (isTRUE(verbose)) message("No 'metadata' in per-selection DB. Falling back to BV-BRC cache at data/bvbrc/.") cache_db <- file.path(base_dir, "data", "bvbrc", "bvbrcData.duckdb") if (!file.exists(cache_db)) { @@ -1036,7 +1275,7 @@ retrieveMetadata <- function(user_bacs, stop("BV-BRC cache not found at: ", cache_db, ". 
Run .updateBVBRCdata() first.") } - con_cache <- DBI::dbConnect(duckdb::duckdb(), dbdir = cache_db) + con_cache <- DBI::dbConnect(duckdb::duckdb(), dbdir = cache_db, read_only = TRUE) on.exit(try(DBI::dbDisconnect(con_cache, shutdown = TRUE), silent = TRUE), add = TRUE) if (!"bvbrc_bac_data" %in% DBI::dbListTables(con_cache)) { @@ -1044,32 +1283,22 @@ retrieveMetadata <- function(user_bacs, stop("Table 'bvbrc_bac_data' not found in BV-BRC cache: ", cache_db) } - bv <- tibble::as_tibble(DBI::dbReadTable(con_cache, "bvbrc_bac_data")) - - - # Derive matches from user_bacs (taxon IDs or species) + bv <- tibble::as_tibble(DBI::dbReadTable(con_cache, "bvbrc_bac_data")) sel <- tibble::tibble(`genome.genome_id` = character()) for (v in user_bacs) { - if (suppressWarnings(!is.na(as.numeric(v)))) { - # numeric taxon_id match - matches <- bv[bv$`genome.taxon_id` == v | - bv$`genome.taxon_id` == as.numeric(v), , drop = FALSE] + if (.id_checker(v)) { + v_chr <- as.character(v) + matches <- bv[bv$`genome.taxon_id` == v_chr, , drop = FALSE] } else { - # species substring (case-insensitive) matches <- bv[stringr::str_detect( bv$`genome.species`, stringr::fixed(v, ignore_case = TRUE) ), , drop = FALSE] } - if (nrow(matches)) { - sel <- dplyr::bind_rows( - sel, - tibble::tibble( - `genome.genome_id` = as.character(matches$`genome.genome_id`) - ) - ) + sel <- dplyr::bind_rows(sel, + tibble::tibble(`genome.genome_id` = as.character(matches$`genome.genome_id`))) } } sel <- dplyr::distinct(sel) @@ -1079,37 +1308,14 @@ retrieveMetadata <- function(user_bacs, stop("No genomes matched user_bacs in BV-BRC cache.") } - # Minimal 'filtered' for downstream steps (downloaders & genomeList) - minimal_filtered <- sel - # Ensure expected column name used downstream: - # retrieveGenomes() reads "filtered" and expects `genome.genome_id` to exist. 
- DBI::dbWriteTable(con, "filtered", minimal_filtered, overwrite = TRUE) - - if (isTRUE(verbose)) { - message("Wrote table 'filtered' to: ", db_path) - } + DBI::dbWriteTable(con, "filtered", sel, overwrite = TRUE) + if (isTRUE(verbose)) message("Wrote table 'filtered' to: ", db_path) list(duckdbConnection = con, table_name = "filtered") } - -# BV-BRC downloader block -# Normalize Docker path -.docker_path <- function(p) gsub("\\\\", "/", normalizePath(p, mustWork = FALSE)) - -# Prefer bash -.pick_shell <- function(image) { - chk <- suppressWarnings(system2("docker", - c( - "run", "--rm", image, "sh", "-lc", - "command -v bash >/dev/null || echo NOBASH" - ), - stdout = TRUE, stderr = TRUE - )) - if (length(chk) && any(grepl("NOBASH", chk))) "sh" else "bash" -} - -# Check if a complete set (.fna + .PATRIC.faa + .PATRIC.gff) exists after DL +#' Helps check if a complete set exists after DL (.fna + .PATRIC.faa + .PATRIC.gff) +#' @keywords internal .is_complete_set <- function(dir, genomeID, min_bytes = 100) { fna <- file.path(dir, paste0(genomeID, ".fna")) faa <- file.path(dir, paste0(genomeID, ".PATRIC.faa")) @@ -1119,17 +1325,14 @@ retrieveMetadata <- function(user_bacs, all(vapply(paths, function(x) file.info(x)$size, numeric(1)) > min_bytes) } -# List genomes already completed +#' Helps collate completed genomes into a set +#' @keywords internal .list_complete <- function(dir, genome_ids, min_bytes = 100) { genome_ids[vapply(genome_ids, .is_complete_set, logical(1), dir = dir, min_bytes = min_bytes)] } -# Any genomes missing bits or pieces? 
Find em -.missing_any <- function(dir, genome_ids, min_bytes = 100) { - genome_ids[!vapply(genome_ids, .is_complete_set, logical(1), dir = dir, min_bytes = min_bytes)] -} - -# Audit function +#' Helps in auditing downloaded files to ensure everything's complete per ID +#' @keywords internal .audit_gaps <- function(out_dir, ids, min_bytes = 100) { out_dir <- normalizePath(out_dir, mustWork = FALSE) df <- data.frame( @@ -1148,68 +1351,27 @@ retrieveMetadata <- function(user_bacs, df } -# FTPS preferred! HTTPS as a fallback to see if it's just an FTPS issue -.ftp_download_one <- function(genomeID, out_dir, tries = 3L, min_bytes = 100) { - files <- c(".fna", ".PATRIC.faa", ".PATRIC.gff") - dests <- file.path(out_dir, paste0(genomeID, files)) - if (.is_complete_set(out_dir, genomeID, min_bytes)) { - return(TRUE) - } - - get_one <- function(url, dest) { - if (nzchar(Sys.which("wget"))) { - res <- suppressWarnings(system2("wget", - c("-q", "-O", shQuote(dest), shQuote(url)), - stdout = TRUE, stderr = TRUE - )) - st <- attr(res, "status") - if (is.null(st)) st <- 0L - return(st == 0L && file.exists(dest) && file.info(dest)$size > min_bytes) - } else if (nzchar(Sys.which("curl"))) { - curl_args <- if (startsWith(url, "ftps://")) { - c("--ssl-reqd", "-L", "-o", shQuote(dest), shQuote(url)) - } else { - c("-L", "-o", shQuote(dest), shQuote(url)) - } - res <- suppressWarnings(system2("curl", curl_args, stdout = TRUE, stderr = TRUE)) - st <- attr(res, "status") - if (is.null(st)) st <- 0L - return(st == 0L && file.exists(dest) && file.info(dest)$size > min_bytes) - } - FALSE - } +### BV-BRC CLI downloader [slower by comparison, but does not need FTP server] - for (ext_i in seq_along(files)) { - dest <- dests[ext_i] - if (file.exists(dest) && file.info(dest)$size > min_bytes) next - ftps <- sprintf("ftps://ftp.bv-brc.org/genomes/%s/%s%s", genomeID, genomeID, files[ext_i]) - https <- sprintf("https://ftp.bv-brc.org/genomes/%s/%s%s", genomeID, genomeID, files[ext_i]) - - ok <- 
FALSE - for (k in 1:tries) { - if (get_one(ftps, dest)) { - ok <- TRUE - break - } - } - if (!ok) { - for (k in 1:2) { - if (get_one(https, dest)) { - ok <- TRUE - break - } - } - } - if (!ok) { - return(FALSE) - } - } - .is_complete_set(out_dir, genomeID, min_bytes) -} +#' Helps normalize Docker paths +#' @keywords internal +.docker_path <- function(p) gsub("\\\\", "/", normalizePath(p, mustWork = FALSE)) -# BV-BRC CLI downloader [very slow by comparison, but does not need FTP server] +#' Helps run a shell inside a container, and prefers bash (don't we all?) +#' @keywords internal +.pick_shell <- function(image) { + chk <- suppressWarnings(system2("docker", + c( + "run", "--rm", image, "sh", "-lc", + "command -v bash >/dev/null || echo NOBASH" + ), + stdout = TRUE, stderr = TRUE + )) + if (length(chk) && any(grepl("NOBASH", chk))) "sh" else "bash" +} -# p3-dump-genomes to fetch FASTA and .gto files +#' Using p3-dump-genomes in CLI to fetch FASTA and .gto files +#' @keywords internal .cli_dump_fastas_gto_chunk <- function(image, out_dir, genome_ids, tag, tries = 3L) { out_dir <- normalizePath(out_dir, mustWork = FALSE) dir.create(out_dir, recursive = TRUE, showWarnings = FALSE) @@ -1254,7 +1416,8 @@ retrieveMetadata <- function(user_bacs, st == 0L } -# Export GFF from GTO per genomes in each chunk +#' Exports GFF files from the downloaded GTO per genome in each chunk +#' @keywords internal .cli_export_gff_chunk <- function(image, out_dir, genome_ids, tag, tries = 3L) { out_dir <- normalizePath(out_dir, mustWork = FALSE) ids_file <- file.path(out_dir, paste0("ids_", tag, ".txt")) @@ -1344,36 +1507,45 @@ retrieveGenomes <- function(base_dir = ".", method = c("ftp", "cli"), image = "danylmb/bvbrc:5.3", skip_existing = TRUE, - ftp_workers = 8L, + ftp_workers = 4L, cli_fasta_workers = 4L, cli_gff_workers = 4L, chunk_size = 50L, + evidence_mode = c("lab_only","lab_or_comp","comp_only","any"), # NEW verbose = TRUE) { method <- match.arg(method) + evidence_mode <- 
match.arg(evidence_mode) base_dir <- normalizePath(base_dir, mustWork = FALSE) - # IDs from .filterGenomes() - if (isTRUE(verbose)) message("Filtering genomes before download.") - f_out <- .filterGenomes(base_dir = base_dir, user_bacs = user_bacs, verbose = verbose) - if (is.null(f_out)) { - return(character()) + # Use 'filtered' if already prepared, or start filtering + if (isTRUE(verbose)) message("Preparing download set (checking for existing 'filtered').") + paths <- .buildDBpath(base_dir = base_dir, user_bacs = user_bacs) + db_path <- paths$db_path + con0 <- DBI::dbConnect(duckdb::duckdb(), dbdir = db_path) + has_filtered <- "filtered" %in% DBI::dbListTables(con0) + if (has_filtered) { + if (isTRUE(verbose)) message("Using existing 'filtered' table (skipping re-filter).") + con <- con0; tbl <- "filtered" + } else { + if (isTRUE(verbose)) message("No 'filtered' table found; filtering now.") + f_out <- .filterGenomes(base_dir = base_dir, + user_bacs = user_bacs, + evidence_mode = evidence_mode, + verbose = verbose) + con <- f_out$duckdbConnection + tbl <- f_out$table_name } - con <- f_out$duckdbConnection - tbl <- f_out$table_name ids <- tibble::as_tibble(DBI::dbReadTable(con, tbl)) |> dplyr::distinct(`genome.genome_id`) |> dplyr::pull(`genome.genome_id`) - # Set up the paths and such - paths <- .buildDBpath(base_dir = base_dir, user_bacs = user_bacs) - bug_dir <- dirname(paths$db_path) + bug_dir <- dirname(db_path) genome_path <- file.path(bug_dir, "genomes") logs_dir <- file.path(base_dir, "data", "logs") dir.create(genome_path, recursive = TRUE, showWarnings = FALSE) dir.create(logs_dir, recursive = TRUE, showWarnings = FALSE) - # Adding support for resuming a download if (isTRUE(skip_existing)) { already <- .list_complete(genome_path, ids) if (isTRUE(verbose)) message(length(already), " genomes already completed; skipping.") @@ -1391,60 +1563,59 @@ retrieveGenomes <- function(base_dir = ".", return(all_complete) } - # FTP method -- preferred if server is 
available if (identical(method, "ftp")) { - if (isTRUE(verbose)) message("Trying FTPS download. Workers=", ftp_workers) - future::plan(future::multisession, workers = max(1, ftp_workers)) - ft_ok <- future.apply::future_lapply(ids, function(gid) .ftp_download_one(gid, genome_path), - future.seed = TRUE - ) - ok_ids <- ids[unlist(ft_ok)] - if (isTRUE(verbose)) message("Complete file sets for ", length(ok_ids), " genomes (FTP).") - return(c(ok_ids, .list_complete(genome_path, setdiff(ids, ok_ids)))) - } - - # CLI for FASTA, FAA, and GTO, then GFF from GTO - chunks <- split(ids, ceiling(seq_along(ids) / chunk_size)) - - # Parallel chunk containers - if (isTRUE(verbose)) { - message( - "CLI being run in parallel for ", length(chunks), - " data chunks." + if (isTRUE(verbose)) message("Downloading by FTPS.") + ok_ids <- .ftpes_download_two_pass( + genome_ids = ids, + out_dir = genome_path, + workers_first = ftp_workers, + workers_second = ftp_workers, + log_file = file.path(logs_dir, "ftp_download.log") ) + if (isTRUE(verbose)) { + message("Complete file sets for ", length(ok_ids), " genomes total.") + miss <- setdiff(ids, ok_ids) + if (length(miss)) message("Excluded (failed after retry): ", length(miss)) + } + return(ok_ids) } - future::plan(future::multisession, workers = max(1, cli_fasta_workers)) - fa_res <- future.apply::future_mapply( - FUN = function(vec, tag) .cli_dump_fastas_gto_chunk(image, genome_path, vec, tag), - vec = chunks, tag = paste0("fa", seq_along(chunks)), - SIMPLIFY = TRUE, future.seed = TRUE - ) - if (!all(fa_res) && isTRUE(verbose)) warning(sum(!fa_res), " data chunks failed.") - # GFF extraction in parallel containers - if (isTRUE(verbose)) { - message( - "GFF extraction being run in parallel for ", - length(chunks), " data chunks." 
- ) - } - future::plan(future::multisession, workers = max(1, cli_gff_workers)) - g_res <- future.apply::future_mapply( - FUN = function(vec, tag) .cli_export_gff_chunk(image, genome_path, vec, tag), - vec = chunks, tag = paste0("gff", seq_along(chunks)), - SIMPLIFY = TRUE, future.seed = TRUE - ) - if (!all(g_res) && isTRUE(verbose)) warning(sum(!g_res), " GFF chunks had failures.") + if (identical(method, "cli")) { + if (isTRUE(verbose)) { + message( + "CLI mode: downloading genomes in parallelized chunks. ", + "Chunks=", ceiling(length(ids)/chunk_size), + " | fasta_workers=", cli_fasta_workers, + " | gff_workers=", cli_gff_workers + ) + } - # Success set: .fna + .PATRIC.faa + .PATRIC.gff all present per isolate - ok_ids <- ids[vapply(ids, .is_complete_set, logical(1), dir = genome_path)] - if (isTRUE(verbose)) { - message( - "Complete file sets downloaded for ", - length(ok_ids), " genomes." - ) + chunks <- split(ids, ceiling(seq_along(ids) / chunk_size)) + + # FASTA + GTO in parallel containers + future::plan(future::multisession, workers = max(1, cli_fasta_workers)) + invisible(future.apply::future_mapply( + FUN = function(vec, tag) .cli_dump_fastas_gto_chunk(image, genome_path, vec, tag), + vec = chunks, tag = paste0("fa", seq_along(chunks)), + SIMPLIFY = TRUE, future.seed = TRUE + )) + + # GFF export in parallel containers + future::plan(future::multisession, workers = max(1, cli_gff_workers)) + invisible(future.apply::future_mapply( + FUN = function(vec, tag) .cli_export_gff_chunk(image, genome_path, vec, tag), + vec = chunks, tag = paste0("gff", seq_along(chunks)), + SIMPLIFY = TRUE, future.seed = TRUE + )) + + ok_ids <- ids[vapply(ids, .is_complete_set, logical(1), dir = genome_path)] + if (isTRUE(verbose)) { + message("CLI complete file sets for ", length(ok_ids), " genomes.") + miss <- setdiff(ids, ok_ids) + if (length(miss)) message("Excluded (CLI completion failed): ", length(miss)) + } + return(ok_ids) } - ok_ids } #' Build a table of local genome 
file paths and write to DuckDB @@ -1494,7 +1665,7 @@ genomeList <- function(base_dir = ".", faa_path = if (file.exists(faa_path) && file.info(faa_path)$size > 100) faa_path else NA, panaroo_input = if ( file.exists(gff_path) && file.exists(fna_path) && file.exists(faa_path) && - file.info(gff_path)$size > 100 && file.info(fna_path)$size > 100 && file.info(faa_path)$size > 100 + file.info(gff_path)$size > 100 && file.info(fna_path)$size > 100 && file.info(faa_path)$size > 100 ) { paste(gff_path, fna_path) } else { @@ -1546,6 +1717,9 @@ genomeList <- function(base_dir = ".", #' `"ftp"` (default) or `"cli"`. #' @param overwrite Logical. Passed to metadata filtering and DuckDB creation. #' Default FALSE. +#' @param evidence_mode Character. Sets which types of AMR evidence are acceptable. +#' Default `lab_only`. `any` will not require AMR data for downloads. This will +#' return very large download lists for many species! #' @param verbose Logical. Print progress messages. Default TRUE. #' #' @return A list (the output of `genomeList()`), containing: @@ -1557,32 +1731,64 @@ prepareGenomes <- function(user_bacs, base_dir = ".", method = c("ftp", "cli"), overwrite = FALSE, + evidence_mode = c("lab_only","lab_or_comp","comp_only","any"), verbose = TRUE) { method <- match.arg(method) + evidence_mode <- match.arg(evidence_mode) base_dir <- normalizePath(base_dir, mustWork = FALSE) - # Ensure the BV-BRC metadata cache exists, fetch if missing .ensure_bvbrc_cache(base_dir = base_dir, verbose = verbose) - if (isTRUE(verbose)) message("Step 1: Downloading genomes from BV-BRC") + if (isTRUE(verbose)) message("Step 0: Building AMR metadata (retrieveMetadata)") + invisible(retrieveMetadata( + user_bacs = user_bacs, + filter_type = "AMR", + base_dir = base_dir, + abx = "All", + overwrite = overwrite, + verbose = verbose + )) + + if (isTRUE(verbose)) message("Step 1: Filtering genomes for download by evidence: ", evidence_mode) + f_out <- .filterGenomes( + base_dir = base_dir, + 
user_bacs = user_bacs, + evidence_mode = evidence_mode, + verbose = verbose, + fallback_to_bvbrc_cache = FALSE + ) + if (is.null(f_out)) { + message("No genomes available after evidence filtering.") + return(NULL) + } - # Filter + download + # A little summary of what's left after filtering (or not) + paths <- .buildDBpath(base_dir = base_dir, user_bacs = user_bacs) + con <- DBI::dbConnect(duckdb::duckdb(), dbdir = paths$db_path, read_only = TRUE) + n_filtered <- DBI::dbGetQuery(con, 'SELECT COUNT(DISTINCT "genome.genome_id") AS n FROM filtered')$n + n_meta <- if ("genome_data" %in% DBI::dbListTables(con)) DBI::dbGetQuery(con, 'SELECT COUNT(DISTINCT "genome.genome_id") AS n FROM genome_data')$n else NA + n_amr <- if ("amr_phenotype" %in% DBI::dbListTables(con)) DBI::dbGetQuery(con, 'SELECT COUNT(DISTINCT "genome_drug.genome_id") AS n FROM amr_phenotype')$n else NA + DBI::dbDisconnect(con, shutdown = TRUE) + if (isTRUE(verbose)) { + message(sprintf("Evidence filter summary: filtered=%d | genomes with AMR=%s | genomes with genome_data=%s", + n_filtered, ifelse(is.na(n_amr), "NA", n_amr), ifelse(is.na(n_meta), "NA", n_meta))) + } + + if (isTRUE(verbose)) message("Step 2: Downloading genomes from BV-BRC (", method, ")") ids <- retrieveGenomes( base_dir = base_dir, user_bacs = user_bacs, method = method, skip_existing = !overwrite, + evidence_mode = evidence_mode, verbose = verbose ) - if (length(ids) == 0L) { - message("No genomes available after filtering.") + message("No genomes downloaded.") return(NULL) } - if (isTRUE(verbose)) message("Step 2: Formatting data into a database for further processing") - - # List local files into per-selection DB + if (isTRUE(verbose)) message("Step 3: Formatting data into a database for further processing") out <- genomeList( base_dir = base_dir, user_bacs = user_bacs, @@ -1590,7 +1796,7 @@ prepareGenomes <- function(user_bacs, ) if (isTRUE(verbose)) { - message("Done. Files are ready! 
Please continue to downstream processing by `runDataProcessing()`") + message("Done. Files are ready! Continue with downstream processing with runDataProcessing().") } out } From 5287d1bc90488555be025cf7a28f96933a447fab Mon Sep 17 00:00:00 2001 From: epbrenner Date: Wed, 25 Feb 2026 22:17:03 +0000 Subject: [PATCH 4/4] Style code (GHA) --- R/data_curation.R | 182 +++++++++++++++++++++++++++------------------- 1 file changed, 106 insertions(+), 76 deletions(-) diff --git a/R/data_curation.R b/R/data_curation.R index ac5095b..3cad422 100644 --- a/R/data_curation.R +++ b/R/data_curation.R @@ -8,7 +8,7 @@ #' Helps tag genomes with their AMR evidence for parsing #' @keywords internal .create_amr_tagged_view <- function(con) { - lab_methods <- c("Disk diffusion", "MIC", "Broth dilution", "Agar dilution") + lab_methods <- c("Disk diffusion", "MIC", "Broth dilution", "Agar dilution") lab_list_sql <- paste(DBI::dbQuoteString(con, lab_methods), collapse = ", ") comp_str_sql <- DBI::dbQuoteString(con, "Computational Method") @@ -35,16 +35,15 @@ #' @keywords internal .ftpes_download_one <- function(genomeID, out_dir, connect_timeout = 10L, - max_time = 30L, - speed_time = 30L, # end if speed_time - speed_limit = 2048L, # B/s - min_bytes = 100L) { - + max_time = 30L, + speed_time = 30L, # end if speed_time + speed_limit = 2048L, # B/s + min_bytes = 100L) { dir.create(out_dir, recursive = TRUE, showWarnings = FALSE) exts <- c(".fna", ".PATRIC.faa", ".PATRIC.gff") for (ext in exts) { - dest <- file.path(out_dir, paste0(genomeID, ext)) + dest <- file.path(out_dir, paste0(genomeID, ext)) dest_tmp <- paste0(dest, ".tmp") # Skip if we already completed these @@ -57,17 +56,18 @@ args <- c( "--fail", "--silent", "--show-error", "-L", "--connect-timeout", as.character(connect_timeout), - "--max-time", as.character(max_time), - "--speed-time", as.character(speed_time), - "--speed-limit", as.character(speed_limit), - "--ftp-ssl", # AUTH TLS on port 21 works in testing + "--max-time", 
as.character(max_time), + "--speed-time", as.character(speed_time), + "--speed-limit", as.character(speed_limit), + "--ftp-ssl", # AUTH TLS on port 21 works in testing "--ftp-pasv", "--disable-epsv", "--ipv4", "--user", "anonymous:", "-o", shQuote(dest_tmp), shQuote(url) ) res <- suppressWarnings(system2("curl", args = args, stdout = TRUE, stderr = TRUE)) - status <- attr(res, "status"); if (is.null(status)) status <- 0L + status <- attr(res, "status") + if (is.null(status)) status <- 0L # Avoiding 0B files cluttering up results -- atomic rename on successful tmp DL ok <- (status == 0L && file.exists(dest_tmp) && file.info(dest_tmp)$size > min_bytes) @@ -79,7 +79,7 @@ } } else { try(unlink(dest_tmp), silent = TRUE) - return(FALSE) # Abort early, don't accept partial set + return(FALSE) # Abort early, don't accept partial set } } @@ -92,16 +92,19 @@ #' hiccup. If 2nd pass fails, log and give up on that file. #' @keywords internal .ftpes_download_two_pass <- function(genome_ids, out_dir, - workers_first = 4L, + workers_first = 4L, workers_second = 4L, - log_file = NULL) { + log_file = NULL) { genome_ids <- unique(as.character(genome_ids)) - if (!length(genome_ids)) return(character(0)) + if (!length(genome_ids)) { + return(character(0)) + } if (!is.null(log_file)) { dir.create(dirname(log_file), recursive = TRUE, showWarnings = FALSE) cat(sprintf("[%s] FTPS run start: %d genomes\n", Sys.time(), length(genome_ids)), - file = log_file, append = TRUE) + file = log_file, append = TRUE + ) } # Pass 1: 30s per-file cap @@ -111,8 +114,9 @@ genome_ids, function(gid) { ok <- .ftpes_download_one(gid, out_dir, - connect_timeout = 10L, max_time = 30L, - speed_time = 30L, speed_limit = 2048L) + connect_timeout = 10L, max_time = 30L, + speed_time = 30L, speed_limit = 2048L + ) list(gid = gid, ok = ok) }, future.seed = TRUE @@ -123,12 +127,16 @@ message(sprintf("Pass 1: ok=%d, fail=%d", length(ok_ids_1), length(fail_ids))) if (!is.null(log_file)) { cat(sprintf("[%s] Pass1 ok=%d 
fail=%d\n", Sys.time(), length(ok_ids_1), length(fail_ids)), - file = log_file, append = TRUE) + file = log_file, append = TRUE + ) } if (!length(fail_ids)) { - if (!is.null(log_file)) cat(sprintf("[%s] FTPS run end: all OK\n", Sys.time()), - file = log_file, append = TRUE) + if (!is.null(log_file)) { + cat(sprintf("[%s] FTPS run end: all OK\n", Sys.time()), + file = log_file, append = TRUE + ) + } return(ok_ids_1) } @@ -139,22 +147,25 @@ fail_ids, function(gid) { ok <- .ftpes_download_one(gid, out_dir, - connect_timeout = 10L, max_time = 60L, - speed_time = 30L, speed_limit = 2048L) + connect_timeout = 10L, max_time = 60L, + speed_time = 30L, speed_limit = 2048L + ) list(gid = gid, ok = ok) }, future.seed = TRUE ) ok2 <- vapply(res2, `[[`, logical(1), "ok") - ok_ids_2 <- fail_ids[ok2] + ok_ids_2 <- fail_ids[ok2] still_fail <- setdiff(fail_ids, ok_ids_2) message(sprintf("Pass 2: ok=%d, still_fail=%d", length(ok_ids_2), length(still_fail))) if (!is.null(log_file)) { cat(sprintf("[%s] Pass2 ok=%d still_fail=%d\n", Sys.time(), length(ok_ids_2), length(still_fail)), - file = log_file, append = TRUE) + file = log_file, append = TRUE + ) if (length(still_fail)) { cat("Fail IDs (excluded): ", paste(head(still_fail, 50), collapse = ", "), "\n", - file = log_file, append = TRUE) + file = log_file, append = TRUE + ) } } @@ -646,7 +657,7 @@ } # Per-selection DB path - paths <- .buildDBpath(base_dir = base_dir, user_bacs = user_bacs, overwrite = overwrite) + paths <- .buildDBpath(base_dir = base_dir, user_bacs = user_bacs, overwrite = overwrite) db_path <- paths$db_path # Got a cache? 
Use that, it's fast @@ -691,11 +702,11 @@ on.exit(try(DBI::dbDisconnect(con, shutdown = TRUE), silent = TRUE), add = TRUE) bac_data_tbl <- data.frame( - `genome.genome_id` = cache_rows$gid, + `genome.genome_id` = cache_rows$gid, `genome.genome_name` = cache_rows$genome_name, - `genome.taxon_id` = cache_rows$taxon_id, - `genome.species` = cache_rows$species, - `genome.strain` = cache_rows$strain, + `genome.taxon_id` = cache_rows$taxon_id, + `genome.species` = cache_rows$species, + `genome.strain` = cache_rows$strain, stringsAsFactors = FALSE ) DBI::dbWriteTable(con, "bac_data", bac_data_tbl, overwrite = TRUE) @@ -984,7 +995,8 @@ retrieveMetadata <- function(user_bacs, envir = environment() ) parallel::clusterEvalQ(cluster, { - library(tibble); library(dplyr) + library(tibble) + library(dplyr) }) # Pull AMR metadata @@ -1039,18 +1051,19 @@ retrieveMetadata <- function(user_bacs, dplyr::mutate(`genome.genome_id` = as.character(`genome.genome_id`)) # Write & join - paths <- .buildDBpath(base_dir = base_dir, user_bacs = user_bacs, overwrite = overwrite) + paths <- .buildDBpath(base_dir = base_dir, user_bacs = user_bacs, overwrite = overwrite) db_path <- paths$db_path logs_dir <- file.path(base_dir, "data", "logs") dir.create(logs_dir, recursive = TRUE, showWarnings = FALSE) cat(sprintf("[%s] Writing metadata DuckDB: %s\n", Sys.time(), db_path), - file = file.path(logs_dir, "bvbrc.log"), append = TRUE) + file = file.path(logs_dir, "bvbrc.log"), append = TRUE + ) con <- DBI::dbConnect(duckdb::duckdb(), dbdir = db_path) on.exit(try(DBI::dbDisconnect(con, shutdown = TRUE), silent = TRUE), add = TRUE) DBI::dbWriteTable(con, "amr_phenotype", combined_drug_data_tbl, overwrite = TRUE) - DBI::dbWriteTable(con, "genome_data", combined_genome_data_tbl, overwrite = TRUE) + DBI::dbWriteTable(con, "genome_data", combined_genome_data_tbl, overwrite = TRUE) if (isTRUE(verbose)) message("Joining AMR phenotype and genome metadata.") DBI::dbExecute(con, ' @@ -1062,22 +1075,27 @@ 
retrieveMetadata <- function(user_bacs, ') # Debug summary after writes - n_targets <- length(genome_ids) - n_amr_ids <- DBI::dbGetQuery(con, 'SELECT COUNT(DISTINCT "genome_drug.genome_id") AS n FROM amr_phenotype')$n + n_targets <- length(genome_ids) + n_amr_ids <- DBI::dbGetQuery(con, 'SELECT COUNT(DISTINCT "genome_drug.genome_id") AS n FROM amr_phenotype')$n n_gmeta_ids <- DBI::dbGetQuery(con, 'SELECT COUNT(DISTINCT "genome.genome_id") AS n FROM genome_data')$n if (isTRUE(verbose)) { - message("Initial summary: targets=", n_targets, - " | AMR genomes=", n_amr_ids, - " | genome_data genomes=", n_gmeta_ids) + message( + "Initial summary: targets=", n_targets, + " | AMR genomes=", n_amr_ids, + " | genome_data genomes=", n_gmeta_ids + ) } # Tagged view .create_amr_tagged_view(con) # Final debug summary - n_bac <- if ("bac_data" %in% DBI::dbListTables(con)) - DBI::dbGetQuery(con, 'SELECT COUNT(DISTINCT "genome.genome_id") AS n FROM bac_data')$n else NA_integer_ - n_amr_ids <- DBI::dbGetQuery(con, 'SELECT COUNT(DISTINCT "genome_drug.genome_id") AS n FROM amr_phenotype')$n + n_bac <- if ("bac_data" %in% DBI::dbListTables(con)) { + DBI::dbGetQuery(con, 'SELECT COUNT(DISTINCT "genome.genome_id") AS n FROM bac_data')$n + } else { + NA_integer_ + } + n_amr_ids <- DBI::dbGetQuery(con, 'SELECT COUNT(DISTINCT "genome_drug.genome_id") AS n FROM amr_phenotype')$n n_gmeta_ids <- DBI::dbGetQuery(con, 'SELECT COUNT(DISTINCT "genome.genome_id") AS n FROM genome_data')$n ids_zero_amr <- DBI::dbGetQuery( @@ -1183,16 +1201,21 @@ retrieveMetadata <- function(user_bacs, #' @return A list with a DuckDB connection and table_name = "filtered" .filterGenomes <- function(user_bacs, base_dir = ".", - evidence_mode = c("lab_only","lab_or_comp","comp_only","any"), + evidence_mode = c("lab_only", "lab_or_comp", "comp_only", "any"), verbose = TRUE, fallback_to_bvbrc_cache = TRUE) { evidence_mode <- match.arg(evidence_mode) base_dir <- normalizePath(base_dir, mustWork = FALSE) - paths <- 
.buildDBpath(base_dir = base_dir, user_bacs = user_bacs) - db_path <- paths$db_path + paths <- .buildDBpath(base_dir = base_dir, user_bacs = user_bacs) + db_path <- paths$db_path con <- DBI::dbConnect(duckdb::duckdb(), dbdir = db_path) - on.exit({ NULL }, add = TRUE) # keep open for caller + on.exit( + { + NULL + }, + add = TRUE + ) # keep open for caller # The convenient "Metadata Exists" path if ("metadata" %in% DBI::dbListTables(con)) { @@ -1251,8 +1274,8 @@ retrieveMetadata <- function(user_bacs, md <- md |> dplyr::filter(`genome.genome_quality` == "Good") |> - dplyr::filter(`genome_drug.resistant_phenotype` %in% c("Resistant","Susceptible","Intermediate")) |> - dplyr::distinct(`genome.genome_id`) # keep BV-BRC column name + dplyr::filter(`genome_drug.resistant_phenotype` %in% c("Resistant", "Susceptible", "Intermediate")) |> + dplyr::distinct(`genome.genome_id`) # keep BV-BRC column name DBI::dbWriteTable(con, "filtered", md, overwrite = TRUE) if (isTRUE(verbose)) { @@ -1283,12 +1306,12 @@ retrieveMetadata <- function(user_bacs, stop("Table 'bvbrc_bac_data' not found in BV-BRC cache: ", cache_db) } - bv <- tibble::as_tibble(DBI::dbReadTable(con_cache, "bvbrc_bac_data")) + bv <- tibble::as_tibble(DBI::dbReadTable(con_cache, "bvbrc_bac_data")) sel <- tibble::tibble(`genome.genome_id` = character()) for (v in user_bacs) { if (.id_checker(v)) { - v_chr <- as.character(v) + v_chr <- as.character(v) matches <- bv[bv$`genome.taxon_id` == v_chr, , drop = FALSE] } else { matches <- bv[stringr::str_detect( @@ -1297,8 +1320,10 @@ retrieveMetadata <- function(user_bacs, ), , drop = FALSE] } if (nrow(matches)) { - sel <- dplyr::bind_rows(sel, - tibble::tibble(`genome.genome_id` = as.character(matches$`genome.genome_id`))) + sel <- dplyr::bind_rows( + sel, + tibble::tibble(`genome.genome_id` = as.character(matches$`genome.genome_id`)) + ) } } sel <- dplyr::distinct(sel) @@ -1361,11 +1386,11 @@ retrieveMetadata <- function(user_bacs, #' @keywords internal .pick_shell <- 
function(image) { chk <- suppressWarnings(system2("docker", - c( - "run", "--rm", image, "sh", "-lc", - "command -v bash >/dev/null || echo NOBASH" - ), - stdout = TRUE, stderr = TRUE + c( + "run", "--rm", image, "sh", "-lc", + "command -v bash >/dev/null || echo NOBASH" + ), + stdout = TRUE, stderr = TRUE )) if (length(chk) && any(grepl("NOBASH", chk))) "sh" else "bash" } @@ -1511,7 +1536,7 @@ retrieveGenomes <- function(base_dir = ".", cli_fasta_workers = 4L, cli_gff_workers = 4L, chunk_size = 50L, - evidence_mode = c("lab_only","lab_or_comp","comp_only","any"), # NEW + evidence_mode = c("lab_only", "lab_or_comp", "comp_only", "any"), # NEW verbose = TRUE) { method <- match.arg(method) evidence_mode <- match.arg(evidence_mode) @@ -1525,13 +1550,16 @@ retrieveGenomes <- function(base_dir = ".", has_filtered <- "filtered" %in% DBI::dbListTables(con0) if (has_filtered) { if (isTRUE(verbose)) message("Using existing 'filtered' table (skipping re-filter).") - con <- con0; tbl <- "filtered" + con <- con0 + tbl <- "filtered" } else { if (isTRUE(verbose)) message("No 'filtered' table found; filtering now.") - f_out <- .filterGenomes(base_dir = base_dir, - user_bacs = user_bacs, - evidence_mode = evidence_mode, - verbose = verbose) + f_out <- .filterGenomes( + base_dir = base_dir, + user_bacs = user_bacs, + evidence_mode = evidence_mode, + verbose = verbose + ) con <- f_out$duckdbConnection tbl <- f_out$table_name } @@ -1584,7 +1612,7 @@ retrieveGenomes <- function(base_dir = ".", if (isTRUE(verbose)) { message( "CLI mode: downloading genomes in parallelized chunks. 
", - "Chunks=", ceiling(length(ids)/chunk_size), + "Chunks=", ceiling(length(ids) / chunk_size), " | fasta_workers=", cli_fasta_workers, " | gff_workers=", cli_gff_workers ) @@ -1665,7 +1693,7 @@ genomeList <- function(base_dir = ".", faa_path = if (file.exists(faa_path) && file.info(faa_path)$size > 100) faa_path else NA, panaroo_input = if ( file.exists(gff_path) && file.exists(fna_path) && file.exists(faa_path) && - file.info(gff_path)$size > 100 && file.info(fna_path)$size > 100 && file.info(faa_path)$size > 100 + file.info(gff_path)$size > 100 && file.info(fna_path)$size > 100 && file.info(faa_path)$size > 100 ) { paste(gff_path, fna_path) } else { @@ -1731,7 +1759,7 @@ prepareGenomes <- function(user_bacs, base_dir = ".", method = c("ftp", "cli"), overwrite = FALSE, - evidence_mode = c("lab_only","lab_or_comp","comp_only","any"), + evidence_mode = c("lab_only", "lab_or_comp", "comp_only", "any"), verbose = TRUE) { method <- match.arg(method) evidence_mode <- match.arg(evidence_mode) @@ -1751,10 +1779,10 @@ prepareGenomes <- function(user_bacs, if (isTRUE(verbose)) message("Step 1: Filtering genomes for download by evidence: ", evidence_mode) f_out <- .filterGenomes( - base_dir = base_dir, - user_bacs = user_bacs, + base_dir = base_dir, + user_bacs = user_bacs, evidence_mode = evidence_mode, - verbose = verbose, + verbose = verbose, fallback_to_bvbrc_cache = FALSE ) if (is.null(f_out)) { @@ -1764,14 +1792,16 @@ prepareGenomes <- function(user_bacs, # A little summary of what's left after filtering (or not) paths <- .buildDBpath(base_dir = base_dir, user_bacs = user_bacs) - con <- DBI::dbConnect(duckdb::duckdb(), dbdir = paths$db_path, read_only = TRUE) + con <- DBI::dbConnect(duckdb::duckdb(), dbdir = paths$db_path, read_only = TRUE) n_filtered <- DBI::dbGetQuery(con, 'SELECT COUNT(DISTINCT "genome.genome_id") AS n FROM filtered')$n - n_meta <- if ("genome_data" %in% DBI::dbListTables(con)) DBI::dbGetQuery(con, 'SELECT COUNT(DISTINCT "genome.genome_id") AS n 
FROM genome_data')$n else NA - n_amr <- if ("amr_phenotype" %in% DBI::dbListTables(con)) DBI::dbGetQuery(con, 'SELECT COUNT(DISTINCT "genome_drug.genome_id") AS n FROM amr_phenotype')$n else NA + n_meta <- if ("genome_data" %in% DBI::dbListTables(con)) DBI::dbGetQuery(con, 'SELECT COUNT(DISTINCT "genome.genome_id") AS n FROM genome_data')$n else NA + n_amr <- if ("amr_phenotype" %in% DBI::dbListTables(con)) DBI::dbGetQuery(con, 'SELECT COUNT(DISTINCT "genome_drug.genome_id") AS n FROM amr_phenotype')$n else NA DBI::dbDisconnect(con, shutdown = TRUE) if (isTRUE(verbose)) { - message(sprintf("Evidence filter summary: filtered=%d | genomes with AMR=%s | genomes with genome_data=%s", - n_filtered, ifelse(is.na(n_amr), "NA", n_amr), ifelse(is.na(n_meta), "NA", n_meta))) + message(sprintf( + "Evidence filter summary: filtered=%d | genomes with AMR=%s | genomes with genome_data=%s", + n_filtered, ifelse(is.na(n_amr), "NA", n_amr), ifelse(is.na(n_meta), "NA", n_meta) + )) } if (isTRUE(verbose)) message("Step 2: Downloading genomes from BV-BRC (", method, ")")