From 95243c4c714d0b70cde906529860184e6e2cdc96 Mon Sep 17 00:00:00 2001 From: Abhirupa Ghosh <100681585+AbhirupaGhosh@users.noreply.github.com> Date: Thu, 19 Feb 2026 08:54:16 -0700 Subject: [PATCH 1/4] Split cleanData and add Parquet exports Rename the original cleanData to cleanMetaData and add roxygen skeleton. Introduce a writeCompressedParquet helper and export cleaned metadata, AMR phenotype, genome data and original metadata to compressed Parquet files, then create a separate DuckDB (parquet-backed) with views for metadata, amr_phenotype, genome_data and original_metadata. Reintroduce a new cleanData function focused on feature matrices (genes/proteins/domains/etc.) that writes feature tables to Parquet and creates corresponding views; remove duplicated metadata parquet exports from the feature-matrix flow. Minor whitespace and path-handling adjustments to normalize paths and ensure output directories exist. --- R/data_processing.R | 112 +++++++++++++++++++++++++++++++++----------- 1 file changed, 85 insertions(+), 27 deletions(-) diff --git a/R/data_processing.R b/R/data_processing.R index 78ab404..c417e52 100644 --- a/R/data_processing.R +++ b/R/data_processing.R @@ -1223,7 +1223,16 @@ domainFromIPR <- function(duckdb_path, } # Clean BV-BRC metadata, then save as Parquet files -cleanData <- function(duckdb_path, path, ref_file_path = "data_raw/"){ +#' +#' @param duckdb_path +#' @param path +#' @param ref_file_path +#' +#' @returns +#' @export +#' +#' @examples +cleanMetaData <- function(duckdb_path, path, ref_file_path = "data_raw/"){ duckdb_path <- normalizePath(duckdb_path) # If no explicit path is provided (or a generic one), choose results// when # the DuckDB lives under data//, or else fall back to the DuckDB directory. 
@@ -1237,14 +1246,14 @@ cleanData <- function(duckdb_path, path, ref_file_path = "data_raw/"){ ) path <- if (!identical(mapped_results, bug_dir)) mapped_results else bug_dir } - + path <- normalizePath(path, mustWork = FALSE) if (!dir.exists(path)) dir.create(path, recursive = TRUE) - + con <- DBI::dbConnect(duckdb::duckdb(), duckdb_path) on.exit(try(DBI::dbDisconnect(con, shutdown = FALSE), silent = TRUE), add = TRUE) ref_file_path <- normalizePath(ref_file_path) - + clean_drug <- readr::read_tsv(file.path(ref_file_path, "clean_drug.tsv")) drug_class <- readr::read_tsv(file.path(ref_file_path, "drug_class.tsv")) drug_abbr <- readr::read_tsv(file.path(ref_file_path, "drug_abbr.tsv")) @@ -1252,7 +1261,7 @@ cleanData <- function(duckdb_path, path, ref_file_path = "data_raw/"){ clean_countries <- readr::read_tsv(file.path(ref_file_path, "cleaned_bvbrc_countries.tsv")) |> dplyr::select("raw_entry", "clean_name", "short_name")|> dplyr::distinct() - + dplyr::tbl(con, "filtered") |> tibble::as_tibble() |> dplyr::select("genome_drug.genome_id", "genome_drug.antibiotic", @@ -1267,7 +1276,7 @@ cleanData <- function(duckdb_path, path, ref_file_path = "data_raw/"){ dplyr::left_join(drug_abbr, by = c("cleaned_drug" = "drug")) |> dplyr::left_join(class_abbr, by = "drug_class") |> DBI::dbWriteTable(conn = con, name = "filtered", overwrite = TRUE) - + resistance_summary <- dplyr::tbl(con, "filtered") |> tibble::as_tibble() |> dplyr::filter(genome_drug.resistant_phenotype == "Resistant") |> @@ -1276,8 +1285,9 @@ cleanData <- function(duckdb_path, path, ref_file_path = "data_raw/"){ num_resistant_classes = dplyr::n_distinct(drug_class), resistant_classes = paste(unique(class_abbr), collapse = "_") ) - + year_breaks <- seq(1980, 2023, by = 5) + dplyr::tbl(con, "filtered") |> tibble::as_tibble() |> dplyr::mutate(genome_drug.antibiotic = cleaned_drug) |> @@ -1301,6 +1311,74 @@ cleanData <- function(duckdb_path, path, ref_file_path = "data_raw/"){ labels = 
paste(year_breaks[-length(year_breaks)], year_breaks[-1] - 1, sep = "-"))) |> DBI::dbWriteTable(conn = con, name = "cleaned_metadata", overwrite = TRUE) + + # Parquet output path + metadata_parquet <- file.path(path, "metadata.parquet") # cleaned_metadata exported as 'metadata' + + # Also export AMR/genome/original metadata + amr_phenotype_parquet <- file.path(path, "amr_phenotype.parquet") + genome_data_parquet <- file.path(path, "genome_data.parquet") + original_metadata_parquet <- file.path(path, "original_metadata.parquet") + + writeCompressedParquet <- function(df, path) { + arrow::write_parquet( + df, + path, + compression = "zstd", + compression_level = 9, + use_dictionary = TRUE + ) + } + + db_name <- duckdb_path |> stringr::str_split_i(".duckdb", i = 1) |> paste0("_parquet.duckdb") + con_new <- DBI::dbConnect(duckdb::duckdb(), db_name) + on.exit(try(DBI::dbDisconnect(con_new, shutdown = FALSE), silent = TRUE), add = TRUE) + + # cleaned_metadata -> parquet + view (as metadata) + DBI::dbReadTable(con, "cleaned_metadata") |> writeCompressedParquet(metadata_parquet) + DBI::dbExecute(con_new, sprintf("CREATE OR REPLACE VIEW metadata AS SELECT * FROM read_parquet('%s')", metadata_parquet)) + + # debug/complete views: amr_phenotype, genome_data, original_metadata + DBI::dbReadTable(con, "amr_phenotype") |> writeCompressedParquet(amr_phenotype_parquet) + DBI::dbReadTable(con, "genome_data") |> writeCompressedParquet(genome_data_parquet) + DBI::dbReadTable(con, "metadata") |> writeCompressedParquet(original_metadata_parquet) + + DBI::dbExecute(con_new, sprintf("CREATE OR REPLACE VIEW amr_phenotype AS SELECT * FROM read_parquet('%s')", amr_phenotype_parquet)) + DBI::dbExecute(con_new, sprintf("CREATE OR REPLACE VIEW genome_data AS SELECT * FROM read_parquet('%s')", genome_data_parquet)) + DBI::dbExecute(con_new, sprintf("CREATE OR REPLACE VIEW original_metadata AS SELECT * FROM read_parquet('%s')", original_metadata_parquet)) + + invisible(TRUE) +} + +# Clean 
feature matrices, then save as Parquet files +#' +#' @param duckdb_path +#' @param path +#' +#' @returns +#' @export +#' +#' @examples +cleanData <- function(duckdb_path, path){ + duckdb_path <- normalizePath(duckdb_path) + # If no explicit path is provided (or a generic one), choose results// when + # the DuckDB lives under data//, or else fall back to the DuckDB directory. + if (missing(path) || path %in% c(".", "results", "results/")) { + bug_dir <- dirname(duckdb_path) + mapped_results <- sub( + paste0(.Platform$file.sep, "data", .Platform$file.sep), + paste0(.Platform$file.sep, "results", .Platform$file.sep), + bug_dir, + fixed = TRUE + ) + path <- if (!identical(mapped_results, bug_dir)) mapped_results else bug_dir + } + + path <- normalizePath(path, mustWork = FALSE) + if (!dir.exists(path)) dir.create(path, recursive = TRUE) + + con <- DBI::dbConnect(duckdb::duckdb(), duckdb_path) + on.exit(try(DBI::dbDisconnect(con, shutdown = FALSE), silent = TRUE), add = TRUE) # Parquet output paths genes_parquet <- file.path(path, "gene_count.parquet") @@ -1312,18 +1390,11 @@ cleanData <- function(duckdb_path, path, ref_file_path = "data_raw/"){ proteins_parquet <- file.path(path, "protein_count.parquet") domains_parquet <- file.path(path, "domain_count.parquet") - metadata_parquet <- file.path(path, "metadata.parquet") # cleaned_metadata exported as 'metadata' - domain_names_parquet <- file.path(path, "domain_names.parquet") protein_names_parquet <- file.path(path, "protein_names.parquet") protein_cluster_seq_parquet <- file.path(path, "protein_seqs.parquet") - # Also export AMR/genome/original metadata - amr_phenotype_parquet <- file.path(path, "amr_phenotype.parquet") - genome_data_parquet <- file.path(path, "genome_data.parquet") - original_metadata_parquet <- file.path(path, "original_metadata.parquet") - writeCompressedParquet <- function(df, path) { arrow::write_parquet( df, @@ -1370,10 +1441,6 @@ cleanData <- function(duckdb_path, path, ref_file_path = 
"data_raw/"){ writeCompressedParquet(struct_parquet) DBI::dbExecute(con_new, sprintf("CREATE OR REPLACE VIEW struct AS SELECT * FROM read_parquet('%s')", struct_parquet)) - # cleaned_metadata -> parquet + view (as metadata) - DBI::dbReadTable(con, "cleaned_metadata") |> writeCompressedParquet(metadata_parquet) - DBI::dbExecute(con_new, sprintf("CREATE OR REPLACE VIEW metadata AS SELECT * FROM read_parquet('%s')", metadata_parquet)) - # names/seq tables -> parquet + views DBI::dbReadTable(con, "gene_names") |> writeCompressedParquet(gene_names_parquet) DBI::dbExecute(con_new, sprintf("CREATE OR REPLACE VIEW gene_names AS SELECT * FROM read_parquet('%s')", gene_names_parquet)) @@ -1397,15 +1464,6 @@ cleanData <- function(duckdb_path, path, ref_file_path = "data_raw/"){ DBI::dbReadTable(con, "genome_gene_protein") |> writeCompressedParquet(genome_gene_protein_parquet) DBI::dbExecute(con_new, sprintf("CREATE OR REPLACE VIEW genome_gene_protein AS SELECT * FROM read_parquet('%s')", genome_gene_protein_parquet)) - # debug/complete views: amr_phenotype, genome_data, original_metadata - DBI::dbReadTable(con, "amr_phenotype") |> writeCompressedParquet(amr_phenotype_parquet) - DBI::dbReadTable(con, "genome_data") |> writeCompressedParquet(genome_data_parquet) - DBI::dbReadTable(con, "metadata") |> writeCompressedParquet(original_metadata_parquet) - - DBI::dbExecute(con_new, sprintf("CREATE OR REPLACE VIEW amr_phenotype AS SELECT * FROM read_parquet('%s')", amr_phenotype_parquet)) - DBI::dbExecute(con_new, sprintf("CREATE OR REPLACE VIEW genome_data AS SELECT * FROM read_parquet('%s')", genome_data_parquet)) - DBI::dbExecute(con_new, sprintf("CREATE OR REPLACE VIEW original_metadata AS SELECT * FROM read_parquet('%s')", original_metadata_parquet)) - invisible(TRUE) } From c52aaa78a50e4ae956aaa541b6c988554c18f506 Mon Sep 17 00:00:00 2001 From: AbhirupaGhosh Date: Thu, 19 Feb 2026 15:56:27 +0000 Subject: [PATCH 2/4] Style code (GHA) --- R/data_curation.R | 428 
++++++++++++++++++++++++++------------------ R/data_processing.R | 418 +++++++++++++++++++++++------------------- vignettes/intro.Rmd | 123 +++++++------ 3 files changed, 549 insertions(+), 420 deletions(-) diff --git a/R/data_curation.R b/R/data_curation.R index b1f1af5..0d19113 100644 --- a/R/data_curation.R +++ b/R/data_curation.R @@ -45,8 +45,10 @@ } # Parse - df <- utils::read.table(text = raw_data, sep = "\t", header = TRUE, fill = TRUE, - quote = "", check.names = FALSE, comment.char = "") + df <- utils::read.table( + text = raw_data, sep = "\t", header = TRUE, fill = TRUE, + quote = "", check.names = FALSE, comment.char = "" + ) df <- tibble::as_tibble(df) |> dplyr::mutate(dplyr::across(dplyr::everything(), ~ iconv(.x, from = "", to = "UTF-8", sub = ""))) @@ -61,7 +63,7 @@ # Make sure the BV-BRC metadata live where they're supposed to .ensure_bvbrc_cache <- function(base_dir = ".", verbose = TRUE, - cache_rel = file.path("data", "bvbrc", "bvbrcData.duckdb"), + cache_rel = file.path("data", "bvbrc", "bvbrcData.duckdb"), cache_table = "bvbrc_bac_data") { base_dir <- normalizePath(base_dir, mustWork = FALSE) cache_db <- file.path(base_dir, cache_rel) @@ -115,12 +117,12 @@ base_dir <- normalizePath(base_dir, mustWork = FALSE) data_dir <- file.path(base_dir, "data") bvbrc_dir <- file.path(data_dir, "bvbrc") - logs_dir <- file.path(data_dir, "logs") + logs_dir <- file.path(data_dir, "logs") dir.create(bvbrc_dir, recursive = TRUE, showWarnings = FALSE) - dir.create(logs_dir, recursive = TRUE, showWarnings = FALSE) + dir.create(logs_dir, recursive = TRUE, showWarnings = FALSE) - db_path <- file.path(bvbrc_dir, "bvbrcData.duckdb") + db_path <- file.path(bvbrc_dir, "bvbrcData.duckdb") table_name <- "bvbrc_bac_data" meta_table <- "__meta" @@ -128,10 +130,10 @@ DBI::dbExecute( con, - glue::glue('CREATE TABLE IF NOT EXISTS {meta_table} ( + glue::glue("CREATE TABLE IF NOT EXISTS {meta_table} ( table_name TEXT PRIMARY KEY, last_updated TIMESTAMP - )') + )") ) # Tiny 
update check helper @@ -139,12 +141,14 @@ res <- tryCatch( DBI::dbGetQuery( con, - glue::glue('SELECT last_updated FROM {meta_table} - WHERE table_name = {DBI::dbQuoteString(con, table_name)}') + glue::glue("SELECT last_updated FROM {meta_table} + WHERE table_name = {DBI::dbQuoteString(con, table_name)}") ), error = function(e) NULL ) - if (is.null(res) || nrow(res) == 0L) return(NA) + if (is.null(res) || nrow(res) == 0L) { + return(NA) + } as.POSIXct(res$last_updated[[1]], origin = "1970-01-01", tz = "UTC") } @@ -268,7 +272,6 @@ } - #' Resolve `query value` from `user_bacs` for [.getGenomeIDs()] #' #' If query_value is NULL, derive it from user_bacs based on query_type. @@ -277,7 +280,9 @@ #' If nothing suitable is found, throw a fit and an error. #' @keywords internal .resolveQueryValue <- function(query_type, query_value, user_bacs) { - if (!is.null(query_value) && nzchar(query_value)) return(query_value) + if (!is.null(query_value) && nzchar(query_value)) { + return(query_value) + } if (missing(user_bacs) || length(user_bacs) == 0) { stop("Provide query_value or user_bacs for the selected query_type.") } @@ -359,7 +364,7 @@ stringr::str_replace_all("\\s+", "_") |> stringr::str_replace_all("[^A-Za-z0-9._-]", "") - db_dir <- file.path(data_dir, bug_dirname) + db_dir <- file.path(data_dir, bug_dirname) dir.create(db_dir, recursive = TRUE, showWarnings = FALSE) db_file <- paste0(.generateDBname(user_bacs), ".duckdb") @@ -397,14 +402,13 @@ overwrite = FALSE, image = "danylmb/bvbrc:5.3", verbose = TRUE) { - - query_type <- match.arg(query_type) + query_type <- match.arg(query_type) query_value <- .resolveQueryValue(query_type, query_value, user_bacs) - + if (isTRUE(verbose)) { message("Querying BV-BRC: ", query_type, " == ", query_value) } - + # Count count_cmd <- paste0( "docker run --rm ", image, @@ -414,10 +418,10 @@ " --in genome_status,WGS,Complete", " --count" ) - count_lines <- tryCatch(system(count_cmd, intern = TRUE), error = function(e) character()) + 
count_lines <- tryCatch(system(count_cmd, intern = TRUE), error = function(e) character()) count_result <- suppressWarnings(as.integer(if (length(count_lines) >= 2) count_lines[2] else NA_integer_)) if (isTRUE(verbose) && !is.na(count_result)) message("Count returned: ", count_result) - + # Details data_cmd <- paste0( "docker run --rm ", image, @@ -429,7 +433,7 @@ ) data_raw <- tryCatch(system(data_cmd, intern = TRUE), error = function(e) character()) if (length(data_raw) == 0L) stop("BV-BRC returned no data for: ", query_type, " = ", query_value) - + data_result <- tibble::as_tibble( utils::read.table( text = data_raw, sep = "\t", header = TRUE, fill = TRUE, @@ -443,16 +447,16 @@ `genome.species` = as.character(`genome.species`), `genome.strain` = as.character(`genome.strain`) ) - + # Per-bug DB path - paths <- .buildDBpath(base_dir = base_dir, user_bacs = user_bacs, overwrite = overwrite) + paths <- .buildDBpath(base_dir = base_dir, user_bacs = user_bacs, overwrite = overwrite) db_path <- paths$db_path - + con <- DBI::dbConnect(duckdb::duckdb(), dbdir = db_path) DBI::dbWriteTable(con, "bac_data", data_result, overwrite = TRUE) - + if (isTRUE(verbose)) message("Wrote table 'bac_data' to: ", db_path) - + list(count_result = count_result, duckdbConnection = con, table_name = "bac_data") } @@ -474,28 +478,28 @@ overwrite = FALSE, verbose = TRUE) { base_dir <- normalizePath(base_dir, mustWork = FALSE) - + if (isTRUE(verbose)) message("Resolving input taxa.") bac_input_data <- .retrieveCustomQuery(base_dir = base_dir, user_bacs = user_bacs) - + if (is.null(bac_input_data) || nrow(bac_input_data) == 0) { message("No valid input provided or no matches found.") return(NULL) } - + # Resolve per-bug DB path - paths <- .buildDBpath(base_dir = base_dir, user_bacs = user_bacs, overwrite = overwrite) + paths <- .buildDBpath(base_dir = base_dir, user_bacs = user_bacs, overwrite = overwrite) db_path <- paths$db_path - - bac_data <- tibble::tibble() + + bac_data <- 
tibble::tibble() taxon_ids <- unique(bac_input_data$genome.taxon_id) - + if (isTRUE(verbose)) message("Querying BV-BRC for ", length(taxon_ids), " taxon IDs.") - + for (i in seq_along(taxon_ids)) { tax <- taxon_ids[i] if (isTRUE(verbose)) message("Taxon ", i, "/", length(taxon_ids), ": ", tax) - + res <- .getGenomeIDs( base_dir = base_dir, query_type = "taxon_id", @@ -504,22 +508,22 @@ overwrite = TRUE, verbose = verbose ) - + con <- res$duckdbConnection tbl <- res$table_name each_bac_data <- tibble::as_tibble(DBI::dbReadTable(con, tbl)) bac_data <- dplyr::bind_rows(bac_data, each_bac_data) - + # Close per-iteration connection to avoid open handles piling up try(DBI::dbDisconnect(con, shutdown = TRUE), silent = TRUE) } - + if (nrow(bac_data) > 0) { genome_ids <- bac_data |> dplyr::distinct(`genome.genome_id`) |> dplyr::pull(`genome.genome_id`) genome_ids <- genome_ids[!is.na(genome_ids)] - + if (length(genome_ids) > 0) { if (isTRUE(verbose)) message("Collected ", length(genome_ids), " distinct genome IDs.") return(genome_ids) @@ -578,18 +582,18 @@ verbose = TRUE) { base_dir <- normalizePath(base_dir, mustWork = FALSE) data_dir <- file.path(base_dir, "data") - tmp_dir <- file.path(data_dir, "tmp") + tmp_dir <- file.path(data_dir, "tmp") dir.create(tmp_dir, recursive = TRUE, showWarnings = FALSE) - + if (isTRUE(verbose)) { message("Preparing AMR query input for ", length(batch_genome_IDs), " genomes.") } - + docker_path <- Sys.which("docker") if (!nzchar(docker_path)) { stop("Docker is not available on your PATH but is required.") } - + # Generate genome list with p3-echo echo_args <- c( "run", "--rm", @@ -599,12 +603,12 @@ genome_ids_output <- suppressWarnings( system2("docker", args = echo_args, stdout = TRUE, stderr = TRUE) ) - + # Write a temporary file in data/tmp/ tmp_in <- tempfile(tmpdir = tmp_dir, pattern = "genome_drug_ids_", fileext = ".tsv") writeLines(genome_ids_output, con = tmp_in) tmp_in_mounted <- file.path("/data", "tmp", basename(tmp_in)) - + # 
Allow abx_filter to be a single string with spaces OR a vector of args abx_args <- if (length(abx_filter) == 1L) { @@ -612,7 +616,7 @@ } else { abx_filter } - + # Query drug data drug_args <- c( "run", "--rm", @@ -622,15 +626,15 @@ abx_args, "--attr", drug_fields ) - + if (isTRUE(verbose)) { message("Running AMR query.") } drug_data <- suppressWarnings(system2("docker", args = drug_args, stdout = TRUE, stderr = TRUE)) - + # Clean up after yourself try(unlink(tmp_in), silent = TRUE) - + return(drug_data) } @@ -655,18 +659,18 @@ verbose = TRUE) { base_dir <- normalizePath(base_dir, mustWork = FALSE) data_dir <- file.path(base_dir, "data") - tmp_dir <- file.path(data_dir, "tmp") + tmp_dir <- file.path(data_dir, "tmp") dir.create(tmp_dir, recursive = TRUE, showWarnings = FALSE) - + if (isTRUE(verbose)) { message("Preparing genome metadata input for ", length(batch_genome_IDs), " genomes.") } - + docker_path <- Sys.which("docker") if (!nzchar(docker_path)) { stop("Docker is not available on your PATH but is required.") } - + # Generate genome list with p3-echo echo_args <- c( "run", "--rm", @@ -676,14 +680,14 @@ genome_ids_output <- suppressWarnings( system2("docker", args = echo_args, stdout = TRUE, stderr = TRUE) ) - + tmp_in <- tempfile(tmpdir = tmp_dir, pattern = "genome_ids_", fileext = ".tsv") writeLines(genome_ids_output, con = tmp_in) tmp_in_mounted <- file.path("/data", "tmp", basename(tmp_in)) - + # Choose attributes (AMR for this pipeline) chosen_fields <- if (identical(filter_type, "AMR")) amr_fields else microtrait_fields - + get_args <- c( "run", "--rm", "-v", paste0(data_dir, ":/data"), @@ -691,15 +695,15 @@ "--input", tmp_in_mounted, "--attr", chosen_fields ) - + if (isTRUE(verbose)) { message("Running genome metadata query.") } genome_data <- suppressWarnings(system2("docker", args = get_args, stdout = TRUE, stderr = TRUE)) - + # Cleaning up try(unlink(tmp_in), silent = TRUE) - + return(genome_data) } @@ -735,8 +739,10 @@ retrieveMetadata <- 
function(user_bacs, base_dir <- normalizePath(base_dir, mustWork = FALSE) if (isTRUE(verbose)) message("Resolving genome IDs for user inputs.") - genome_ids <- .retrieveQueryIDs(base_dir = base_dir, user_bacs = user_bacs, - overwrite = overwrite, verbose = verbose) + genome_ids <- .retrieveQueryIDs( + base_dir = base_dir, user_bacs = user_bacs, + overwrite = overwrite, verbose = verbose + ) if (length(genome_ids) == 0) { message("No genome IDs available for the specified inputs.") return(NULL) @@ -753,8 +759,11 @@ retrieveMetadata <- function(user_bacs, "pmid,resistant_phenotype,", "source,taxon_id,testing_standard" ) - abx_filter <- if (identical(abx, "All")) "--required antibiotic" - else paste0("--in antibiotic,", paste(abx, collapse = ",")) + abx_filter <- if (identical(abx, "All")) { + "--required antibiotic" + } else { + paste0("--in antibiotic,", paste(abx, collapse = ",")) + } amr_fields <- paste0( "assembly_accession,assembly_method,", "bioproject_accession,biosample_accession,", @@ -808,7 +817,10 @@ retrieveMetadata <- function(user_bacs, ), envir = environment() ) - parallel::clusterEvalQ(cluster, { library(tibble); library(dplyr) }) + parallel::clusterEvalQ(cluster, { + library(tibble) + library(dplyr) + }) if (isTRUE(verbose)) message("Retrieving AMR phenotype data in batches.") batch_drug_data <- parallel::parLapply(cluster, genome_batches, function(batch) { @@ -822,7 +834,10 @@ retrieveMetadata <- function(user_bacs, ) }) combined_drug_data <- unlist(batch_drug_data, use.names = FALSE) - if (length(combined_drug_data) == 0) { message("No drug data returned."); return(NULL) } + if (length(combined_drug_data) == 0) { + message("No drug data returned.") + return(NULL) + } combined_drug_data_tbl <- tibble::as_tibble( utils::read.table( @@ -847,7 +862,10 @@ retrieveMetadata <- function(user_bacs, ) }) combined_genome_data <- unlist(batch_genome_data, use.names = FALSE) - if (length(combined_genome_data) == 0) { message("No genome data returned."); 
return(NULL) } + if (length(combined_genome_data) == 0) { + message("No genome data returned.") + return(NULL) + } combined_genome_data_tbl <- tibble::as_tibble( utils::read.table( @@ -860,13 +878,14 @@ retrieveMetadata <- function(user_bacs, dplyr::mutate(`genome.genome_id` = as.character(`genome.genome_id`)) # Per-bug DB path - paths <- .buildDBpath(base_dir = base_dir, user_bacs = user_bacs, overwrite = overwrite) + paths <- .buildDBpath(base_dir = base_dir, user_bacs = user_bacs, overwrite = overwrite) db_path <- paths$db_path logs_dir <- file.path(base_dir, "data", "logs") dir.create(logs_dir, recursive = TRUE, showWarnings = FALSE) cat(sprintf("[%s] Writing metadata DuckDB: %s\n", Sys.time(), db_path), - file = file.path(logs_dir, "bvbrc.log"), append = TRUE) + file = file.path(logs_dir, "bvbrc.log"), append = TRUE + ) con <- DBI::dbConnect(duckdb::duckdb(), dbdir = db_path) DBI::dbWriteTable(con, "amr_phenotype", combined_drug_data_tbl, overwrite = TRUE) @@ -892,10 +911,14 @@ retrieveMetadata <- function(user_bacs, # FASTA sanitizer to ensure Panaroo compatibility with BV-BRC CLI downloads .strip_fasta_preamble <- function(fna_path) { - if (!file.exists(fna_path)) return(invisible(FALSE)) + if (!file.exists(fna_path)) { + return(invisible(FALSE)) + } txt <- readLines(fna_path, warn = FALSE) first <- which(grepl("^\\s*>", txt))[1] - if (is.na(first)) return(invisible(FALSE)) + if (is.na(first)) { + return(invisible(FALSE)) + } if (first > 1L) { txt <- txt[first:length(txt)] txt[1] <- sub("^\\ufeff", "", txt[1]) @@ -907,14 +930,20 @@ retrieveMetadata <- function(user_bacs, # GFF sanitizer to ensure Panaroo compatibility with BV-BRC CLI downloads .sanitize_gff <- function(gff_path) { - if (!file.exists(gff_path)) return(invisible(FALSE)) + if (!file.exists(gff_path)) { + return(invisible(FALSE)) + } lines <- readLines(gff_path, warn = FALSE) - if (length(lines) == 0L) return(invisible(FALSE)) + if (length(lines) == 0L) { + return(invisible(FALSE)) + } if 
(!grepl("^##gff-version\\s*3", lines[1])) { lines <- c("##gff-version 3", lines) } out <- vapply(lines, function(line) { - if (grepl("^#", line)) return(line) + if (grepl("^#", line)) { + return(line) + } parts <- strsplit(line, "[\t ]", perl = TRUE)[[1]] if (length(parts) >= 9) { paste(c(parts[1:8], paste(parts[9:length(parts)], collapse = " ")), collapse = "\t") @@ -943,16 +972,19 @@ retrieveMetadata <- function(user_bacs, verbose = TRUE, fallback_to_bvbrc_cache = TRUE) { base_dir <- normalizePath(base_dir, mustWork = FALSE) - paths <- .buildDBpath(base_dir = base_dir, user_bacs = user_bacs) - db_path <- paths$db_path - + paths <- .buildDBpath(base_dir = base_dir, user_bacs = user_bacs) + db_path <- paths$db_path + con <- DBI::dbConnect(duckdb::duckdb(), dbdir = db_path) - - on.exit({ - # Keep connection open for caller - NULL - }, add = TRUE) - + + on.exit( + { + # Keep connection open for caller + NULL + }, + add = TRUE + ) + # Good metadata present? Apply AMR filters if ("metadata" %in% DBI::dbListTables(con)) { if (isTRUE(verbose)) message("Loading metadata for filtering.") @@ -962,7 +994,7 @@ retrieveMetadata <- function(user_bacs, message("No data available in 'metadata'.") return(NULL) } - + # Normalize evidence labels initial_metadata <- tibble::as_tibble(initial_metadata) |> dplyr::mutate( @@ -973,13 +1005,13 @@ retrieveMetadata <- function(user_bacs, TRUE ~ `genome_drug.evidence` ) ) - + # AMR and quality filtering filtered_metadata <- initial_metadata |> dplyr::filter(`genome_drug.evidence` == "Laboratory Method") |> dplyr::filter(`genome.genome_quality` == "Good") |> dplyr::filter(`genome_drug.resistant_phenotype` %in% c("Resistant", "Susceptible", "Intermediate")) - + DBI::dbWriteTable(con, "filtered", filtered_metadata, overwrite = TRUE) if (isTRUE(verbose)) { message("Post-filter rows: ", nrow(filtered_metadata)) @@ -987,42 +1019,42 @@ retrieveMetadata <- function(user_bacs, } return(list(duckdbConnection = con, table_name = "filtered")) } - + # 
Attempt BV-BRC cache location if (!isTRUE(fallback_to_bvbrc_cache)) { DBI::dbDisconnect(con, shutdown = TRUE) stop("No 'metadata' table found in ", db_path, ". Run retrieveMetadata() first.") } - + if (isTRUE(verbose)) { message("No 'metadata' in per-selection DB. Falling back to BV-BRC cache at data/bvbrc/.") } - + cache_db <- file.path(base_dir, "data", "bvbrc", "bvbrcData.duckdb") if (!file.exists(cache_db)) { DBI::dbDisconnect(con, shutdown = TRUE) stop("BV-BRC cache not found at: ", cache_db, ". Run .updateBVBRCdata() first.") } - + con_cache <- DBI::dbConnect(duckdb::duckdb(), dbdir = cache_db) on.exit(try(DBI::dbDisconnect(con_cache, shutdown = TRUE), silent = TRUE), add = TRUE) - + if (!"bvbrc_bac_data" %in% DBI::dbListTables(con_cache)) { DBI::dbDisconnect(con, shutdown = TRUE) stop("Table 'bvbrc_bac_data' not found in BV-BRC cache: ", cache_db) } - + bv <- tibble::as_tibble(DBI::dbReadTable(con_cache, "bvbrc_bac_data")) - - + + # Derive matches from user_bacs (taxon IDs or species) sel <- tibble::tibble(`genome.genome_id` = character()) - + for (v in user_bacs) { if (suppressWarnings(!is.na(as.numeric(v)))) { # numeric taxon_id match matches <- bv[bv$`genome.taxon_id` == v | - bv$`genome.taxon_id` == as.numeric(v), , drop = FALSE] + bv$`genome.taxon_id` == as.numeric(v), , drop = FALSE] } else { # species substring (case-insensitive) matches <- bv[stringr::str_detect( @@ -1030,7 +1062,7 @@ retrieveMetadata <- function(user_bacs, stringr::fixed(v, ignore_case = TRUE) ), , drop = FALSE] } - + if (nrow(matches)) { sel <- dplyr::bind_rows( sel, @@ -1041,22 +1073,22 @@ retrieveMetadata <- function(user_bacs, } } sel <- dplyr::distinct(sel) - + if (nrow(sel) == 0L) { DBI::dbDisconnect(con, shutdown = TRUE) stop("No genomes matched user_bacs in BV-BRC cache.") } - + # Minimal 'filtered' for downstream steps (downloaders & genomeList) minimal_filtered <- sel # Ensure expected column name used downstream: # retrieveGenomes() reads "filtered" and expects 
`genome.genome_id` to exist. DBI::dbWriteTable(con, "filtered", minimal_filtered, overwrite = TRUE) - + if (isTRUE(verbose)) { message("Wrote table 'filtered' to: ", db_path) } - + list(duckdbConnection = con, table_name = "filtered") } @@ -1068,9 +1100,12 @@ retrieveMetadata <- function(user_bacs, # Prefer bash .pick_shell <- function(image) { chk <- suppressWarnings(system2("docker", - c("run", "--rm", image, "sh", "-lc", - "command -v bash >/dev/null || echo NOBASH"), - stdout = TRUE, stderr = TRUE)) + c( + "run", "--rm", image, "sh", "-lc", + "command -v bash >/dev/null || echo NOBASH" + ), + stdout = TRUE, stderr = TRUE + )) if (length(chk) && any(grepl("NOBASH", chk))) "sh" else "bash" } @@ -1117,21 +1152,28 @@ retrieveMetadata <- function(user_bacs, .ftp_download_one <- function(genomeID, out_dir, tries = 3L, min_bytes = 100) { files <- c(".fna", ".PATRIC.faa", ".PATRIC.gff") dests <- file.path(out_dir, paste0(genomeID, files)) - if (.is_complete_set(out_dir, genomeID, min_bytes)) return(TRUE) + if (.is_complete_set(out_dir, genomeID, min_bytes)) { + return(TRUE) + } get_one <- function(url, dest) { if (nzchar(Sys.which("wget"))) { res <- suppressWarnings(system2("wget", - c("-q", "-O", shQuote(dest), shQuote(url)), - stdout = TRUE, stderr = TRUE)) - st <- attr(res, "status"); if (is.null(st)) st <- 0L + c("-q", "-O", shQuote(dest), shQuote(url)), + stdout = TRUE, stderr = TRUE + )) + st <- attr(res, "status") + if (is.null(st)) st <- 0L return(st == 0L && file.exists(dest) && file.info(dest)$size > min_bytes) } else if (nzchar(Sys.which("curl"))) { - curl_args <- if (startsWith(url, "ftps://")) - c("--ssl-reqd", "-L", "-o", shQuote(dest), shQuote(url)) else - c("-L", "-o", shQuote(dest), shQuote(url)) + curl_args <- if (startsWith(url, "ftps://")) { + c("--ssl-reqd", "-L", "-o", shQuote(dest), shQuote(url)) + } else { + c("-L", "-o", shQuote(dest), shQuote(url)) + } res <- suppressWarnings(system2("curl", curl_args, stdout = TRUE, stderr = TRUE)) - st <- 
attr(res, "status"); if (is.null(st)) st <- 0L + st <- attr(res, "status") + if (is.null(st)) st <- 0L return(st == 0L && file.exists(dest) && file.info(dest)$size > min_bytes) } FALSE @@ -1140,13 +1182,27 @@ retrieveMetadata <- function(user_bacs, for (ext_i in seq_along(files)) { dest <- dests[ext_i] if (file.exists(dest) && file.info(dest)$size > min_bytes) next - ftps <- sprintf("ftps://ftp.bv-brc.org/genomes/%s/%s%s", genomeID, genomeID, files[ext_i]) + ftps <- sprintf("ftps://ftp.bv-brc.org/genomes/%s/%s%s", genomeID, genomeID, files[ext_i]) https <- sprintf("https://ftp.bv-brc.org/genomes/%s/%s%s", genomeID, genomeID, files[ext_i]) ok <- FALSE - for (k in 1:tries) { if (get_one(ftps, dest)) { ok <- TRUE; break } } - if (!ok) for (k in 1:2) { if (get_one(https, dest)) { ok <- TRUE; break } } - if (!ok) return(FALSE) + for (k in 1:tries) { + if (get_one(ftps, dest)) { + ok <- TRUE + break + } + } + if (!ok) { + for (k in 1:2) { + if (get_one(https, dest)) { + ok <- TRUE + break + } + } + } + if (!ok) { + return(FALSE) + } } .is_complete_set(out_dir, genomeID, min_bytes) } @@ -1155,7 +1211,7 @@ retrieveMetadata <- function(user_bacs, # p3-dump-genomes to fetch FASTA and .gto files .cli_dump_fastas_gto_chunk <- function(image, out_dir, genome_ids, tag, tries = 3L) { - out_dir <- normalizePath(out_dir, mustWork = FALSE) + out_dir <- normalizePath(out_dir, mustWork = FALSE) dir.create(out_dir, recursive = TRUE, showWarnings = FALSE) ids_file <- file.path(out_dir, paste0("ids_", tag, ".txt")) writeLines(genome_ids, ids_file) @@ -1164,12 +1220,15 @@ retrieveMetadata <- function(user_bacs, mount <- .docker_path(out_dir) # Safety against Windows-specific CRLF lines before `p3-dump-genomes` - sh_cmd <- sprintf('tr -d "\\r" < /out/%s | p3-dump-genomes --outDir /out --fasta --prot --gto -', - basename(ids_file)) + sh_cmd <- sprintf( + 'tr -d "\\r" < /out/%s | p3-dump-genomes --outDir /out --fasta --prot --gto -', + basename(ids_file) + ) - args <- c("run", "--rm", "-v", 
paste0(mount, ":/out"), image, shell, "-lc", shQuote(sh_cmd)) + args <- c("run", "--rm", "-v", paste0(mount, ":/out"), image, shell, "-lc", shQuote(sh_cmd)) res <- suppressWarnings(system2("docker", args = args, stdout = TRUE, stderr = TRUE)) - st <- attr(res, "status"); if (is.null(st)) st <- 0L + st <- attr(res, "status") + if (is.null(st)) st <- 0L if (st != 0L && tries > 1L) { Sys.sleep(1) @@ -1197,57 +1256,57 @@ retrieveMetadata <- function(user_bacs, # Export GFF from GTO per genomes in each chunk .cli_export_gff_chunk <- function(image, out_dir, genome_ids, tag, tries = 3L) { - out_dir <- normalizePath(out_dir, mustWork = FALSE) + out_dir <- normalizePath(out_dir, mustWork = FALSE) ids_file <- file.path(out_dir, paste0("ids_", tag, ".txt")) writeLines(genome_ids, ids_file) - exporter <- "/usr/bin/rast-export-genome" - shell <- .pick_shell(image) - mount <- .docker_path(out_dir) + exporter <- "/usr/bin/rast-export-genome" + shell <- .pick_shell(image) + mount <- .docker_path(out_dir) stderr_file <- file.path(out_dir, paste0("gff_chunk_", tag, ".stderr.txt")) # GFFs from BV-BRC .gto export are not directly compatible in Panaroo # This block reformats the contig IDs and ensures .gff/.fna pairs work together sh_cmd <- paste0( - 'set -euo pipefail; ', - 'fail_n=0; : > /out/', basename(stderr_file), '; ', + "set -euo pipefail; ", + "fail_n=0; : > /out/", basename(stderr_file), "; ", 'while IFS= read -r b || [ -n "$b" ]; do ', ' b=${b%$\'\\r\'}; [ -n "$b" ] || continue; ', ' gto="/out/${b}.gto"; gff="/out/${b}.PATRIC.gff"; map="/out/${b}.orig2id.tsv"; ', - ' if [ ! -s "$gto" ]; then echo "MISSING_GTO $b" >>/out/', basename(stderr_file), '; continue; fi; ', + ' if [ ! 
-s "$gto" ]; then echo "MISSING_GTO $b" >>/out/', basename(stderr_file), "; continue; fi; ", # Export GFF ' [ -s "$gff" ] || ', exporter, ' -i "$gto" -o "$gff" gff ', - ' || { echo "EXPORT_FAIL_GFF $b" >>/out/', basename(stderr_file), '; fail_n=$((fail_n+1)); continue; }; ', + ' || { echo "EXPORT_FAIL_GFF $b" >>/out/', basename(stderr_file), "; fail_n=$((fail_n+1)); continue; }; ", # Build mapping original_id -> id, and default to id if original_id missing - ' if command -v jq >/dev/null 2>&1; then ', + " if command -v jq >/dev/null 2>&1; then ", ' jq -r \'.contigs[] | [(.original_id // .id), .id] | @tsv\' "$gto" > "$map"; ', - ' else ', + " else ", ' python3 - "$gto" > "$map" <<\'PY\'\n', - 'import sys, json\n', - 'g = json.load(open(sys.argv[1]))\n', + "import sys, json\n", + "g = json.load(open(sys.argv[1]))\n", 'for c in g.get("contigs", []):\n', ' o = c.get("original_id") or c.get("id")\n', ' i = c.get("id")\n', - ' if o and i:\n', + " if o and i:\n", ' print(f"{o}\\t{i}")\n', - 'PY\n', - ' fi; ', + "PY\n", + " fi; ", # Relabel GFF sequence IDs: original_id -> id - ' awk \'FNR==NR{m[$1]=$2; next} ', - ' /^##sequence-region/ { if ($2 in m) {$2=m[$2]} print; next } ', - ' /^#/ { print; next } ', + " awk 'FNR==NR{m[$1]=$2; next} ", + " /^##sequence-region/ { if ($2 in m) {$2=m[$2]} print; next } ", + " /^#/ { print; next } ", ' { if ($1 in m) $1=m[$1]; print }\' "$map" "$gff" > "${gff}.tmp" && mv "${gff}.tmp" "$gff"; ', - - 'done < /out/', basename(ids_file), '; ', - 'exit 0' + "done < /out/", basename(ids_file), "; ", + "exit 0" ) args <- c("run", "--rm", "-v", paste0(mount, ":/out"), image, shell, "-lc", shQuote(sh_cmd)) - res <- suppressWarnings(system2("docker", args = args, stdout = TRUE, stderr = TRUE)) - st <- attr(res, "status"); if (is.null(st)) st <- 0L + res <- suppressWarnings(system2("docker", args = args, stdout = TRUE, stderr = TRUE)) + st <- attr(res, "status") + if (is.null(st)) st <- 0L if (st != 0L && tries > 1L) { Sys.sleep(1) @@ -1290,14 
+1349,15 @@ retrieveGenomes <- function(base_dir = ".", cli_gff_workers = 4L, chunk_size = 50L, verbose = TRUE) { - - method <- match.arg(method) + method <- match.arg(method) base_dir <- normalizePath(base_dir, mustWork = FALSE) # IDs from .filterGenomes() if (isTRUE(verbose)) message("Filtering genomes before download.") f_out <- .filterGenomes(base_dir = base_dir, user_bacs = user_bacs, verbose = verbose) - if (is.null(f_out)) return(character()) + if (is.null(f_out)) { + return(character()) + } con <- f_out$duckdbConnection tbl <- f_out$table_name @@ -1306,12 +1366,12 @@ retrieveGenomes <- function(base_dir = ".", dplyr::pull(`genome.genome_id`) # Set up the paths and such - paths <- .buildDBpath(base_dir = base_dir, user_bacs = user_bacs) - bug_dir <- dirname(paths$db_path) + paths <- .buildDBpath(base_dir = base_dir, user_bacs = user_bacs) + bug_dir <- dirname(paths$db_path) genome_path <- file.path(bug_dir, "genomes") - logs_dir <- file.path(base_dir, "data", "logs") + logs_dir <- file.path(base_dir, "data", "logs") dir.create(genome_path, recursive = TRUE, showWarnings = FALSE) - dir.create(logs_dir, recursive = TRUE, showWarnings = FALSE) + dir.create(logs_dir, recursive = TRUE, showWarnings = FALSE) # Adding support for resuming a download if (isTRUE(skip_existing)) { @@ -1322,10 +1382,12 @@ retrieveGenomes <- function(base_dir = ".", if (length(ids) == 0L) { if (isTRUE(verbose)) message("All genomes already complete.") - all_complete <- .list_complete(genome_path, - tibble::as_tibble(DBI::dbReadTable(con, tbl)) |> - dplyr::distinct(`genome.genome_id`) |> - dplyr::pull(`genome.genome_id`)) + all_complete <- .list_complete( + genome_path, + tibble::as_tibble(DBI::dbReadTable(con, tbl)) |> + dplyr::distinct(`genome.genome_id`) |> + dplyr::pull(`genome.genome_id`) + ) return(all_complete) } @@ -1334,18 +1396,23 @@ retrieveGenomes <- function(base_dir = ".", if (isTRUE(verbose)) message("Trying FTPS download. 
Workers=", ftp_workers) future::plan(future::multisession, workers = max(1, ftp_workers)) ft_ok <- future.apply::future_lapply(ids, function(gid) .ftp_download_one(gid, genome_path), - future.seed = TRUE) + future.seed = TRUE + ) ok_ids <- ids[unlist(ft_ok)] if (isTRUE(verbose)) message("Complete file sets for ", length(ok_ids), " genomes (FTP).") return(c(ok_ids, .list_complete(genome_path, setdiff(ids, ok_ids)))) } # CLI for FASTA, FAA, and GTO, then GFF from GTO - chunks <- split(ids, ceiling(seq_along(ids)/chunk_size)) + chunks <- split(ids, ceiling(seq_along(ids) / chunk_size)) # Parallel chunk containers - if (isTRUE(verbose)) message("CLI being run in parallel for ", length(chunks), - " data chunks.") + if (isTRUE(verbose)) { + message( + "CLI being run in parallel for ", length(chunks), + " data chunks." + ) + } future::plan(future::multisession, workers = max(1, cli_fasta_workers)) fa_res <- future.apply::future_mapply( FUN = function(vec, tag) .cli_dump_fastas_gto_chunk(image, genome_path, vec, tag), @@ -1355,8 +1422,12 @@ retrieveGenomes <- function(base_dir = ".", if (!all(fa_res) && isTRUE(verbose)) warning(sum(!fa_res), " data chunks failed.") # GFF extraction in parallel containers - if (isTRUE(verbose)) message("GFF extraction being run in parallel for ", - length(chunks), " data chunks.") + if (isTRUE(verbose)) { + message( + "GFF extraction being run in parallel for ", + length(chunks), " data chunks." 
+ ) + } future::plan(future::multisession, workers = max(1, cli_gff_workers)) g_res <- future.apply::future_mapply( FUN = function(vec, tag) .cli_export_gff_chunk(image, genome_path, vec, tag), @@ -1367,8 +1438,12 @@ retrieveGenomes <- function(base_dir = ".", # Success set: .fna + .PATRIC.faa + .PATRIC.gff all present per isolate ok_ids <- ids[vapply(ids, .is_complete_set, logical(1), dir = genome_path)] - if (isTRUE(verbose)) message("Complete file sets downloaded for ", - length(ok_ids), " genomes.") + if (isTRUE(verbose)) { + message( + "Complete file sets downloaded for ", + length(ok_ids), " genomes." + ) + } ok_ids } @@ -1387,15 +1462,14 @@ retrieveGenomes <- function(base_dir = ".", genomeList <- function(base_dir = ".", user_bacs, verbose = TRUE) { - base_dir <- normalizePath(base_dir, mustWork = FALSE) - paths <- .buildDBpath(base_dir = base_dir, user_bacs = user_bacs) - db_path <- paths$db_path - bug_dir <- dirname(db_path) + paths <- .buildDBpath(base_dir = base_dir, user_bacs = user_bacs) + db_path <- paths$db_path + bug_dir <- dirname(db_path) genome_path <- file.path(bug_dir, "genomes") - files_all <- list.files(genome_path, full.names = TRUE) - files_all <- files_all[file.info(files_all)$size > 100] + files_all <- list.files(genome_path, full.names = TRUE) + files_all <- files_all[file.info(files_all)$size > 100] # Separate by type gff_files <- files_all[grepl("\\.PATRIC\\.gff$", files_all)] @@ -1414,14 +1488,18 @@ genomeList <- function(base_dir = ".", faa_path <- file.path(genome_path, paste0(genomeID, ".PATRIC.faa")) data.frame( - genome_id = genomeID, - gff_path = if (file.exists(gff_path) && file.info(gff_path)$size > 100) gff_path else NA, - fna_path = if (file.exists(fna_path) && file.info(fna_path)$size > 100) fna_path else NA, - faa_path = if (file.exists(faa_path) && file.info(faa_path)$size > 100) faa_path else NA, + genome_id = genomeID, + gff_path = if (file.exists(gff_path) && file.info(gff_path)$size > 100) gff_path else NA, + 
fna_path = if (file.exists(fna_path) && file.info(fna_path)$size > 100) fna_path else NA, + faa_path = if (file.exists(faa_path) && file.info(faa_path)$size > 100) faa_path else NA, panaroo_input = if ( file.exists(gff_path) && file.exists(fna_path) && file.exists(faa_path) && - file.info(gff_path)$size > 100 && file.info(fna_path)$size > 100 && file.info(faa_path)$size > 100 - ) paste(gff_path, fna_path) else NA, + file.info(gff_path)$size > 100 && file.info(fna_path)$size > 100 && file.info(faa_path)$size > 100 + ) { + paste(gff_path, fna_path) + } else { + NA + }, stringsAsFactors = FALSE ) }) @@ -1480,7 +1558,7 @@ prepareGenomes <- function(user_bacs, method = c("ftp", "cli"), overwrite = FALSE, verbose = TRUE) { - method <- match.arg(method) + method <- match.arg(method) base_dir <- normalizePath(base_dir, mustWork = FALSE) # Ensure the BV-BRC metadata cache exists, fetch if missing diff --git a/R/data_processing.R b/R/data_processing.R index c417e52..01b47b8 100644 --- a/R/data_processing.R +++ b/R/data_processing.R @@ -81,7 +81,7 @@ # Convert to container-visible paths genome_filepath_cont <- .to_container(genome_filepath_host, host_root = mount_host, container_root = mount_cont) - output_dir_cont <- .to_container(output_dir_host, host_root = mount_host, container_root = mount_cont) + output_dir_cont <- .to_container(output_dir_host, host_root = mount_host, container_root = mount_cont) # Run Panaroo in Docker cmd_args <- c( @@ -99,14 +99,17 @@ "--remove-invalid-genes", "--core_threshold", as.character(core_threshold), "--len_dif_percent", as.character(len_dif_percent), - "--threshold", as.character(cluster_threshold), - "-f", as.character(family_seq_identity), - "-t", as.character(panaroo_threads_per_job) + "--threshold", as.character(cluster_threshold), + "-f", as.character(family_seq_identity), + "-t", as.character(panaroo_threads_per_job) ) - res <- tryCatch({ - system2("docker", args = cmd_args, stdout = TRUE, stderr = TRUE) - }, error = function(e) e) + 
res <- tryCatch( + { + system2("docker", args = cmd_args, stdout = TRUE, stderr = TRUE) + }, + error = function(e) e + ) if (inherits(res, "error")) { stop(sprintf("Docker/Panaroo failed to launch: %s", res$message)) @@ -179,7 +182,6 @@ family_seq_identity = 0.5, threads = 8, split_jobs = FALSE) { - duckdb_path <- normalizePath(duckdb_path) con <- DBI::dbConnect(duckdb::duckdb(), duckdb_path) on.exit(try(DBI::dbDisconnect(con, shutdown = FALSE), silent = TRUE), add = TRUE) @@ -213,7 +215,7 @@ filtered_panaroo_input <- sapply(split_files[unlist(valid_entries)], paste, collapse = " ") total_lines <- length(filtered_panaroo_input) - batch_size <- if (isTRUE(split_jobs)) ceiling(total_lines / 5) else total_lines + batch_size <- if (isTRUE(split_jobs)) ceiling(total_lines / 5) else total_lines panaroo_batches <- split(filtered_panaroo_input, ceiling(seq_along(filtered_panaroo_input) / batch_size)) n_jobs <- length(panaroo_batches) @@ -265,8 +267,7 @@ len_dif_percent = 0.95, cluster_threshold = 0.95, family_seq_identity = 0.5, - threads = 8){ - + threads = 8) { input_path <- .docker_path(input_path) # Fail fast if Docker is missing @@ -302,9 +303,9 @@ "--merge_paralogs", "--core_threshold", as.character(core_threshold), "--len_dif_percent", as.character(len_dif_percent), - "--threshold", as.character(cluster_threshold), - "-f", as.character(family_seq_identity), - "-t", as.character(threads) + "--threshold", as.character(cluster_threshold), + "-f", as.character(family_seq_identity), + "-t", as.character(threads) ) system2("docker", args = cmd_args, stdout = TRUE, stderr = TRUE) @@ -314,7 +315,6 @@ } - #' Load Panaroo gene presence/absence table into DuckDB #' #' Reads `gene_presence_absence.csv` and constructs a genome-by-gene count @@ -326,13 +326,13 @@ #' @return A tibble containing the gene count matrix. 
#' #' @keywords internal -.panaroo2geneTable <- function(panaroo_output_path, duckdb_path){ +.panaroo2geneTable <- function(panaroo_output_path, duckdb_path) { filepath <- file.path(normalizePath(panaroo_output_path), "gene_presence_absence.csv") duckdb_path <- normalizePath(duckdb_path) con <- DBI::dbConnect(duckdb::duckdb(), duckdb_path) on.exit(try(DBI::dbDisconnect(con, shutdown = FALSE), silent = TRUE), add = TRUE) - gene_count <- read.table(filepath, sep=",", header=TRUE, fill=TRUE, quote="") |> + gene_count <- read.table(filepath, sep = ",", header = TRUE, fill = TRUE, quote = "") |> tibble::as_tibble() |> dplyr::select(-c(Non.unique.Gene.name, Annotation)) |> tidyr::pivot_longer(cols = -1) |> @@ -356,13 +356,13 @@ #' @return A tibble with `Gene` and `Annotation` columns. #' #' @keywords internal -.panaroo2geneNames <- function(panaroo_output_path, duckdb_path){ +.panaroo2geneNames <- function(panaroo_output_path, duckdb_path) { filepath <- file.path(normalizePath(panaroo_output_path), "gene_presence_absence.csv") duckdb_path <- normalizePath(duckdb_path) con <- DBI::dbConnect(duckdb::duckdb(), duckdb_path) on.exit(try(DBI::dbDisconnect(con, shutdown = FALSE), silent = TRUE), add = TRUE) - gene_names <- read.table(filepath, sep=",", header=TRUE, fill=TRUE, quote="") |> + gene_names <- read.table(filepath, sep = ",", header = TRUE, fill = TRUE, quote = "") |> tibble::as_tibble() |> dplyr::select(c(Gene, Annotation)) @@ -381,15 +381,15 @@ #' @return A tibble containing the struct matrix. 
#' #' @keywords internal -.panaroo2StructTable <- function(panaroo_output_path, duckdb_path){ +.panaroo2StructTable <- function(panaroo_output_path, duckdb_path) { struct_filepath <- file.path(normalizePath(panaroo_output_path), "struct_presence_absence.Rtab") duckdb_path <- normalizePath(duckdb_path) con <- DBI::dbConnect(duckdb::duckdb(), duckdb_path) on.exit(try(DBI::dbDisconnect(con, shutdown = FALSE), silent = TRUE), add = TRUE) - gene_struct <- read.table(struct_filepath, sep="\t", header=TRUE, fill=TRUE, quote="") |> + gene_struct <- read.table(struct_filepath, sep = "\t", header = TRUE, fill = TRUE, quote = "") |> tibble::as_tibble() |> - tidyr::pivot_longer(cols= -1) |> + tidyr::pivot_longer(cols = -1) |> tidyr::pivot_wider(names_from = Gene, values_from = value) |> dplyr::rename("genome_id" = "name") |> dplyr::mutate(genome_id = stringr::str_replace_all(genome_id, c("^X" = "", "\\.PATRIC$" = ""))) @@ -409,7 +409,7 @@ #' @return Invisibly returns TRUE. #' #' @keywords internal -.panaroo2OtherTables <- function(panaroo_output_path, duckdb_path){ +.panaroo2OtherTables <- function(panaroo_output_path, duckdb_path) { panaroo_output_path <- normalizePath(panaroo_output_path) duckdb_path <- normalizePath(duckdb_path) fasta_filepath <- file.path(panaroo_output_path, "pan_genome_reference.fa") @@ -418,15 +418,19 @@ gene_fasta <- Biostrings::readDNAStringSet(filepath = fasta_filepath) DBI::dbWriteTable(con, "gene_ref_seq", - tibble::tibble(name = names(gene_fasta), - sequence = as.character(gene_fasta)), - overwrite = TRUE) + tibble::tibble( + name = names(gene_fasta), + sequence = as.character(gene_fasta) + ), + overwrite = TRUE + ) readr::read_csv(file.path(panaroo_output_path, "gene_presence_absence.csv")) |> dplyr::select(-`Non-unique Gene name`) |> tidyr::pivot_longer(-c("Gene", "Annotation"), - names_to = "genome_ids", - values_to = "protein_ids") |> + names_to = "genome_ids", + values_to = "protein_ids" + ) |> dplyr::mutate(genome_ids = gsub(".PATRIC", "", 
genome_ids)) |> dplyr::select(genome_ids, Gene, protein_ids) |> dplyr::distinct() |> @@ -447,9 +451,9 @@ #' @return Invisibly returns TRUE. #' #' @keywords internal -.panaroo2duckdb <- function(panaroo_output_path, duckdb_path){ +.panaroo2duckdb <- function(panaroo_output_path, duckdb_path) { panaroo_output_path <- normalizePath(panaroo_output_path) - duckdb_path <- normalizePath(duckdb_path) + duckdb_path <- normalizePath(duckdb_path) .panaroo2geneTable(panaroo_output_path, duckdb_path) .panaroo2geneNames(panaroo_output_path, duckdb_path) @@ -484,7 +488,6 @@ threads = 0, memory = 0, extra_args = c("-g", "1")) { - # Fail fast if Docker is missing if (!nzchar(Sys.which("docker"))) { stop("Docker is not available on your PATH but is required to run CD-HIT.") @@ -530,7 +533,7 @@ "weizhongli1987/cdhit:4.8.1", "cd-hit", "-i", .to_container(cdhit_input_faa, mount_host, mount_cont), - "-o", .to_container(clustered_faa, mount_host, mount_cont), + "-o", .to_container(clustered_faa, mount_host, mount_cont), "-c", as.character(identity), "-n", as.character(word_length), "-T", as.character(threads), @@ -540,19 +543,24 @@ ) message("Running cd-hit via Docker...") - output <- tryCatch({ - system2("docker", args = cmd_args, stdout = TRUE, stderr = TRUE) - }, error = function(e) { - stop("cd-hit execution failed: ", e$message) - }) + output <- tryCatch( + { + system2("docker", args = cmd_args, stdout = TRUE, stderr = TRUE) + }, + error = function(e) { + stop("cd-hit execution failed: ", e$message) + } + ) if (!file.exists(clustered_faa)) { stop("cd-hit failed: output file not found. 
Check stderr:\n", paste(output, collapse = "\n")) } # Ensure .clstr exists (used downstream) if (!file.exists(paste0(clustered_faa, ".clstr"))) { - stop("cd-hit did not produce the expected .clstr file at: ", paste0(clustered_faa, ".clstr"), - "\nFull output:\n", paste(output, collapse = "\n")) + stop( + "cd-hit did not produce the expected .clstr file at: ", paste0(clustered_faa, ".clstr"), + "\nFull output:\n", paste(output, collapse = "\n") + ) } message("cd-hit completed successfully.") @@ -672,14 +680,14 @@ runPanaroo2Duckdb <- function(duckdb_path, if (isTRUE(verbose)) message("Launching Panaroo.") .runPanaroo( - duckdb_path = duckdb_path, - output_path = out_dir, - core_threshold = core_threshold, - len_dif_percent = len_dif_percent, + duckdb_path = duckdb_path, + output_path = out_dir, + core_threshold = core_threshold, + len_dif_percent = len_dif_percent, cluster_threshold = cluster_threshold, family_seq_identity = family_seq_identity, - threads = threads, - split_jobs = split_jobs + threads = threads, + split_jobs = split_jobs ) # Identify Panaroo outputs that contain a final_graph.gml file @@ -730,8 +738,10 @@ runPanaroo2Duckdb <- function(duckdb_path, .parseProteinClusters <- function(clustered_faa) { clstr <- paste0(clustered_faa, ".clstr") if (!file.exists(clstr)) { - stop("CD-HIT cluster file not found: ", clstr, - "\nEnsure .runCDHIT() completed successfully and produced the .clstr file.") + stop( + "CD-HIT cluster file not found: ", clstr, + "\nEnsure .runCDHIT() completed successfully and produced the .clstr file." 
+ ) } lines <- data.table::fread(clstr, sep = "\n", header = FALSE)$V1 @@ -740,7 +750,7 @@ runPanaroo2Duckdb <- function(duckdb_path, for (i in seq_along(cluster_ids)) { start <- cluster_ids[i] + 1 - end <- if (i < length(cluster_ids)) cluster_ids[i + 1] - 1 else length(lines) + end <- if (i < length(cluster_ids)) cluster_ids[i + 1] - 1 else length(lines) cluster_lines <- lines[start:end] # This finds the reference cluster ID and names the cluster with it @@ -752,8 +762,10 @@ runPanaroo2Duckdb <- function(duckdb_path, } # Pull genome IDs - genome_matches <- stringr::str_match(cluster_lines, - "fig\\|([0-9]+\\.[0-9]+)\\.peg\\.[0-9]+")[, 2] + genome_matches <- stringr::str_match( + cluster_lines, + "fig\\|([0-9]+\\.[0-9]+)\\.peg\\.[0-9]+" + )[, 2] genome_matches <- genome_matches[!is.na(genome_matches)] if (length(genome_matches) > 0) { @@ -810,9 +822,9 @@ buildMatrices <- function(cluster_map) .buildProtMatrices(cluster_map) names_faa <- names(cdhit_output_faa) |> tibble::as_tibble() |> dplyr::mutate( - proteinID = stringr::str_extract(value, "^fig\\|[0-9]+\\.[0-9]+\\.peg\\.[0-9]+"), - locus_tag = stringr::str_match(value, "peg\\.[0-9]+\\|([^\\s]+)")[, 2], - proteinName= stringr::str_trim(stringr::str_match(value, "\\|[^\\s]+\\s+(.*?)\\s+\\[")[, 2]) + proteinID = stringr::str_extract(value, "^fig\\|[0-9]+\\.[0-9]+\\.peg\\.[0-9]+"), + locus_tag = stringr::str_match(value, "peg\\.[0-9]+\\|([^\\s]+)")[, 2], + proteinName = stringr::str_trim(stringr::str_match(value, "\\|[^\\s]+\\s+(.*?)\\s+\\[")[, 2]) ) |> dplyr::select(-value) @@ -828,47 +840,47 @@ CDHIT2duckdb <- function(duckdb_path, word_length = 5, threads = 0, memory = 0, - extra_args = c("-g", "1")){ - + extra_args = c("-g", "1")) { duckdb_path <- normalizePath(duckdb_path) con <- DBI::dbConnect(duckdb::duckdb(), duckdb_path) on.exit(try(DBI::dbDisconnect(con, shutdown = FALSE), silent = TRUE), add = TRUE) if (missing(output_path) || output_path %in% c(".", "results", "results/")) { - output_path <- 
dirname(duckdb_path) # e.g., ./results/ + output_path <- dirname(duckdb_path) # e.g., ./results/ } output_path <- normalizePath(output_path) cdhit_outputs <- .runCDHIT(duckdb_path, - output_path, - output_prefix = output_prefix, - identity = identity, - word_length = word_length, - threads = threads, - memory = memory, - extra_args = extra_args) - - cluster_map <- .parseProteinClusters(cdhit_outputs$clustered_faa) + output_path, + output_prefix = output_prefix, + identity = identity, + word_length = word_length, + threads = threads, + memory = memory, + extra_args = extra_args + ) + + cluster_map <- .parseProteinClusters(cdhit_outputs$clustered_faa) cluster_count <- .buildProtMatrices(cluster_map) DBI::dbWriteTable(con, "protein_count", cluster_count, overwrite = TRUE) cluster_fasta <- cdhit_outputs$cdhit_input_faa - cluster_name <- .clusterNames(cluster_map, cluster_fasta) + cluster_name <- .clusterNames(cluster_map, cluster_fasta) DBI::dbWriteTable(con, "protein_names", cluster_name, overwrite = TRUE) clustered_faa <- Biostrings::readAAStringSet(cdhit_outputs$clustered_faa) DBI::dbWriteTable(con, "protein_cluster_seq", - tibble::tibble( - name = names(clustered_faa) |> stringr::str_extract("fig\\|[0-9]+\\.[0-9]+\\.peg\\.[0-9]+"), - sequence = as.character(clustered_faa) - ), - overwrite = TRUE) + tibble::tibble( + name = names(clustered_faa) |> stringr::str_extract("fig\\|[0-9]+\\.[0-9]+\\.peg\\.[0-9]+"), + sequence = as.character(clustered_faa) + ), + overwrite = TRUE + ) invisible(TRUE) } - #' Check or install InterProScan data bundle #' #' Ensures that the InterProScan data directory exists locally, downloading @@ -885,12 +897,12 @@ CDHIT2duckdb <- function(duckdb_path, #' #' @keywords internal .checkInterProData <- function( - version = "5.76-107.0", - dest_dir = "inst/extdata/interpro", - docker_image = sprintf("interpro/interproscan:%s", version), - platform = "linux/amd64", - curl_bin = "curl", - verbose = TRUE + version = "5.76-107.0", + dest_dir = 
"inst/extdata/interpro", + docker_image = sprintf("interpro/interproscan:%s", version), + platform = "linux/amd64", + curl_bin = "curl", + verbose = TRUE ) { msg <- function(...) if (verbose) message(sprintf(...)) @@ -907,25 +919,29 @@ CDHIT2duckdb <- function(duckdb_path, } # Download bundle if needed - tar_url <- sprintf("http://ftp.ebi.ac.uk/pub/software/unix/iprscan/5/%s/alt/interproscan-data-%s.tar.gz", - version, version) - md5_url <- paste0(tar_url, ".md5") + tar_url <- sprintf( + "http://ftp.ebi.ac.uk/pub/software/unix/iprscan/5/%s/alt/interproscan-data-%s.tar.gz", + version, version + ) + md5_url <- paste0(tar_url, ".md5") tar_path <- file.path(dest_dir, basename(tar_url)) md5_path <- paste0(tar_path, ".md5") if (!file.exists(tar_path)) { msg("Downloading InterProScan data bundle.") - status_tar <- system2(curl_bin, c("-L", "-o", tar_path, tar_url)) - status_md5 <- system2(curl_bin, c("-L", "-o", md5_path, md5_url)) - if (status_tar != 0 || status_md5 != 0) + status_tar <- system2(curl_bin, c("-L", "-o", tar_path, tar_url)) + status_md5 <- system2(curl_bin, c("-L", "-o", md5_path, md5_url)) + if (status_tar != 0 || status_md5 != 0) { stop("Failed to download InterProScan data bundle.") + } } msg("Verifying MD5 checksum.") md5_expected <- sub("\\s+.*$", "", readLines(md5_path)[1]) - md5_actual <- tools::md5sum(tar_path)[[1]] - if (!identical(tolower(md5_expected), tolower(md5_actual))) + md5_actual <- tools::md5sum(tar_path)[[1]] + if (!identical(tolower(md5_expected), tolower(md5_actual))) { stop("MD5 checksum mismatch for InterProScan data bundle.") + } msg("Extracting InterProScan data bundle.") utils::untar(tar_path, exdir = dest_dir, tar = "internal") @@ -946,9 +962,11 @@ CDHIT2duckdb <- function(duckdb_path, #' #' @keywords internal .getDfIPRColNames <- function() { - c("AccNum", "SeqMD5Digest", "SLength", "Analysis", + c( + "AccNum", "SeqMD5Digest", "SLength", "Analysis", "DB.ID", "SignDesc", "StartLoc", "StopLoc", "Score", - "Status", "RunDate", 
"IPRAcc", "IPRDesc", "placeholder") + "Status", "RunDate", "IPRAcc", "IPRDesc", "placeholder" + ) } #' Internal helpers for reading InterProScan TSV outputs @@ -993,8 +1011,9 @@ CDHIT2duckdb <- function(duckdb_path, #' @keywords internal .readIPRscanTsv <- function(filepath) { readr::read_tsv(filepath, - col_types = .getDfIPRColTypes(), - col_names = .getDfIPRColNames()) + col_types = .getDfIPRColTypes(), + col_names = .getDfIPRColNames() + ) } @@ -1026,7 +1045,7 @@ CDHIT2duckdb <- function(duckdb_path, file_format, docker_image = sprintf("interpro/interproscan:%s", "5.76-107.0")) { # Normalize and mount paths - path <- .docker_path(path) + path <- .docker_path(path) bind_data <- .docker_path(ipr_data_path) dir.create(file.path(path, "tmp", "iprscan"), recursive = TRUE, showWarnings = FALSE) @@ -1050,7 +1069,7 @@ CDHIT2duckdb <- function(duckdb_path, "-v", paste0(bind_data, ":/opt/interproscan/data"), "-w", "/work", docker_image, - "--input", .to_container(temp_fasta_file, path, "/work"), + "--input", .to_container(temp_fasta_file, path, "/work"), "--cpu", as.character(threads), "-f", file_format, "--appl", appl_str, @@ -1058,31 +1077,34 @@ CDHIT2duckdb <- function(duckdb_path, ) - status <- tryCatch({ - system2( - "docker", - args = c( - "run", - "--rm", - "--platform", "linux/amd64", # force amd64 for ARM hosts - "-v", paste0(path, ":", "/work"), - "-v", paste0(bind_data, ":/opt/interproscan/data"), - "-w", "/work", - docker_image, - "--input", .to_container(temp_fasta_file, path, "/work"), - "--cpu", as.character(threads), - "-f", file_format, - "--appl", appl_str, - "-b", chunk_out_file_base_cont - ), - stdout = TRUE, - stderr = TRUE - ) - }, error = function(e) { - stop(sprintf("InterProScan execution failed for chunk %d: %s", chunk_id, e$message)) - }) + status <- tryCatch( + { + system2( + "docker", + args = c( + "run", + "--rm", + "--platform", "linux/amd64", # force amd64 for ARM hosts + "-v", paste0(path, ":", "/work"), + "-v", paste0(bind_data, 
":/opt/interproscan/data"), + "-w", "/work", + docker_image, + "--input", .to_container(temp_fasta_file, path, "/work"), + "--cpu", as.character(threads), + "-f", file_format, + "--appl", appl_str, + "-b", chunk_out_file_base_cont + ), + stdout = TRUE, + stderr = TRUE + ) + }, + error = function(e) { + stop(sprintf("InterProScan execution failed for chunk %d: %s", chunk_id, e$message)) + } + ) - out_tsv <- paste0(chunk_out_file_base_host, ".tsv") + out_tsv <- paste0(chunk_out_file_base_host, ".tsv") out_tsvgz <- paste0(chunk_out_file_base_host, ".tsv.gz") if (file.exists(out_tsv)) { @@ -1090,8 +1112,10 @@ CDHIT2duckdb <- function(duckdb_path, } else if (file.exists(out_tsvgz)) { return(out_tsvgz) } else { - stop(sprintf("InterProScan produced no output for chunk %d. Checked: %s and %s.\nLast message:\n%s", - chunk_id, out_tsv, out_tsvgz, paste(status, collapse = "\n"))) + stop(sprintf( + "InterProScan produced no output for chunk %d. Checked: %s and %s.\nLast message:\n%s", + chunk_id, out_tsv, out_tsvgz, paste(status, collapse = "\n") + )) } } @@ -1100,14 +1124,13 @@ domainFromIPR <- function(duckdb_path, path, out_file_base = "iprscan", appl = c("Pfam"), - ipr_version = "5.76-107.0", + ipr_version = "5.76-107.0", ipr_dest_dir = "inst/extdata/interpro", ipr_platform = "linux/amd64", auto_prepare_data = TRUE, threads = 8, file_format = "TSV", docker_repo = "interpro/interproscan") { - duckdb_path <- normalizePath(duckdb_path) if (missing(path) || path %in% c(".", "results", "results/")) { path <- dirname(duckdb_path) @@ -1126,8 +1149,10 @@ domainFromIPR <- function(duckdb_path, verbose = TRUE ) } else { - list(data_dir = file.path(ipr_dest_dir, sprintf("interproscan-%s", ipr_version), "data"), - ready = NA) + list( + data_dir = file.path(ipr_dest_dir, sprintf("interproscan-%s", ipr_version), "data"), + ready = NA + ) } ipr_data_path <- ipr_info$data_dir @@ -1138,11 +1163,12 @@ domainFromIPR <- function(duckdb_path, on.exit(try(DBI::dbDisconnect(con, shutdown = 
FALSE), silent = TRUE), add = TRUE) sequences_df <- dplyr::tbl(con, "protein_cluster_seq") |> tibble::as_tibble() - if (nrow(sequences_df) == 0L) + if (nrow(sequences_df) == 0L) { stop("No sequences found in 'protein_cluster_seq'. Please run CDHIT2duckdb() first.") + } # Chunking for parallel (not currently implemented due to memory limits) - chunks <- list(sequences_df) # Force 1 chunk for RAM limits + chunks <- list(sequences_df) # Force 1 chunk for RAM limits # Forcing 1 container operation for RAM limits workers <- 1 @@ -1179,24 +1205,28 @@ domainFromIPR <- function(duckdb_path, # Combine results tsvs <- Filter(function(x) !is.null(x) && file.exists(x), results) - if (length(tsvs) == 0L) + if (length(tsvs) == 0L) { stop("InterProScan produced no usable outputs. Check Docker logs above.") + } df_iprscan <- do.call(rbind, lapply(tsvs, .readIPRscanTsv)) # Load processed tables (unchanged) DBI::dbWriteTable(con, "domain_names", - df_iprscan |> - dplyr::select(AccNum, DB.ID, SignDesc, IPRAcc, IPRDesc, StartLoc, StopLoc), - overwrite = TRUE) + df_iprscan |> + dplyr::select(AccNum, DB.ID, SignDesc, IPRAcc, IPRDesc, StartLoc, StopLoc), + overwrite = TRUE + ) df_protein_domain_pa <- df_iprscan |> dplyr::select(AccNum, DB.ID, IPRAcc, placeholder) |> dplyr::mutate(domain_ID = stringr::str_glue("{DB.ID}_{IPRAcc}")) |> dplyr::distinct() |> dplyr::mutate(placeholder = stringr::str_replace_all(placeholder, "-", "1")) |> - tidyr::pivot_wider(id_cols = AccNum, names_from = domain_ID, values_from = placeholder, - values_fill = "0") |> + tidyr::pivot_wider( + id_cols = AccNum, names_from = domain_ID, values_from = placeholder, + values_fill = "0" + ) |> dplyr::group_by(AccNum) |> dplyr::summarize(across(everything(), ~ ifelse(any(. 
== "1"), "1", "0")), .groups = "drop") |> dplyr::mutate(across(-AccNum, as.numeric)) @@ -1204,8 +1234,9 @@ domainFromIPR <- function(duckdb_path, protein_filter <- dplyr::tbl(con, "protein_count") |> tibble::as_tibble() accs <- unique(df_protein_domain_pa$AccNum) accs_in_matrix <- intersect(accs, colnames(protein_filter)) - if (length(accs_in_matrix) == 0L) + if (length(accs_in_matrix) == 0L) { stop("No InterPro accessions match protein_count columns.") + } protein_filter <- protein_filter |> dplyr::select(genome_id, dplyr::all_of(accs_in_matrix)) df_protein_domain_pa <- df_protein_domain_pa |> @@ -1224,16 +1255,16 @@ domainFromIPR <- function(duckdb_path, # Clean BV-BRC metadata, then save as Parquet files #' -#' @param duckdb_path -#' @param path -#' @param ref_file_path +#' @param duckdb_path +#' @param path +#' @param ref_file_path #' #' @returns #' @export #' #' @examples -cleanMetaData <- function(duckdb_path, path, ref_file_path = "data_raw/"){ - duckdb_path <- normalizePath(duckdb_path) +cleanMetaData <- function(duckdb_path, path, ref_file_path = "data_raw/") { + duckdb_path <- normalizePath(duckdb_path) # If no explicit path is provided (or a generic one), choose results// when # the DuckDB lives under data//, or else fall back to the DuckDB directory. 
if (missing(path) || path %in% c(".", "results", "results/")) { @@ -1246,54 +1277,56 @@ cleanMetaData <- function(duckdb_path, path, ref_file_path = "data_raw/"){ ) path <- if (!identical(mapped_results, bug_dir)) mapped_results else bug_dir } - + path <- normalizePath(path, mustWork = FALSE) if (!dir.exists(path)) dir.create(path, recursive = TRUE) - + con <- DBI::dbConnect(duckdb::duckdb(), duckdb_path) on.exit(try(DBI::dbDisconnect(con, shutdown = FALSE), silent = TRUE), add = TRUE) ref_file_path <- normalizePath(ref_file_path) - + clean_drug <- readr::read_tsv(file.path(ref_file_path, "clean_drug.tsv")) drug_class <- readr::read_tsv(file.path(ref_file_path, "drug_class.tsv")) - drug_abbr <- readr::read_tsv(file.path(ref_file_path, "drug_abbr.tsv")) + drug_abbr <- readr::read_tsv(file.path(ref_file_path, "drug_abbr.tsv")) class_abbr <- readr::read_tsv(file.path(ref_file_path, "class_abbr.tsv")) clean_countries <- readr::read_tsv(file.path(ref_file_path, "cleaned_bvbrc_countries.tsv")) |> - dplyr::select("raw_entry", "clean_name", "short_name")|> + dplyr::select("raw_entry", "clean_name", "short_name") |> dplyr::distinct() - + dplyr::tbl(con, "filtered") |> tibble::as_tibble() |> - dplyr::select("genome_drug.genome_id", "genome_drug.antibiotic", - "genome_drug.genome_name", "genome_drug.laboratory_typing_method", - "genome_drug.resistant_phenotype", "genome_drug.taxon_id", - "genome_drug.pmid", "genome.collection_year", - "genome.isolation_country", "genome.host_common_name", - "genome.isolation_source", "genome.species") |> + dplyr::select( + "genome_drug.genome_id", "genome_drug.antibiotic", + "genome_drug.genome_name", "genome_drug.laboratory_typing_method", + "genome_drug.resistant_phenotype", "genome_drug.taxon_id", + "genome_drug.pmid", "genome.collection_year", + "genome.isolation_country", "genome.host_common_name", + "genome.isolation_source", "genome.species" + ) |> dplyr::left_join(clean_drug, by = c("genome_drug.antibiotic" = "original_drug")) |> 
dplyr::filter(!is.na(cleaned_drug)) |> dplyr::left_join(drug_class, by = c("cleaned_drug" = "drug")) |> dplyr::left_join(drug_abbr, by = c("cleaned_drug" = "drug")) |> dplyr::left_join(class_abbr, by = "drug_class") |> DBI::dbWriteTable(conn = con, name = "filtered", overwrite = TRUE) - + resistance_summary <- dplyr::tbl(con, "filtered") |> - tibble::as_tibble() |> + tibble::as_tibble() |> dplyr::filter(genome_drug.resistant_phenotype == "Resistant") |> dplyr::group_by(genome_drug.genome_id) |> dplyr::summarise( num_resistant_classes = dplyr::n_distinct(drug_class), resistant_classes = paste(unique(class_abbr), collapse = "_") ) - + year_breaks <- seq(1980, 2023, by = 5) - + dplyr::tbl(con, "filtered") |> tibble::as_tibble() |> dplyr::mutate(genome_drug.antibiotic = cleaned_drug) |> dplyr::select(-cleaned_drug) |> dplyr::left_join(clean_countries, by = c("genome.isolation_country" = "raw_entry")) |> - dplyr::rename("cleaned_country"="clean_name", "country_abbr"="short_name") |> + dplyr::rename("cleaned_country" = "clean_name", "country_abbr" = "short_name") |> dplyr::mutate(genome.isolation_country = cleaned_country) |> dplyr::select(-cleaned_country) |> dplyr::left_join(resistance_summary, by = "genome_drug.genome_id") |> @@ -1301,25 +1334,29 @@ cleanMetaData <- function(duckdb_path, path, ref_file_path = "data_raw/"){ is.na(resistant_classes) ~ genome_drug.resistant_phenotype, TRUE ~ resistant_classes )) |> - dplyr::mutate(num_resistant_classes= dplyr::case_when( + dplyr::mutate(num_resistant_classes = dplyr::case_when( is.na(num_resistant_classes) ~ 0, TRUE ~ num_resistant_classes )) |> dplyr::mutate(genome.collection_year = as.numeric(genome.collection_year)) |> - dplyr::mutate(year_bin = cut(genome.collection_year, breaks = year_breaks, - right = FALSE, include.lowest = TRUE, - labels = paste(year_breaks[-length(year_breaks)], - year_breaks[-1] - 1, sep = "-"))) |> + dplyr::mutate(year_bin = cut(genome.collection_year, + breaks = year_breaks, + right = FALSE, 
include.lowest = TRUE, + labels = paste(year_breaks[-length(year_breaks)], + year_breaks[-1] - 1, + sep = "-" + ) + )) |> DBI::dbWriteTable(conn = con, name = "cleaned_metadata", overwrite = TRUE) - + # Parquet output path - metadata_parquet <- file.path(path, "metadata.parquet") # cleaned_metadata exported as 'metadata' + metadata_parquet <- file.path(path, "metadata.parquet") # cleaned_metadata exported as 'metadata' # Also export AMR/genome/original metadata - amr_phenotype_parquet <- file.path(path, "amr_phenotype.parquet") - genome_data_parquet <- file.path(path, "genome_data.parquet") - original_metadata_parquet <- file.path(path, "original_metadata.parquet") - + amr_phenotype_parquet <- file.path(path, "amr_phenotype.parquet") + genome_data_parquet <- file.path(path, "genome_data.parquet") + original_metadata_parquet <- file.path(path, "original_metadata.parquet") + writeCompressedParquet <- function(df, path) { arrow::write_parquet( df, @@ -1329,38 +1366,40 @@ cleanMetaData <- function(duckdb_path, path, ref_file_path = "data_raw/"){ use_dictionary = TRUE ) } - - db_name <- duckdb_path |> stringr::str_split_i(".duckdb", i = 1) |> paste0("_parquet.duckdb") + + db_name <- duckdb_path |> + stringr::str_split_i(".duckdb", i = 1) |> + paste0("_parquet.duckdb") con_new <- DBI::dbConnect(duckdb::duckdb(), db_name) on.exit(try(DBI::dbDisconnect(con_new, shutdown = FALSE), silent = TRUE), add = TRUE) - + # cleaned_metadata -> parquet + view (as metadata) DBI::dbReadTable(con, "cleaned_metadata") |> writeCompressedParquet(metadata_parquet) DBI::dbExecute(con_new, sprintf("CREATE OR REPLACE VIEW metadata AS SELECT * FROM read_parquet('%s')", metadata_parquet)) - + # debug/complete views: amr_phenotype, genome_data, original_metadata DBI::dbReadTable(con, "amr_phenotype") |> writeCompressedParquet(amr_phenotype_parquet) - DBI::dbReadTable(con, "genome_data") |> writeCompressedParquet(genome_data_parquet) - DBI::dbReadTable(con, "metadata") |> 
writeCompressedParquet(original_metadata_parquet) - + DBI::dbReadTable(con, "genome_data") |> writeCompressedParquet(genome_data_parquet) + DBI::dbReadTable(con, "metadata") |> writeCompressedParquet(original_metadata_parquet) + DBI::dbExecute(con_new, sprintf("CREATE OR REPLACE VIEW amr_phenotype AS SELECT * FROM read_parquet('%s')", amr_phenotype_parquet)) DBI::dbExecute(con_new, sprintf("CREATE OR REPLACE VIEW genome_data AS SELECT * FROM read_parquet('%s')", genome_data_parquet)) DBI::dbExecute(con_new, sprintf("CREATE OR REPLACE VIEW original_metadata AS SELECT * FROM read_parquet('%s')", original_metadata_parquet)) - + invisible(TRUE) } # Clean feature matrices, then save as Parquet files #' -#' @param duckdb_path -#' @param path +#' @param duckdb_path +#' @param path #' #' @returns #' @export #' #' @examples -cleanData <- function(duckdb_path, path){ - duckdb_path <- normalizePath(duckdb_path) +cleanData <- function(duckdb_path, path) { + duckdb_path <- normalizePath(duckdb_path) # If no explicit path is provided (or a generic one), choose results// when # the DuckDB lives under data//, or else fall back to the DuckDB directory. 
if (missing(path) || path %in% c(".", "results", "results/")) { @@ -1381,19 +1420,19 @@ cleanData <- function(duckdb_path, path){ on.exit(try(DBI::dbDisconnect(con, shutdown = FALSE), silent = TRUE), add = TRUE) # Parquet output paths - genes_parquet <- file.path(path, "gene_count.parquet") - gene_names_parquet <- file.path(path, "gene_names.parquet") - gene_ref_seq_parquet <- file.path(path, "gene_seqs.parquet") - genome_gene_protein_parquet <- file.path(path, "genome_gene_protein.parquet") - struct_parquet <- file.path(path, "struct.parquet") + genes_parquet <- file.path(path, "gene_count.parquet") + gene_names_parquet <- file.path(path, "gene_names.parquet") + gene_ref_seq_parquet <- file.path(path, "gene_seqs.parquet") + genome_gene_protein_parquet <- file.path(path, "genome_gene_protein.parquet") + struct_parquet <- file.path(path, "struct.parquet") - proteins_parquet <- file.path(path, "protein_count.parquet") - domains_parquet <- file.path(path, "domain_count.parquet") + proteins_parquet <- file.path(path, "protein_count.parquet") + domains_parquet <- file.path(path, "domain_count.parquet") - domain_names_parquet <- file.path(path, "domain_names.parquet") - protein_names_parquet <- file.path(path, "protein_names.parquet") + domain_names_parquet <- file.path(path, "domain_names.parquet") + protein_names_parquet <- file.path(path, "protein_names.parquet") - protein_cluster_seq_parquet <- file.path(path, "protein_seqs.parquet") + protein_cluster_seq_parquet <- file.path(path, "protein_seqs.parquet") writeCompressedParquet <- function(df, path) { arrow::write_parquet( @@ -1405,7 +1444,9 @@ cleanData <- function(duckdb_path, path){ ) } - db_name <- duckdb_path |> stringr::str_split_i(".duckdb", i = 1) |> paste0("_parquet.duckdb") + db_name <- duckdb_path |> + stringr::str_split_i(".duckdb", i = 1) |> + paste0("_parquet.duckdb") con_new <- DBI::dbConnect(duckdb::duckdb(), db_name) on.exit(try(DBI::dbDisconnect(con_new, shutdown = FALSE), silent = TRUE), add = 
TRUE) @@ -1623,7 +1664,7 @@ runDataProcessing <- function(duckdb_path, cdhit_identity = 0.9, cdhit_word_length = 5, cdhit_memory = 0, - cdhit_extra_args = c("-g","1"), + cdhit_extra_args = c("-g", "1"), cdhit_output_prefix = "cdhit_out", # InterPro ipr_appl = c("Pfam"), @@ -1709,4 +1750,3 @@ runDataProcessing <- function(duckdb_path, parquet_duckdb_path = normalizePath(parquet_duckdb_path) )) } - diff --git a/vignettes/intro.Rmd b/vignettes/intro.Rmd index 59ccbc5..9d9a36f 100644 --- a/vignettes/intro.Rmd +++ b/vignettes/intro.Rmd @@ -31,13 +31,15 @@ For metadata curation with `amRdata`, use `retrieveMetadata()` to create a DuckD ```{r} # Download all AMR data for a species from BV-BRC -retrieveMetadata(user_bacs = "Shigella flexneri", - filter_type = "AMR", - base_dir = "../data/", - abx = "All", - overwrite = FALSE, - image = "danylmb/bvbrc:5.3", - verbose = FALSE) +retrieveMetadata( + user_bacs = "Shigella flexneri", + filter_type = "AMR", + base_dir = "../data/", + abx = "All", + overwrite = FALSE, + image = "danylmb/bvbrc:5.3", + verbose = FALSE +) ``` This wrote tables 'amr_phenotype', 'genome_data', and 'metadata' to a DuckDB @@ -45,16 +47,18 @@ This wrote tables 'amr_phenotype', 'genome_data', and 'metadata' to a DuckDB For downloading genomes with paired AMR phenotype data, use `retrieveGenomes()` ```{r } -retrieveGenomes(base_dir = "../data/", - user_bacs = "Shigella flexneri", - method = c("cli"), - image = "danylmb/bvbrc:5.3", - skip_existing = TRUE, - ftp_workers = 8L, - cli_fasta_workers = 4L, - cli_gff_workers = 4L, - chunk_size = 50L, - verbose = TRUE) +retrieveGenomes( + base_dir = "../data/", + user_bacs = "Shigella flexneri", + method = c("cli"), + image = "danylmb/bvbrc:5.3", + skip_existing = TRUE, + ftp_workers = 8L, + cli_fasta_workers = 4L, + cli_gff_workers = 4L, + chunk_size = 50L, + verbose = TRUE +) ``` This returns character vector of genome IDs and wrote complete file sets on disk. 
@@ -62,10 +66,11 @@ This returns character vector of genome IDs and wrote complete file sets on disk To write a .txt file listing downloaded genome filepaths (.fna, .faa, .gff) ```{r } -genomeList(base_dir = "../data/", - user_bacs = "Shigella flexneri", - verbose = TRUE) - +genomeList( + base_dir = "../data/", + user_bacs = "Shigella flexneri", + verbose = TRUE +) ``` #### A wrapper for downloading genomes and listing the paths @@ -77,11 +82,13 @@ Internally runs: Allows users to input a species or taxon ID and automate all data downloading and curation steps. ```{r } -prepareGenomes(user_bacs = "Shigella flexneri", - base_dir = "../data/", - method = c("cli"), - overwrite = FALSE, - verbose = TRUE) +prepareGenomes( + user_bacs = "Shigella flexneri", + base_dir = "../data/", + method = c("cli"), + overwrite = FALSE, + verbose = TRUE +) ``` ## Data Processing @@ -95,32 +102,34 @@ Internally runs: 4. `cleanData()` -\> Clean metadata and export the feature tables to Parquet + Parquet-backed DuckDB ```{r} -runDataProcessing(duckdb_path = "../data/Shigella_flexneri/Sfl.duckdb", - output_path = NULL, - # unified threads for all tools - threads = 16, - # Panaroo - panaroo_split_jobs = FALSE, - panaroo_core_threshold = 0.90, - panaroo_len_dif_percent = 0.95, - panaroo_cluster_threshold = 0.95, - panaroo_family_seq_identity = 0.5, - # CD-HIT - cdhit_identity = 0.9, - cdhit_word_length = 5, - cdhit_memory = 0, - cdhit_extra_args = c("-g","1"), - cdhit_output_prefix = "cdhit_out", - # InterPro - ipr_appl = c("Pfam"), - ipr_threads_unused = NULL, - ipr_version = "5.76-107.0", - ipr_dest_dir = "inst/extdata/interpro", - ipr_platform = "linux/amd64", - auto_prepare_data = TRUE, - # Metadata cleaning - ref_file_path = "../data_raw/", - verbose = TRUE) +runDataProcessing( + duckdb_path = "../data/Shigella_flexneri/Sfl.duckdb", + output_path = NULL, + # unified threads for all tools + threads = 16, + # Panaroo + panaroo_split_jobs = FALSE, + panaroo_core_threshold = 0.90, + 
panaroo_len_dif_percent = 0.95, + panaroo_cluster_threshold = 0.95, + panaroo_family_seq_identity = 0.5, + # CD-HIT + cdhit_identity = 0.9, + cdhit_word_length = 5, + cdhit_memory = 0, + cdhit_extra_args = c("-g", "1"), + cdhit_output_prefix = "cdhit_out", + # InterPro + ipr_appl = c("Pfam"), + ipr_threads_unused = NULL, + ipr_version = "5.76-107.0", + ipr_dest_dir = "inst/extdata/interpro", + ipr_platform = "linux/amd64", + auto_prepare_data = TRUE, + # Metadata cleaning + ref_file_path = "../data_raw/", + verbose = TRUE +) ``` ## Plots @@ -128,8 +137,10 @@ runDataProcessing(duckdb_path = "../data/Shigella_flexneri/Sfl.duckdb", Simple stats and plots to explore metadata ```{r } -generateSummary("data/metadata_parquet", - out_path = "data/") -generatePlots("data/metadata_parquet", - out_path = "data/") +generateSummary("data/metadata_parquet", + out_path = "data/" +) +generatePlots("data/metadata_parquet", + out_path = "data/" +) ``` From 4edc964d748d09cb7cbf911fd2200db44865c230 Mon Sep 17 00:00:00 2001 From: Evan Pierce Brenner <108823789+epbrenner@users.noreply.github.com> Date: Wed, 25 Feb 2026 15:15:23 -0700 Subject: [PATCH 3/4] Updating download logic Fixed trailing zero bug, fixed FTP timeout bug (?), fixed empty files hanging downloads, fixed imbalanced genome data sets (e.g., no .fna, yes .faa, yes .gff) --- R/data_curation.R | 816 +++++++++++++++++++++++++++++----------------- 1 file changed, 511 insertions(+), 305 deletions(-) diff --git a/R/data_curation.R b/R/data_curation.R index 0d19113..ac5095b 100644 --- a/R/data_curation.R +++ b/R/data_curation.R @@ -1,3 +1,166 @@ +#' Helps ensure trailing 0s are retained in genome IDs for proper downloading +#' @keywords internal +.id_checker <- function(x) { + # Taxon IDs are just numbers, genome IDs have decimals, this tells them apart + grepl("^[0-9]+$", x) +} + +#' Helps tag genomes with their AMR evidence for parsing +#' @keywords internal +.create_amr_tagged_view <- function(con) { + lab_methods <- c("Disk 
diffusion", "MIC", "Broth dilution", "Agar dilution") + lab_list_sql <- paste(DBI::dbQuoteString(con, lab_methods), collapse = ", ") + comp_str_sql <- DBI::dbQuoteString(con, "Computational Method") + + DBI::dbExecute( + con, + sprintf( + "CREATE OR REPLACE VIEW amr_phenotype_tagged AS + SELECT + *, + CASE WHEN \"genome_drug.laboratory_typing_method\" IN (%s) THEN 1 ELSE 0 END AS is_lab_row, + CASE WHEN + (\"genome_drug.evidence\" = %s) + OR (COALESCE(\"genome_drug.computational_method\", '') <> '') + OR (\"genome_drug.laboratory_typing_method\" = 'Computational Prediction') + THEN 1 ELSE 0 END AS is_comp_row + FROM amr_phenotype", + lab_list_sql, comp_str_sql + ) + ) +} + +#' Helps appropriately interface with BV-BRC FTPS server, and avoids getting stuck +#' when malformed files can hang an FTPS connection by introducing safeguards +#' @keywords internal +.ftpes_download_one <- function(genomeID, out_dir, + connect_timeout = 10L, + max_time = 30L, + speed_time = 30L, # end if speed_time + speed_limit = 2048L, # B/s + min_bytes = 100L) { + + dir.create(out_dir, recursive = TRUE, showWarnings = FALSE) + + exts <- c(".fna", ".PATRIC.faa", ".PATRIC.gff") + for (ext in exts) { + dest <- file.path(out_dir, paste0(genomeID, ext)) + dest_tmp <- paste0(dest, ".tmp") + + # Skip if we already completed these + if (file.exists(dest) && file.info(dest)$size > min_bytes) next + if (file.exists(dest) && file.info(dest)$size == 0) try(unlink(dest), silent = TRUE) + if (file.exists(dest_tmp)) try(unlink(dest_tmp), silent = TRUE) + + url <- sprintf("ftp://ftp.bv-brc.org/genomes/%s/%s%s", genomeID, genomeID, ext) + + args <- c( + "--fail", "--silent", "--show-error", "-L", + "--connect-timeout", as.character(connect_timeout), + "--max-time", as.character(max_time), + "--speed-time", as.character(speed_time), + "--speed-limit", as.character(speed_limit), + "--ftp-ssl", # AUTH TLS on port 21 works in testing + "--ftp-pasv", "--disable-epsv", "--ipv4", + "--user", "anonymous:", + "-o", 
shQuote(dest_tmp), shQuote(url) + ) + + res <- suppressWarnings(system2("curl", args = args, stdout = TRUE, stderr = TRUE)) + status <- attr(res, "status"); if (is.null(status)) status <- 0L + + # Avoiding 0B files cluttering up results -- atomic rename on successful tmp DL + ok <- (status == 0L && file.exists(dest_tmp) && file.info(dest_tmp)$size > min_bytes) + if (ok) { + if (!file.rename(dest_tmp, dest)) { + # If rename fails for some reason, copy + unlink + file.copy(dest_tmp, dest, overwrite = TRUE) + unlink(dest_tmp) + } + } else { + try(unlink(dest_tmp), silent = TRUE) + return(FALSE) # Abort early, don't accept partial set + } + } + + # Do we have all 3 present? + .is_complete_set(out_dir, genomeID, min_bytes = min_bytes) +} + +#' Helps manage FTPS downloading from BV-BRC, trying a quick download first, and +#' if that fails, trying a longer timeout 2nd pass at the end in case it was a +#' hiccup. If 2nd pass fails, log and give up on that file. +#' @keywords internal +.ftpes_download_two_pass <- function(genome_ids, out_dir, + workers_first = 4L, + workers_second = 4L, + log_file = NULL) { + genome_ids <- unique(as.character(genome_ids)) + if (!length(genome_ids)) return(character(0)) + + if (!is.null(log_file)) { + dir.create(dirname(log_file), recursive = TRUE, showWarnings = FALSE) + cat(sprintf("[%s] FTPS run start: %d genomes\n", Sys.time(), length(genome_ids)), + file = log_file, append = TRUE) + } + + # Pass 1: 30s per-file cap + message("FTPS pass 1 (30s timeout)") + future::plan(future::multisession, workers = max(1, workers_first)) + res1 <- future.apply::future_lapply( + genome_ids, + function(gid) { + ok <- .ftpes_download_one(gid, out_dir, + connect_timeout = 10L, max_time = 30L, + speed_time = 30L, speed_limit = 2048L) + list(gid = gid, ok = ok) + }, + future.seed = TRUE + ) + ok1 <- vapply(res1, `[[`, logical(1), "ok") + ok_ids_1 <- genome_ids[ok1] + fail_ids <- genome_ids[!ok1] + message(sprintf("Pass 1: ok=%d, fail=%d", length(ok_ids_1), 
length(fail_ids))) + if (!is.null(log_file)) { + cat(sprintf("[%s] Pass1 ok=%d fail=%d\n", Sys.time(), length(ok_ids_1), length(fail_ids)), + file = log_file, append = TRUE) + } + + if (!length(fail_ids)) { + if (!is.null(log_file)) cat(sprintf("[%s] FTPS run end: all OK\n", Sys.time()), + file = log_file, append = TRUE) + return(ok_ids_1) + } + + # Pass 2: 60s per-file cap where we retry any failures + message("FTPS pass 2 (60s timeout) for failed genomes") + future::plan(future::multisession, workers = max(1, workers_second)) + res2 <- future.apply::future_lapply( + fail_ids, + function(gid) { + ok <- .ftpes_download_one(gid, out_dir, + connect_timeout = 10L, max_time = 60L, + speed_time = 30L, speed_limit = 2048L) + list(gid = gid, ok = ok) + }, + future.seed = TRUE + ) + ok2 <- vapply(res2, `[[`, logical(1), "ok") + ok_ids_2 <- fail_ids[ok2] + still_fail <- setdiff(fail_ids, ok_ids_2) + message(sprintf("Pass 2: ok=%d, still_fail=%d", length(ok_ids_2), length(still_fail))) + if (!is.null(log_file)) { + cat(sprintf("[%s] Pass2 ok=%d still_fail=%d\n", Sys.time(), length(ok_ids_2), length(still_fail)), + file = log_file, append = TRUE) + if (length(still_fail)) { + cat("Fail IDs (excluded): ", paste(head(still_fail, 50), collapse = ", "), "\n", + file = log_file, append = TRUE) + } + } + + unique(c(ok_ids_1, ok_ids_2)) +} + #' Fetch BV-BRC bacterial genome data #' #' Run the BV-BRC CLI (`p3-all-genomes`) inside a Docker image and return results as a tibble. @@ -26,7 +189,8 @@ "--in superkingdom,Bacteria ", "--eq genome_quality,Good ", "--in genome_status,WGS,Complete ", - "--attr genome_id,genome_name,taxon_id,species,strain,_version_,", + "--attr genome_id,genome_name,genome_quality,genome_status,", + "taxon_id,species,strain,_version_,", "collection_year,state_province,latitude,longitude,", "antimicrobial_resistance_evidence,assembly_accession,isolation_country,", "isolation_source,disease,host_common_name" @@ -44,10 +208,10 @@ stop("BV-BRC returned no data. 
The service may be unavailable.") } - # Parse + # Parse and coerce to character to avoid numeric interpretation losing trailing 0s df <- utils::read.table( text = raw_data, sep = "\t", header = TRUE, fill = TRUE, - quote = "", check.names = FALSE, comment.char = "" + quote = "", check.names = FALSE, comment.char = "", colClasses = "character" ) df <- tibble::as_tibble(df) |> dplyr::mutate(dplyr::across(dplyr::everything(), ~ iconv(.x, from = "", to = "UTF-8", sub = ""))) @@ -230,7 +394,7 @@ ) for (user_bac in user_bacs) { - if (suppressWarnings(!is.na(as.numeric(user_bac)))) { + if (.id_checker(user_bac)) { message("Numeric input detected: ", user_bac) if (user_bac %in% bvbrc_bacs$genome.taxon_id) { bac_name <- bvbrc_bacs$genome.species[bvbrc_bacs$genome.taxon_id == user_bac] @@ -294,10 +458,11 @@ return(cand) } if (query_type == "taxon_id") { - nums <- suppressWarnings(!is.na(as.numeric(user_bacs))) + nums <- .id_checker(user_bacs) if (!any(nums)) stop("Cannot infer taxon_id from user_bacs. 
Provide query_value.") return(as.character(user_bacs[which(nums)[1]])) } + stop("Unsupported query_type: ", query_type) } @@ -318,12 +483,12 @@ #' @examples #' .generateDBname(c("90371", "Bacillus subtilis")) #' .generateDBname(c("12345", "Escherichia coli", "Lactobacillus")) -#' +#' @keywords internal .generateDBname <- function(user_bacs) { db_parts <- c() for (user_bac in user_bacs) { - if (suppressWarnings(!is.na(as.numeric(user_bac)))) { + if (.id_checker(user_bac)) { db_parts <- c(db_parts, paste0("tid_", user_bac)) } else { parts <- stringr::str_split(user_bac, " ")[[1]] @@ -437,7 +602,7 @@ data_result <- tibble::as_tibble( utils::read.table( text = data_raw, sep = "\t", header = TRUE, fill = TRUE, - quote = "", check.names = FALSE, comment.char = "" + quote = "", check.names = FALSE, comment.char = "", colClasses = "character" ) ) |> dplyr::mutate( @@ -462,17 +627,11 @@ #' Retrieve genome IDs for each taxon via BV-BRC and DuckDB #' -#' Resolves user-provided taxa to taxon IDs, queries BV-BRC per unique taxon ID, -#' and returns distinct genome IDs. Uses a per-selection DuckDB located under: -#' /data//.duckdb -#' BV-BRC column names are preserved. -#' -#' @param base_dir Character. Project root directory. Default ".". -#' @param user_bacs Character vector. Mixed inputs of taxon IDs and/or species names. -#' @param overwrite Logical. If FALSE and the DuckDB already exists for this selection, abort. Default: FALSE. -#' @param verbose Logical. +#' Resolves user-provided taxa to taxon IDs using the local BV-BRC cache, then +#' returns distinct genome IDs (Good + WGS/Complete). Also writes a 'bac_data' +#' table to the per-selection DuckDB with a cache snapshot of core fields. #' -#' @return A numeric vector of distinct `genome.genome_id`, or NULL if none found. +#' @return A character vector of distinct `genome.genome_id`, or NULL if none found. 
.retrieveQueryIDs <- function(base_dir = ".", user_bacs, overwrite = FALSE, @@ -481,60 +640,68 @@ if (isTRUE(verbose)) message("Resolving input taxa.") bac_input_data <- .retrieveCustomQuery(base_dir = base_dir, user_bacs = user_bacs) - if (is.null(bac_input_data) || nrow(bac_input_data) == 0) { message("No valid input provided or no matches found.") return(NULL) } - # Resolve per-bug DB path - paths <- .buildDBpath(base_dir = base_dir, user_bacs = user_bacs, overwrite = overwrite) + # Per-selection DB path + paths <- .buildDBpath(base_dir = base_dir, user_bacs = user_bacs, overwrite = overwrite) db_path <- paths$db_path - bac_data <- tibble::tibble() - taxon_ids <- unique(bac_input_data$genome.taxon_id) - - if (isTRUE(verbose)) message("Querying BV-BRC for ", length(taxon_ids), " taxon IDs.") - - for (i in seq_along(taxon_ids)) { - tax <- taxon_ids[i] - if (isTRUE(verbose)) message("Taxon ", i, "/", length(taxon_ids), ": ", tax) - - res <- .getGenomeIDs( - base_dir = base_dir, - query_type = "taxon_id", - query_value = as.character(tax), - user_bacs = user_bacs, - overwrite = TRUE, - verbose = verbose - ) - - con <- res$duckdbConnection - tbl <- res$table_name - each_bac_data <- tibble::as_tibble(DBI::dbReadTable(con, tbl)) - bac_data <- dplyr::bind_rows(bac_data, each_bac_data) - - # Close per-iteration connection to avoid open handles piling up - try(DBI::dbDisconnect(con, shutdown = TRUE), silent = TRUE) + # Got a cache? Use that, it's fast + cache_db <- file.path(base_dir, "data", "bvbrc", "bvbrcData.duckdb") + if (!file.exists(cache_db)) { + stop("BV-BRC cache not found at: ", cache_db, ". 
Run .updateBVBRCdata() first.") } + con_cache <- DBI::dbConnect(duckdb::duckdb(), dbdir = cache_db, read_only = TRUE) + on.exit(try(DBI::dbDisconnect(con_cache, shutdown = TRUE), silent = TRUE), add = TRUE) - if (nrow(bac_data) > 0) { - genome_ids <- bac_data |> - dplyr::distinct(`genome.genome_id`) |> - dplyr::pull(`genome.genome_id`) - genome_ids <- genome_ids[!is.na(genome_ids)] + taxon_ids <- unique(bac_input_data$genome.taxon_id) + if (isTRUE(verbose)) message("Querying cache for ", length(taxon_ids), " taxon IDs.") + + taxon_list_sql <- paste(DBI::dbQuoteString(con_cache, taxon_ids), collapse = ", ") + sql_ids <- sprintf( + "SELECT DISTINCT + \"genome.genome_id\" AS gid, + \"genome.genome_name\" AS genome_name, + \"genome.taxon_id\" AS taxon_id, + \"genome.species\" AS species, + \"genome.strain\" AS strain + FROM bvbrc_bac_data + WHERE \"genome.taxon_id\" IN (%s) + AND \"genome.genome_quality\" = 'Good' + AND \"genome.genome_status\" IN ('WGS','Complete')", + taxon_list_sql + ) + cache_rows <- DBI::dbGetQuery(con_cache, sql_ids) + valid_id_re <- "^[0-9]+\\.[0-9]+$" + cache_rows <- cache_rows[grepl(valid_id_re, cache_rows$gid), , drop = FALSE] - if (length(genome_ids) > 0) { - if (isTRUE(verbose)) message("Collected ", length(genome_ids), " distinct genome IDs.") - return(genome_ids) - } else { - message("No valid genome IDs found.") - return(NULL) - } - } else { - message("No valid genome data found for any input.") + if (nrow(cache_rows) == 0L) { + message("No valid genome IDs found.") return(NULL) } + + genome_ids <- unique(cache_rows$gid) + if (isTRUE(verbose)) message("Collected ", length(genome_ids), " distinct genome IDs (cache).") + + # Write 'bac_data' from cache + con <- DBI::dbConnect(duckdb::duckdb(), dbdir = db_path) + on.exit(try(DBI::dbDisconnect(con, shutdown = TRUE), silent = TRUE), add = TRUE) + + bac_data_tbl <- data.frame( + `genome.genome_id` = cache_rows$gid, + `genome.genome_name` = cache_rows$genome_name, + `genome.taxon_id` = 
cache_rows$taxon_id, + `genome.species` = cache_rows$species, + `genome.strain` = cache_rows$strain, + stringsAsFactors = FALSE + ) + DBI::dbWriteTable(con, "bac_data", bac_data_tbl, overwrite = TRUE) + if (isTRUE(verbose)) message("Wrote table 'bac_data' to: ", db_path) + + genome_ids } #' Extract AMR Data Table @@ -685,7 +852,7 @@ writeLines(genome_ids_output, con = tmp_in) tmp_in_mounted <- file.path("/data", "tmp", basename(tmp_in)) - # Choose attributes (AMR for this pipeline) + # Choose attributes (AMR for this workflow) chosen_fields <- if (identical(filter_type, "AMR")) amr_fields else microtrait_fields get_args <- c( @@ -707,7 +874,6 @@ return(genome_data) } - #' Retrieve AMR or Microtrait metadata from BV-BRC and store in DuckDB #' #' Queries BV-BRC for AMR or microtrait metadata for genomes corresponding to user inputs. @@ -716,7 +882,7 @@ #' Tables written: #' - amr_phenotype #' - genome_data -#' - metadata (inner join on genome ID field names as returned by BV-BRC) +#' - metadata (join on genome IDs returned by BV-BRC) #' #' @param user_bacs Character vector. Mixed taxon IDs and/or species strings (used for naming). #' @param filter_type Character. "AMR" or "microTraits". Default "AMR". 
@@ -748,7 +914,7 @@ retrieveMetadata <- function(user_bacs, return(NULL) } - # Fields (unchanged BV-BRC names) + # Desired fields from BV-BRC drug_fields <- paste0( "antibiotic,computational_method,", "evidence,genome_name,id,", @@ -798,7 +964,7 @@ retrieveMetadata <- function(user_bacs, "trna" ) - # Batching + # Batching downloads and parallel implementation batch_size <- 500L genome_ids <- as.character(genome_ids) genome_batches <- split(genome_ids, ceiling(seq_along(genome_ids) / batch_size)) @@ -818,10 +984,10 @@ retrieveMetadata <- function(user_bacs, envir = environment() ) parallel::clusterEvalQ(cluster, { - library(tibble) - library(dplyr) + library(tibble); library(dplyr) }) + # Pull AMR metadata if (isTRUE(verbose)) message("Retrieving AMR phenotype data in batches.") batch_drug_data <- parallel::parLapply(cluster, genome_batches, function(batch) { .extractAMRtable( @@ -838,17 +1004,15 @@ retrieveMetadata <- function(user_bacs, message("No drug data returned.") return(NULL) } - - combined_drug_data_tbl <- tibble::as_tibble( - utils::read.table( - text = combined_drug_data, - sep = "\t", header = TRUE, fill = TRUE, - quote = "", check.names = FALSE, comment.char = "" - ) - ) |> + combined_drug_data_tbl <- tibble::as_tibble(utils::read.table( + text = combined_drug_data, + sep = "\t", header = TRUE, fill = TRUE, + quote = "", check.names = FALSE, comment.char = "", colClasses = "character" + )) |> dplyr::mutate(dplyr::across(dplyr::everything(), ~ iconv(.x, from = "", to = "UTF-8", sub = ""))) |> dplyr::mutate(`genome_drug.genome_id` = as.character(`genome_drug.genome_id`)) + # Pull genome metadata if (isTRUE(verbose)) message("Retrieving genome metadata in batches.") batch_genome_data <- parallel::parLapply(cluster, genome_batches, function(batch) { .extractGenomeData( @@ -866,49 +1030,81 @@ retrieveMetadata <- function(user_bacs, message("No genome data returned.") return(NULL) } - - combined_genome_data_tbl <- tibble::as_tibble( - utils::read.table( - 
text = combined_genome_data, - sep = "\t", header = TRUE, fill = TRUE, - quote = "", check.names = FALSE, comment.char = "" - ) - ) |> + combined_genome_data_tbl <- tibble::as_tibble(utils::read.table( + text = combined_genome_data, + sep = "\t", header = TRUE, fill = TRUE, + quote = "", check.names = FALSE, comment.char = "", colClasses = "character" + )) |> dplyr::mutate(dplyr::across(dplyr::everything(), ~ iconv(.x, from = "", to = "UTF-8", sub = ""))) |> dplyr::mutate(`genome.genome_id` = as.character(`genome.genome_id`)) - # Per-bug DB path - paths <- .buildDBpath(base_dir = base_dir, user_bacs = user_bacs, overwrite = overwrite) + # Write & join + paths <- .buildDBpath(base_dir = base_dir, user_bacs = user_bacs, overwrite = overwrite) db_path <- paths$db_path - logs_dir <- file.path(base_dir, "data", "logs") dir.create(logs_dir, recursive = TRUE, showWarnings = FALSE) cat(sprintf("[%s] Writing metadata DuckDB: %s\n", Sys.time(), db_path), - file = file.path(logs_dir, "bvbrc.log"), append = TRUE - ) + file = file.path(logs_dir, "bvbrc.log"), append = TRUE) con <- DBI::dbConnect(duckdb::duckdb(), dbdir = db_path) + on.exit(try(DBI::dbDisconnect(con, shutdown = TRUE), silent = TRUE), add = TRUE) + DBI::dbWriteTable(con, "amr_phenotype", combined_drug_data_tbl, overwrite = TRUE) - DBI::dbWriteTable(con, "genome_data", combined_genome_data_tbl, overwrite = TRUE) + DBI::dbWriteTable(con, "genome_data", combined_genome_data_tbl, overwrite = TRUE) if (isTRUE(verbose)) message("Joining AMR phenotype and genome metadata.") - initial_metadata <- tibble::as_tibble(DBI::dbGetQuery( + DBI::dbExecute(con, ' + CREATE OR REPLACE TABLE metadata AS + SELECT * + FROM amr_phenotype + INNER JOIN genome_data + ON amr_phenotype."genome_drug.genome_id" = genome_data."genome.genome_id" + ') + + # Debug summary after writes + n_targets <- length(genome_ids) + n_amr_ids <- DBI::dbGetQuery(con, 'SELECT COUNT(DISTINCT "genome_drug.genome_id") AS n FROM amr_phenotype')$n + n_gmeta_ids <- 
DBI::dbGetQuery(con, 'SELECT COUNT(DISTINCT "genome.genome_id") AS n FROM genome_data')$n + if (isTRUE(verbose)) { + message("Initial summary: targets=", n_targets, + " | AMR genomes=", n_amr_ids, + " | genome_data genomes=", n_gmeta_ids) + } + + # Tagged view + .create_amr_tagged_view(con) + + # Final debug summary + n_bac <- if ("bac_data" %in% DBI::dbListTables(con)) + DBI::dbGetQuery(con, 'SELECT COUNT(DISTINCT "genome.genome_id") AS n FROM bac_data')$n else NA_integer_ + n_amr_ids <- DBI::dbGetQuery(con, 'SELECT COUNT(DISTINCT "genome_drug.genome_id") AS n FROM amr_phenotype')$n + n_gmeta_ids <- DBI::dbGetQuery(con, 'SELECT COUNT(DISTINCT "genome.genome_id") AS n FROM genome_data')$n + + ids_zero_amr <- DBI::dbGetQuery( con, - 'SELECT * - FROM amr_phenotype - INNER JOIN genome_data - ON amr_phenotype."genome_drug.genome_id" = genome_data."genome.genome_id"' - )) - DBI::dbWriteTable(con, "metadata", initial_metadata, overwrite = TRUE) + 'WITH sel AS (SELECT DISTINCT "genome.genome_id" AS gid FROM genome_data), + got AS (SELECT DISTINCT "genome_drug.genome_id" AS gid FROM amr_phenotype) + SELECT sel.gid + FROM sel LEFT JOIN got USING (gid) + WHERE got.gid IS NULL + ORDER BY sel.gid' + )$gid if (isTRUE(verbose)) { - message("Wrote tables 'amr_phenotype', 'genome_data', and 'metadata' to: ", db_path) + message("Final summary:") + message(" targets : ", length(genome_ids)) + message(" bac_data : ", n_bac) + message(" genome_data : ", n_gmeta_ids) + message(" amr_phenotype: ", n_amr_ids) + message(" genomes with 0 AMR rows: ", length(ids_zero_amr)) + if (length(ids_zero_amr)) { + message(" e.g.: ", paste(utils::head(ids_zero_amr, 10), collapse = ", ")) + } } list(duckdbConnection = con, table_name = "metadata") } - # FASTA sanitizer to ensure Panaroo compatibility with BV-BRC CLI downloads .strip_fasta_preamble <- function(fna_path) { if (!file.exists(fna_path)) { @@ -966,37 +1162,70 @@ retrieveMetadata <- function(user_bacs, #' derive genome IDs from user_bacs 
(taxon IDs or species substring), and #' write a minimal "filtered" table (without AMR evidence filtering). #' +#' @param evidence_mode Character. Either "lab_only" (default), "lab_or_comp" (all), +#' "comp_only" (BV-BRC-predicted without lab labels), or "any" (no AMR data required). +#' @return A list with a DuckDB connection and table_name = "filtered" +#' Filter genomes by AMR phenotype and metadata, and store results in DuckDB +#' +#' Preferred path: use per-selection DB "metadata" table (from retrieveMetadata()) and +#' apply evidence & genome_quality filters. +#' +#' Fallback path: if "metadata" is missing and fallback_to_bvbrc_cache = TRUE, +#' read BV-BRC cache at /data/bvbrc/bvbrcData.duckdb ("bvbrc_bac_data"), +#' derive genome IDs from user_bacs (taxon IDs or species substring), and +#' write a minimal "filtered" table (without AMR evidence filtering). +#' +#' @param evidence_mode Character. One of: +#' "lab_only" (default) -> only laboratory evidence +#' "lab_or_comp" -> laboratory OR computational evidence +#' "comp_only" -> only computational evidence +#' "any" -> no AMR required (from genome_data; Good only) #' @return A list with a DuckDB connection and table_name = "filtered" .filterGenomes <- function(user_bacs, base_dir = ".", + evidence_mode = c("lab_only","lab_or_comp","comp_only","any"), verbose = TRUE, fallback_to_bvbrc_cache = TRUE) { + evidence_mode <- match.arg(evidence_mode) base_dir <- normalizePath(base_dir, mustWork = FALSE) - paths <- .buildDBpath(base_dir = base_dir, user_bacs = user_bacs) - db_path <- paths$db_path + paths <- .buildDBpath(base_dir = base_dir, user_bacs = user_bacs) + db_path <- paths$db_path con <- DBI::dbConnect(duckdb::duckdb(), dbdir = db_path) + on.exit({ NULL }, add = TRUE) # keep open for caller - on.exit( - { - # Keep connection open for caller - NULL - }, - add = TRUE - ) - - # Good metadata present? 
Apply AMR filters + # The convenient "Metadata Exists" path if ("metadata" %in% DBI::dbListTables(con)) { if (isTRUE(verbose)) message("Loading metadata for filtering.") - initial_metadata <- DBI::dbReadTable(con, "metadata") - if (is.null(initial_metadata) || nrow(initial_metadata) == 0) { + + # If "any", AMR data not needed, just fetch qualifying genome_data + if (evidence_mode == "any") { + gd <- DBI::dbReadTable(con, "genome_data") + if (is.null(gd) || nrow(gd) == 0) { + DBI::dbDisconnect(con, shutdown = TRUE) + stop("No data available in 'genome_data'.") + } + gd <- tibble::as_tibble(gd) |> + dplyr::filter(`genome.genome_quality` == "Good") |> + dplyr::distinct(`genome.genome_id`) + # Keep BV-BRC column name + DBI::dbWriteTable(con, "filtered", gd, overwrite = TRUE) + if (isTRUE(verbose)) { + message("Post-filter distinct genomes (any): ", nrow(gd)) + message("Wrote table 'filtered' to: ", db_path) + } + return(list(duckdbConnection = con, table_name = "filtered")) + } + + # Otherwise, open up the metadata and start interrogating the AMR data + md <- DBI::dbReadTable(con, "metadata") + if (is.null(md) || nrow(md) == 0) { DBI::dbDisconnect(con, shutdown = TRUE) message("No data available in 'metadata'.") return(NULL) } - # Normalize evidence labels - initial_metadata <- tibble::as_tibble(initial_metadata) |> + md <- tibble::as_tibble(md) |> dplyr::mutate( `genome_drug.evidence` = dplyr::case_when( `genome_drug.laboratory_typing_method` %in% @@ -1006,29 +1235,39 @@ retrieveMetadata <- function(user_bacs, ) ) - # AMR and quality filtering - filtered_metadata <- initial_metadata |> - dplyr::filter(`genome_drug.evidence` == "Laboratory Method") |> + if (evidence_mode == "lab_only") { + md <- dplyr::filter(md, `genome_drug.evidence` == "Laboratory Method") + } else if (evidence_mode == "comp_only") { + md <- dplyr::filter( + md, + `genome_drug.evidence` == "Computational Method" | + (!is.na(`genome_drug.computational_method`) & 
nzchar(`genome_drug.computational_method`)) | + (`genome_drug.laboratory_typing_method` == "Computational Prediction") + ) + } else { + # Lab_or_comp: Doesn't matter, keep any predictions + md <- md + } + + md <- md |> dplyr::filter(`genome.genome_quality` == "Good") |> - dplyr::filter(`genome_drug.resistant_phenotype` %in% c("Resistant", "Susceptible", "Intermediate")) + dplyr::filter(`genome_drug.resistant_phenotype` %in% c("Resistant","Susceptible","Intermediate")) |> + dplyr::distinct(`genome.genome_id`) # keep BV-BRC column name - DBI::dbWriteTable(con, "filtered", filtered_metadata, overwrite = TRUE) + DBI::dbWriteTable(con, "filtered", md, overwrite = TRUE) if (isTRUE(verbose)) { - message("Post-filter rows: ", nrow(filtered_metadata)) + message("Post-filter distinct genomes: ", nrow(md)) message("Wrote table 'filtered' to: ", db_path) } return(list(duckdbConnection = con, table_name = "filtered")) } - # Attempt BV-BRC cache location + # No metadata fallback if (!isTRUE(fallback_to_bvbrc_cache)) { DBI::dbDisconnect(con, shutdown = TRUE) stop("No 'metadata' table found in ", db_path, ". Run retrieveMetadata() first.") } - - if (isTRUE(verbose)) { - message("No 'metadata' in per-selection DB. Falling back to BV-BRC cache at data/bvbrc/.") - } + if (isTRUE(verbose)) message("No 'metadata' in per-selection DB. Falling back to BV-BRC cache at data/bvbrc/.") cache_db <- file.path(base_dir, "data", "bvbrc", "bvbrcData.duckdb") if (!file.exists(cache_db)) { @@ -1036,7 +1275,7 @@ retrieveMetadata <- function(user_bacs, stop("BV-BRC cache not found at: ", cache_db, ". 
Run .updateBVBRCdata() first.") } - con_cache <- DBI::dbConnect(duckdb::duckdb(), dbdir = cache_db) + con_cache <- DBI::dbConnect(duckdb::duckdb(), dbdir = cache_db, read_only = TRUE) on.exit(try(DBI::dbDisconnect(con_cache, shutdown = TRUE), silent = TRUE), add = TRUE) if (!"bvbrc_bac_data" %in% DBI::dbListTables(con_cache)) { @@ -1044,32 +1283,22 @@ retrieveMetadata <- function(user_bacs, stop("Table 'bvbrc_bac_data' not found in BV-BRC cache: ", cache_db) } - bv <- tibble::as_tibble(DBI::dbReadTable(con_cache, "bvbrc_bac_data")) - - - # Derive matches from user_bacs (taxon IDs or species) + bv <- tibble::as_tibble(DBI::dbReadTable(con_cache, "bvbrc_bac_data")) sel <- tibble::tibble(`genome.genome_id` = character()) for (v in user_bacs) { - if (suppressWarnings(!is.na(as.numeric(v)))) { - # numeric taxon_id match - matches <- bv[bv$`genome.taxon_id` == v | - bv$`genome.taxon_id` == as.numeric(v), , drop = FALSE] + if (.id_checker(v)) { + v_chr <- as.character(v) + matches <- bv[bv$`genome.taxon_id` == v_chr, , drop = FALSE] } else { - # species substring (case-insensitive) matches <- bv[stringr::str_detect( bv$`genome.species`, stringr::fixed(v, ignore_case = TRUE) ), , drop = FALSE] } - if (nrow(matches)) { - sel <- dplyr::bind_rows( - sel, - tibble::tibble( - `genome.genome_id` = as.character(matches$`genome.genome_id`) - ) - ) + sel <- dplyr::bind_rows(sel, + tibble::tibble(`genome.genome_id` = as.character(matches$`genome.genome_id`))) } } sel <- dplyr::distinct(sel) @@ -1079,37 +1308,14 @@ retrieveMetadata <- function(user_bacs, stop("No genomes matched user_bacs in BV-BRC cache.") } - # Minimal 'filtered' for downstream steps (downloaders & genomeList) - minimal_filtered <- sel - # Ensure expected column name used downstream: - # retrieveGenomes() reads "filtered" and expects `genome.genome_id` to exist. 
- DBI::dbWriteTable(con, "filtered", minimal_filtered, overwrite = TRUE) - - if (isTRUE(verbose)) { - message("Wrote table 'filtered' to: ", db_path) - } + DBI::dbWriteTable(con, "filtered", sel, overwrite = TRUE) + if (isTRUE(verbose)) message("Wrote table 'filtered' to: ", db_path) list(duckdbConnection = con, table_name = "filtered") } - -# BV-BRC downloader block -# Normalize Docker path -.docker_path <- function(p) gsub("\\\\", "/", normalizePath(p, mustWork = FALSE)) - -# Prefer bash -.pick_shell <- function(image) { - chk <- suppressWarnings(system2("docker", - c( - "run", "--rm", image, "sh", "-lc", - "command -v bash >/dev/null || echo NOBASH" - ), - stdout = TRUE, stderr = TRUE - )) - if (length(chk) && any(grepl("NOBASH", chk))) "sh" else "bash" -} - -# Check if a complete set (.fna + .PATRIC.faa + .PATRIC.gff) exists after DL +#' Helps check if a complete set exists after DL (.fna + .PATRIC.faa + .PATRIC.gff) +#' @keywords internal .is_complete_set <- function(dir, genomeID, min_bytes = 100) { fna <- file.path(dir, paste0(genomeID, ".fna")) faa <- file.path(dir, paste0(genomeID, ".PATRIC.faa")) @@ -1119,17 +1325,14 @@ retrieveMetadata <- function(user_bacs, all(vapply(paths, function(x) file.info(x)$size, numeric(1)) > min_bytes) } -# List genomes already completed +#' Helps collate completed genomes into a set +#' @keywords internal .list_complete <- function(dir, genome_ids, min_bytes = 100) { genome_ids[vapply(genome_ids, .is_complete_set, logical(1), dir = dir, min_bytes = min_bytes)] } -# Any genomes missing bits or pieces? 
Find em -.missing_any <- function(dir, genome_ids, min_bytes = 100) { - genome_ids[!vapply(genome_ids, .is_complete_set, logical(1), dir = dir, min_bytes = min_bytes)] -} - -# Audit function +#' Helps in auditing downloaded files to ensure everything's complete per ID +#' @keywords internal .audit_gaps <- function(out_dir, ids, min_bytes = 100) { out_dir <- normalizePath(out_dir, mustWork = FALSE) df <- data.frame( @@ -1148,68 +1351,27 @@ retrieveMetadata <- function(user_bacs, df } -# FTPS preferred! HTTPS as a fallback to see if it's just an FTPS issue -.ftp_download_one <- function(genomeID, out_dir, tries = 3L, min_bytes = 100) { - files <- c(".fna", ".PATRIC.faa", ".PATRIC.gff") - dests <- file.path(out_dir, paste0(genomeID, files)) - if (.is_complete_set(out_dir, genomeID, min_bytes)) { - return(TRUE) - } - - get_one <- function(url, dest) { - if (nzchar(Sys.which("wget"))) { - res <- suppressWarnings(system2("wget", - c("-q", "-O", shQuote(dest), shQuote(url)), - stdout = TRUE, stderr = TRUE - )) - st <- attr(res, "status") - if (is.null(st)) st <- 0L - return(st == 0L && file.exists(dest) && file.info(dest)$size > min_bytes) - } else if (nzchar(Sys.which("curl"))) { - curl_args <- if (startsWith(url, "ftps://")) { - c("--ssl-reqd", "-L", "-o", shQuote(dest), shQuote(url)) - } else { - c("-L", "-o", shQuote(dest), shQuote(url)) - } - res <- suppressWarnings(system2("curl", curl_args, stdout = TRUE, stderr = TRUE)) - st <- attr(res, "status") - if (is.null(st)) st <- 0L - return(st == 0L && file.exists(dest) && file.info(dest)$size > min_bytes) - } - FALSE - } +### BV-BRC CLI downloader [slower by comparison, but does not need FTP server] - for (ext_i in seq_along(files)) { - dest <- dests[ext_i] - if (file.exists(dest) && file.info(dest)$size > min_bytes) next - ftps <- sprintf("ftps://ftp.bv-brc.org/genomes/%s/%s%s", genomeID, genomeID, files[ext_i]) - https <- sprintf("https://ftp.bv-brc.org/genomes/%s/%s%s", genomeID, genomeID, files[ext_i]) - - ok <- 
FALSE - for (k in 1:tries) { - if (get_one(ftps, dest)) { - ok <- TRUE - break - } - } - if (!ok) { - for (k in 1:2) { - if (get_one(https, dest)) { - ok <- TRUE - break - } - } - } - if (!ok) { - return(FALSE) - } - } - .is_complete_set(out_dir, genomeID, min_bytes) -} +#' Helps normalize Docker paths +#' @keywords internal +.docker_path <- function(p) gsub("\\\\", "/", normalizePath(p, mustWork = FALSE)) -# BV-BRC CLI downloader [very slow by comparison, but does not need FTP server] +#' Helps run a shell inside a container, and prefers bash (don't we all?) +#' @keywords internal +.pick_shell <- function(image) { + chk <- suppressWarnings(system2("docker", + c( + "run", "--rm", image, "sh", "-lc", + "command -v bash >/dev/null || echo NOBASH" + ), + stdout = TRUE, stderr = TRUE + )) + if (length(chk) && any(grepl("NOBASH", chk))) "sh" else "bash" +} -# p3-dump-genomes to fetch FASTA and .gto files +#' Using p3-dump-genomes in CLI to fetch FASTA and .gto files +#' @keywords internal .cli_dump_fastas_gto_chunk <- function(image, out_dir, genome_ids, tag, tries = 3L) { out_dir <- normalizePath(out_dir, mustWork = FALSE) dir.create(out_dir, recursive = TRUE, showWarnings = FALSE) @@ -1254,7 +1416,8 @@ retrieveMetadata <- function(user_bacs, st == 0L } -# Export GFF from GTO per genomes in each chunk +#' Exports GFF files from the downloaded GTO per genome in each chunk +#' @keywords internal .cli_export_gff_chunk <- function(image, out_dir, genome_ids, tag, tries = 3L) { out_dir <- normalizePath(out_dir, mustWork = FALSE) ids_file <- file.path(out_dir, paste0("ids_", tag, ".txt")) @@ -1344,36 +1507,45 @@ retrieveGenomes <- function(base_dir = ".", method = c("ftp", "cli"), image = "danylmb/bvbrc:5.3", skip_existing = TRUE, - ftp_workers = 8L, + ftp_workers = 4L, cli_fasta_workers = 4L, cli_gff_workers = 4L, chunk_size = 50L, + evidence_mode = c("lab_only","lab_or_comp","comp_only","any"), # NEW verbose = TRUE) { method <- match.arg(method) + evidence_mode <- 
match.arg(evidence_mode) base_dir <- normalizePath(base_dir, mustWork = FALSE) - # IDs from .filterGenomes() - if (isTRUE(verbose)) message("Filtering genomes before download.") - f_out <- .filterGenomes(base_dir = base_dir, user_bacs = user_bacs, verbose = verbose) - if (is.null(f_out)) { - return(character()) + # Use 'filtered' if already prepared, or start filtering + if (isTRUE(verbose)) message("Preparing download set (checking for existing 'filtered').") + paths <- .buildDBpath(base_dir = base_dir, user_bacs = user_bacs) + db_path <- paths$db_path + con0 <- DBI::dbConnect(duckdb::duckdb(), dbdir = db_path) + has_filtered <- "filtered" %in% DBI::dbListTables(con0) + if (has_filtered) { + if (isTRUE(verbose)) message("Using existing 'filtered' table (skipping re-filter).") + con <- con0; tbl <- "filtered" + } else { + if (isTRUE(verbose)) message("No 'filtered' table found; filtering now.") + f_out <- .filterGenomes(base_dir = base_dir, + user_bacs = user_bacs, + evidence_mode = evidence_mode, + verbose = verbose) + con <- f_out$duckdbConnection + tbl <- f_out$table_name } - con <- f_out$duckdbConnection - tbl <- f_out$table_name ids <- tibble::as_tibble(DBI::dbReadTable(con, tbl)) |> dplyr::distinct(`genome.genome_id`) |> dplyr::pull(`genome.genome_id`) - # Set up the paths and such - paths <- .buildDBpath(base_dir = base_dir, user_bacs = user_bacs) - bug_dir <- dirname(paths$db_path) + bug_dir <- dirname(db_path) genome_path <- file.path(bug_dir, "genomes") logs_dir <- file.path(base_dir, "data", "logs") dir.create(genome_path, recursive = TRUE, showWarnings = FALSE) dir.create(logs_dir, recursive = TRUE, showWarnings = FALSE) - # Adding support for resuming a download if (isTRUE(skip_existing)) { already <- .list_complete(genome_path, ids) if (isTRUE(verbose)) message(length(already), " genomes already completed; skipping.") @@ -1391,60 +1563,59 @@ retrieveGenomes <- function(base_dir = ".", return(all_complete) } - # FTP method -- preferred if server is 
available if (identical(method, "ftp")) { - if (isTRUE(verbose)) message("Trying FTPS download. Workers=", ftp_workers) - future::plan(future::multisession, workers = max(1, ftp_workers)) - ft_ok <- future.apply::future_lapply(ids, function(gid) .ftp_download_one(gid, genome_path), - future.seed = TRUE - ) - ok_ids <- ids[unlist(ft_ok)] - if (isTRUE(verbose)) message("Complete file sets for ", length(ok_ids), " genomes (FTP).") - return(c(ok_ids, .list_complete(genome_path, setdiff(ids, ok_ids)))) - } - - # CLI for FASTA, FAA, and GTO, then GFF from GTO - chunks <- split(ids, ceiling(seq_along(ids) / chunk_size)) - - # Parallel chunk containers - if (isTRUE(verbose)) { - message( - "CLI being run in parallel for ", length(chunks), - " data chunks." + if (isTRUE(verbose)) message("Downloading by FTPS.") + ok_ids <- .ftpes_download_two_pass( + genome_ids = ids, + out_dir = genome_path, + workers_first = ftp_workers, + workers_second = ftp_workers, + log_file = file.path(logs_dir, "ftp_download.log") ) + if (isTRUE(verbose)) { + message("Complete file sets for ", length(ok_ids), " genomes total.") + miss <- setdiff(ids, ok_ids) + if (length(miss)) message("Excluded (failed after retry): ", length(miss)) + } + return(ok_ids) } - future::plan(future::multisession, workers = max(1, cli_fasta_workers)) - fa_res <- future.apply::future_mapply( - FUN = function(vec, tag) .cli_dump_fastas_gto_chunk(image, genome_path, vec, tag), - vec = chunks, tag = paste0("fa", seq_along(chunks)), - SIMPLIFY = TRUE, future.seed = TRUE - ) - if (!all(fa_res) && isTRUE(verbose)) warning(sum(!fa_res), " data chunks failed.") - # GFF extraction in parallel containers - if (isTRUE(verbose)) { - message( - "GFF extraction being run in parallel for ", - length(chunks), " data chunks." 
- ) - } - future::plan(future::multisession, workers = max(1, cli_gff_workers)) - g_res <- future.apply::future_mapply( - FUN = function(vec, tag) .cli_export_gff_chunk(image, genome_path, vec, tag), - vec = chunks, tag = paste0("gff", seq_along(chunks)), - SIMPLIFY = TRUE, future.seed = TRUE - ) - if (!all(g_res) && isTRUE(verbose)) warning(sum(!g_res), " GFF chunks had failures.") + if (identical(method, "cli")) { + if (isTRUE(verbose)) { + message( + "CLI mode: downloading genomes in parallelized chunks. ", + "Chunks=", ceiling(length(ids)/chunk_size), + " | fasta_workers=", cli_fasta_workers, + " | gff_workers=", cli_gff_workers + ) + } - # Success set: .fna + .PATRIC.faa + .PATRIC.gff all present per isolate - ok_ids <- ids[vapply(ids, .is_complete_set, logical(1), dir = genome_path)] - if (isTRUE(verbose)) { - message( - "Complete file sets downloaded for ", - length(ok_ids), " genomes." - ) + chunks <- split(ids, ceiling(seq_along(ids) / chunk_size)) + + # FASTA + GTO in parallel containers + future::plan(future::multisession, workers = max(1, cli_fasta_workers)) + invisible(future.apply::future_mapply( + FUN = function(vec, tag) .cli_dump_fastas_gto_chunk(image, genome_path, vec, tag), + vec = chunks, tag = paste0("fa", seq_along(chunks)), + SIMPLIFY = TRUE, future.seed = TRUE + )) + + # GFF export in parallel containers + future::plan(future::multisession, workers = max(1, cli_gff_workers)) + invisible(future.apply::future_mapply( + FUN = function(vec, tag) .cli_export_gff_chunk(image, genome_path, vec, tag), + vec = chunks, tag = paste0("gff", seq_along(chunks)), + SIMPLIFY = TRUE, future.seed = TRUE + )) + + ok_ids <- ids[vapply(ids, .is_complete_set, logical(1), dir = genome_path)] + if (isTRUE(verbose)) { + message("CLI complete file sets for ", length(ok_ids), " genomes.") + miss <- setdiff(ids, ok_ids) + if (length(miss)) message("Excluded (CLI completion failed): ", length(miss)) + } + return(ok_ids) } - ok_ids } #' Build a table of local genome 
file paths and write to DuckDB @@ -1494,7 +1665,7 @@ genomeList <- function(base_dir = ".", faa_path = if (file.exists(faa_path) && file.info(faa_path)$size > 100) faa_path else NA, panaroo_input = if ( file.exists(gff_path) && file.exists(fna_path) && file.exists(faa_path) && - file.info(gff_path)$size > 100 && file.info(fna_path)$size > 100 && file.info(faa_path)$size > 100 + file.info(gff_path)$size > 100 && file.info(fna_path)$size > 100 && file.info(faa_path)$size > 100 ) { paste(gff_path, fna_path) } else { @@ -1546,6 +1717,9 @@ genomeList <- function(base_dir = ".", #' `"ftp"` (default) or `"cli"`. #' @param overwrite Logical. Passed to metadata filtering and DuckDB creation. #' Default FALSE. +#' @param evidence_mode Character. Sets which types of AMR evidence are acceptable. +#' Default `lab_only`. `any` will not require AMR data for downloads. This will +#' return very large download lists for many species! #' @param verbose Logical. Print progress messages. Default TRUE. #' #' @return A list (the output of `genomeList()`), containing: @@ -1557,32 +1731,64 @@ prepareGenomes <- function(user_bacs, base_dir = ".", method = c("ftp", "cli"), overwrite = FALSE, + evidence_mode = c("lab_only","lab_or_comp","comp_only","any"), verbose = TRUE) { method <- match.arg(method) + evidence_mode <- match.arg(evidence_mode) base_dir <- normalizePath(base_dir, mustWork = FALSE) - # Ensure the BV-BRC metadata cache exists, fetch if missing .ensure_bvbrc_cache(base_dir = base_dir, verbose = verbose) - if (isTRUE(verbose)) message("Step 1: Downloading genomes from BV-BRC") + if (isTRUE(verbose)) message("Step 0: Building AMR metadata (retrieveMetadata)") + invisible(retrieveMetadata( + user_bacs = user_bacs, + filter_type = "AMR", + base_dir = base_dir, + abx = "All", + overwrite = overwrite, + verbose = verbose + )) + + if (isTRUE(verbose)) message("Step 1: Filtering genomes for download by evidence: ", evidence_mode) + f_out <- .filterGenomes( + base_dir = base_dir, + 
user_bacs = user_bacs, + evidence_mode = evidence_mode, + verbose = verbose, + fallback_to_bvbrc_cache = FALSE + ) + if (is.null(f_out)) { + message("No genomes available after evidence filtering.") + return(NULL) + } - # Filter + download + # A little summary of what's left after filtering (or not) + paths <- .buildDBpath(base_dir = base_dir, user_bacs = user_bacs) + con <- DBI::dbConnect(duckdb::duckdb(), dbdir = paths$db_path, read_only = TRUE) + n_filtered <- DBI::dbGetQuery(con, 'SELECT COUNT(DISTINCT "genome.genome_id") AS n FROM filtered')$n + n_meta <- if ("genome_data" %in% DBI::dbListTables(con)) DBI::dbGetQuery(con, 'SELECT COUNT(DISTINCT "genome.genome_id") AS n FROM genome_data')$n else NA + n_amr <- if ("amr_phenotype" %in% DBI::dbListTables(con)) DBI::dbGetQuery(con, 'SELECT COUNT(DISTINCT "genome_drug.genome_id") AS n FROM amr_phenotype')$n else NA + DBI::dbDisconnect(con, shutdown = TRUE) + if (isTRUE(verbose)) { + message(sprintf("Evidence filter summary: filtered=%d | genomes with AMR=%s | genomes with genome_data=%s", + n_filtered, ifelse(is.na(n_amr), "NA", n_amr), ifelse(is.na(n_meta), "NA", n_meta))) + } + + if (isTRUE(verbose)) message("Step 2: Downloading genomes from BV-BRC (", method, ")") ids <- retrieveGenomes( base_dir = base_dir, user_bacs = user_bacs, method = method, skip_existing = !overwrite, + evidence_mode = evidence_mode, verbose = verbose ) - if (length(ids) == 0L) { - message("No genomes available after filtering.") + message("No genomes downloaded.") return(NULL) } - if (isTRUE(verbose)) message("Step 2: Formatting data into a database for further processing") - - # List local files into per-selection DB + if (isTRUE(verbose)) message("Step 3: Formatting data into a database for further processing") out <- genomeList( base_dir = base_dir, user_bacs = user_bacs, @@ -1590,7 +1796,7 @@ prepareGenomes <- function(user_bacs, ) if (isTRUE(verbose)) { - message("Done. Files are ready! 
Please continue to downstream processing by `runDataProcessing()`") + message("Done. Files are ready! Continue with downstream processing with runDataProcessing().") } out } From 5287d1bc90488555be025cf7a28f96933a447fab Mon Sep 17 00:00:00 2001 From: epbrenner Date: Wed, 25 Feb 2026 22:17:03 +0000 Subject: [PATCH 4/4] Style code (GHA) --- R/data_curation.R | 182 +++++++++++++++++++++++++++------------------- 1 file changed, 106 insertions(+), 76 deletions(-) diff --git a/R/data_curation.R b/R/data_curation.R index ac5095b..3cad422 100644 --- a/R/data_curation.R +++ b/R/data_curation.R @@ -8,7 +8,7 @@ #' Helps tag genomes with their AMR evidence for parsing #' @keywords internal .create_amr_tagged_view <- function(con) { - lab_methods <- c("Disk diffusion", "MIC", "Broth dilution", "Agar dilution") + lab_methods <- c("Disk diffusion", "MIC", "Broth dilution", "Agar dilution") lab_list_sql <- paste(DBI::dbQuoteString(con, lab_methods), collapse = ", ") comp_str_sql <- DBI::dbQuoteString(con, "Computational Method") @@ -35,16 +35,15 @@ #' @keywords internal .ftpes_download_one <- function(genomeID, out_dir, connect_timeout = 10L, - max_time = 30L, - speed_time = 30L, # end if speed_time - speed_limit = 2048L, # B/s - min_bytes = 100L) { - + max_time = 30L, + speed_time = 30L, # end if speed_time + speed_limit = 2048L, # B/s + min_bytes = 100L) { dir.create(out_dir, recursive = TRUE, showWarnings = FALSE) exts <- c(".fna", ".PATRIC.faa", ".PATRIC.gff") for (ext in exts) { - dest <- file.path(out_dir, paste0(genomeID, ext)) + dest <- file.path(out_dir, paste0(genomeID, ext)) dest_tmp <- paste0(dest, ".tmp") # Skip if we already completed these @@ -57,17 +56,18 @@ args <- c( "--fail", "--silent", "--show-error", "-L", "--connect-timeout", as.character(connect_timeout), - "--max-time", as.character(max_time), - "--speed-time", as.character(speed_time), - "--speed-limit", as.character(speed_limit), - "--ftp-ssl", # AUTH TLS on port 21 works in testing + "--max-time", 
as.character(max_time), + "--speed-time", as.character(speed_time), + "--speed-limit", as.character(speed_limit), + "--ftp-ssl", # AUTH TLS on port 21 works in testing "--ftp-pasv", "--disable-epsv", "--ipv4", "--user", "anonymous:", "-o", shQuote(dest_tmp), shQuote(url) ) res <- suppressWarnings(system2("curl", args = args, stdout = TRUE, stderr = TRUE)) - status <- attr(res, "status"); if (is.null(status)) status <- 0L + status <- attr(res, "status") + if (is.null(status)) status <- 0L # Avoiding 0B files cluttering up results -- atomic rename on successful tmp DL ok <- (status == 0L && file.exists(dest_tmp) && file.info(dest_tmp)$size > min_bytes) @@ -79,7 +79,7 @@ } } else { try(unlink(dest_tmp), silent = TRUE) - return(FALSE) # Abort early, don't accept partial set + return(FALSE) # Abort early, don't accept partial set } } @@ -92,16 +92,19 @@ #' hiccup. If 2nd pass fails, log and give up on that file. #' @keywords internal .ftpes_download_two_pass <- function(genome_ids, out_dir, - workers_first = 4L, + workers_first = 4L, workers_second = 4L, - log_file = NULL) { + log_file = NULL) { genome_ids <- unique(as.character(genome_ids)) - if (!length(genome_ids)) return(character(0)) + if (!length(genome_ids)) { + return(character(0)) + } if (!is.null(log_file)) { dir.create(dirname(log_file), recursive = TRUE, showWarnings = FALSE) cat(sprintf("[%s] FTPS run start: %d genomes\n", Sys.time(), length(genome_ids)), - file = log_file, append = TRUE) + file = log_file, append = TRUE + ) } # Pass 1: 30s per-file cap @@ -111,8 +114,9 @@ genome_ids, function(gid) { ok <- .ftpes_download_one(gid, out_dir, - connect_timeout = 10L, max_time = 30L, - speed_time = 30L, speed_limit = 2048L) + connect_timeout = 10L, max_time = 30L, + speed_time = 30L, speed_limit = 2048L + ) list(gid = gid, ok = ok) }, future.seed = TRUE @@ -123,12 +127,16 @@ message(sprintf("Pass 1: ok=%d, fail=%d", length(ok_ids_1), length(fail_ids))) if (!is.null(log_file)) { cat(sprintf("[%s] Pass1 ok=%d 
fail=%d\n", Sys.time(), length(ok_ids_1), length(fail_ids)), - file = log_file, append = TRUE) + file = log_file, append = TRUE + ) } if (!length(fail_ids)) { - if (!is.null(log_file)) cat(sprintf("[%s] FTPS run end: all OK\n", Sys.time()), - file = log_file, append = TRUE) + if (!is.null(log_file)) { + cat(sprintf("[%s] FTPS run end: all OK\n", Sys.time()), + file = log_file, append = TRUE + ) + } return(ok_ids_1) } @@ -139,22 +147,25 @@ fail_ids, function(gid) { ok <- .ftpes_download_one(gid, out_dir, - connect_timeout = 10L, max_time = 60L, - speed_time = 30L, speed_limit = 2048L) + connect_timeout = 10L, max_time = 60L, + speed_time = 30L, speed_limit = 2048L + ) list(gid = gid, ok = ok) }, future.seed = TRUE ) ok2 <- vapply(res2, `[[`, logical(1), "ok") - ok_ids_2 <- fail_ids[ok2] + ok_ids_2 <- fail_ids[ok2] still_fail <- setdiff(fail_ids, ok_ids_2) message(sprintf("Pass 2: ok=%d, still_fail=%d", length(ok_ids_2), length(still_fail))) if (!is.null(log_file)) { cat(sprintf("[%s] Pass2 ok=%d still_fail=%d\n", Sys.time(), length(ok_ids_2), length(still_fail)), - file = log_file, append = TRUE) + file = log_file, append = TRUE + ) if (length(still_fail)) { cat("Fail IDs (excluded): ", paste(head(still_fail, 50), collapse = ", "), "\n", - file = log_file, append = TRUE) + file = log_file, append = TRUE + ) } } @@ -646,7 +657,7 @@ } # Per-selection DB path - paths <- .buildDBpath(base_dir = base_dir, user_bacs = user_bacs, overwrite = overwrite) + paths <- .buildDBpath(base_dir = base_dir, user_bacs = user_bacs, overwrite = overwrite) db_path <- paths$db_path # Got a cache? 
Use that, it's fast @@ -691,11 +702,11 @@ on.exit(try(DBI::dbDisconnect(con, shutdown = TRUE), silent = TRUE), add = TRUE) bac_data_tbl <- data.frame( - `genome.genome_id` = cache_rows$gid, + `genome.genome_id` = cache_rows$gid, `genome.genome_name` = cache_rows$genome_name, - `genome.taxon_id` = cache_rows$taxon_id, - `genome.species` = cache_rows$species, - `genome.strain` = cache_rows$strain, + `genome.taxon_id` = cache_rows$taxon_id, + `genome.species` = cache_rows$species, + `genome.strain` = cache_rows$strain, stringsAsFactors = FALSE ) DBI::dbWriteTable(con, "bac_data", bac_data_tbl, overwrite = TRUE) @@ -984,7 +995,8 @@ retrieveMetadata <- function(user_bacs, envir = environment() ) parallel::clusterEvalQ(cluster, { - library(tibble); library(dplyr) + library(tibble) + library(dplyr) }) # Pull AMR metadata @@ -1039,18 +1051,19 @@ retrieveMetadata <- function(user_bacs, dplyr::mutate(`genome.genome_id` = as.character(`genome.genome_id`)) # Write & join - paths <- .buildDBpath(base_dir = base_dir, user_bacs = user_bacs, overwrite = overwrite) + paths <- .buildDBpath(base_dir = base_dir, user_bacs = user_bacs, overwrite = overwrite) db_path <- paths$db_path logs_dir <- file.path(base_dir, "data", "logs") dir.create(logs_dir, recursive = TRUE, showWarnings = FALSE) cat(sprintf("[%s] Writing metadata DuckDB: %s\n", Sys.time(), db_path), - file = file.path(logs_dir, "bvbrc.log"), append = TRUE) + file = file.path(logs_dir, "bvbrc.log"), append = TRUE + ) con <- DBI::dbConnect(duckdb::duckdb(), dbdir = db_path) on.exit(try(DBI::dbDisconnect(con, shutdown = TRUE), silent = TRUE), add = TRUE) DBI::dbWriteTable(con, "amr_phenotype", combined_drug_data_tbl, overwrite = TRUE) - DBI::dbWriteTable(con, "genome_data", combined_genome_data_tbl, overwrite = TRUE) + DBI::dbWriteTable(con, "genome_data", combined_genome_data_tbl, overwrite = TRUE) if (isTRUE(verbose)) message("Joining AMR phenotype and genome metadata.") DBI::dbExecute(con, ' @@ -1062,22 +1075,27 @@ 
retrieveMetadata <- function(user_bacs, ') # Debug summary after writes - n_targets <- length(genome_ids) - n_amr_ids <- DBI::dbGetQuery(con, 'SELECT COUNT(DISTINCT "genome_drug.genome_id") AS n FROM amr_phenotype')$n + n_targets <- length(genome_ids) + n_amr_ids <- DBI::dbGetQuery(con, 'SELECT COUNT(DISTINCT "genome_drug.genome_id") AS n FROM amr_phenotype')$n n_gmeta_ids <- DBI::dbGetQuery(con, 'SELECT COUNT(DISTINCT "genome.genome_id") AS n FROM genome_data')$n if (isTRUE(verbose)) { - message("Initial summary: targets=", n_targets, - " | AMR genomes=", n_amr_ids, - " | genome_data genomes=", n_gmeta_ids) + message( + "Initial summary: targets=", n_targets, + " | AMR genomes=", n_amr_ids, + " | genome_data genomes=", n_gmeta_ids + ) } # Tagged view .create_amr_tagged_view(con) # Final debug summary - n_bac <- if ("bac_data" %in% DBI::dbListTables(con)) - DBI::dbGetQuery(con, 'SELECT COUNT(DISTINCT "genome.genome_id") AS n FROM bac_data')$n else NA_integer_ - n_amr_ids <- DBI::dbGetQuery(con, 'SELECT COUNT(DISTINCT "genome_drug.genome_id") AS n FROM amr_phenotype')$n + n_bac <- if ("bac_data" %in% DBI::dbListTables(con)) { + DBI::dbGetQuery(con, 'SELECT COUNT(DISTINCT "genome.genome_id") AS n FROM bac_data')$n + } else { + NA_integer_ + } + n_amr_ids <- DBI::dbGetQuery(con, 'SELECT COUNT(DISTINCT "genome_drug.genome_id") AS n FROM amr_phenotype')$n n_gmeta_ids <- DBI::dbGetQuery(con, 'SELECT COUNT(DISTINCT "genome.genome_id") AS n FROM genome_data')$n ids_zero_amr <- DBI::dbGetQuery( @@ -1183,16 +1201,21 @@ retrieveMetadata <- function(user_bacs, #' @return A list with a DuckDB connection and table_name = "filtered" .filterGenomes <- function(user_bacs, base_dir = ".", - evidence_mode = c("lab_only","lab_or_comp","comp_only","any"), + evidence_mode = c("lab_only", "lab_or_comp", "comp_only", "any"), verbose = TRUE, fallback_to_bvbrc_cache = TRUE) { evidence_mode <- match.arg(evidence_mode) base_dir <- normalizePath(base_dir, mustWork = FALSE) - paths <- 
.buildDBpath(base_dir = base_dir, user_bacs = user_bacs) - db_path <- paths$db_path + paths <- .buildDBpath(base_dir = base_dir, user_bacs = user_bacs) + db_path <- paths$db_path con <- DBI::dbConnect(duckdb::duckdb(), dbdir = db_path) - on.exit({ NULL }, add = TRUE) # keep open for caller + on.exit( + { + NULL + }, + add = TRUE + ) # keep open for caller # The convenient "Metadata Exists" path if ("metadata" %in% DBI::dbListTables(con)) { @@ -1251,8 +1274,8 @@ retrieveMetadata <- function(user_bacs, md <- md |> dplyr::filter(`genome.genome_quality` == "Good") |> - dplyr::filter(`genome_drug.resistant_phenotype` %in% c("Resistant","Susceptible","Intermediate")) |> - dplyr::distinct(`genome.genome_id`) # keep BV-BRC column name + dplyr::filter(`genome_drug.resistant_phenotype` %in% c("Resistant", "Susceptible", "Intermediate")) |> + dplyr::distinct(`genome.genome_id`) # keep BV-BRC column name DBI::dbWriteTable(con, "filtered", md, overwrite = TRUE) if (isTRUE(verbose)) { @@ -1283,12 +1306,12 @@ retrieveMetadata <- function(user_bacs, stop("Table 'bvbrc_bac_data' not found in BV-BRC cache: ", cache_db) } - bv <- tibble::as_tibble(DBI::dbReadTable(con_cache, "bvbrc_bac_data")) + bv <- tibble::as_tibble(DBI::dbReadTable(con_cache, "bvbrc_bac_data")) sel <- tibble::tibble(`genome.genome_id` = character()) for (v in user_bacs) { if (.id_checker(v)) { - v_chr <- as.character(v) + v_chr <- as.character(v) matches <- bv[bv$`genome.taxon_id` == v_chr, , drop = FALSE] } else { matches <- bv[stringr::str_detect( @@ -1297,8 +1320,10 @@ retrieveMetadata <- function(user_bacs, ), , drop = FALSE] } if (nrow(matches)) { - sel <- dplyr::bind_rows(sel, - tibble::tibble(`genome.genome_id` = as.character(matches$`genome.genome_id`))) + sel <- dplyr::bind_rows( + sel, + tibble::tibble(`genome.genome_id` = as.character(matches$`genome.genome_id`)) + ) } } sel <- dplyr::distinct(sel) @@ -1361,11 +1386,11 @@ retrieveMetadata <- function(user_bacs, #' @keywords internal .pick_shell <- 
function(image) { chk <- suppressWarnings(system2("docker", - c( - "run", "--rm", image, "sh", "-lc", - "command -v bash >/dev/null || echo NOBASH" - ), - stdout = TRUE, stderr = TRUE + c( + "run", "--rm", image, "sh", "-lc", + "command -v bash >/dev/null || echo NOBASH" + ), + stdout = TRUE, stderr = TRUE )) if (length(chk) && any(grepl("NOBASH", chk))) "sh" else "bash" } @@ -1511,7 +1536,7 @@ retrieveGenomes <- function(base_dir = ".", cli_fasta_workers = 4L, cli_gff_workers = 4L, chunk_size = 50L, - evidence_mode = c("lab_only","lab_or_comp","comp_only","any"), # NEW + evidence_mode = c("lab_only", "lab_or_comp", "comp_only", "any"), # NEW verbose = TRUE) { method <- match.arg(method) evidence_mode <- match.arg(evidence_mode) @@ -1525,13 +1550,16 @@ retrieveGenomes <- function(base_dir = ".", has_filtered <- "filtered" %in% DBI::dbListTables(con0) if (has_filtered) { if (isTRUE(verbose)) message("Using existing 'filtered' table (skipping re-filter).") - con <- con0; tbl <- "filtered" + con <- con0 + tbl <- "filtered" } else { if (isTRUE(verbose)) message("No 'filtered' table found; filtering now.") - f_out <- .filterGenomes(base_dir = base_dir, - user_bacs = user_bacs, - evidence_mode = evidence_mode, - verbose = verbose) + f_out <- .filterGenomes( + base_dir = base_dir, + user_bacs = user_bacs, + evidence_mode = evidence_mode, + verbose = verbose + ) con <- f_out$duckdbConnection tbl <- f_out$table_name } @@ -1584,7 +1612,7 @@ retrieveGenomes <- function(base_dir = ".", if (isTRUE(verbose)) { message( "CLI mode: downloading genomes in parallelized chunks. 
", - "Chunks=", ceiling(length(ids)/chunk_size), + "Chunks=", ceiling(length(ids) / chunk_size), " | fasta_workers=", cli_fasta_workers, " | gff_workers=", cli_gff_workers ) @@ -1665,7 +1693,7 @@ genomeList <- function(base_dir = ".", faa_path = if (file.exists(faa_path) && file.info(faa_path)$size > 100) faa_path else NA, panaroo_input = if ( file.exists(gff_path) && file.exists(fna_path) && file.exists(faa_path) && - file.info(gff_path)$size > 100 && file.info(fna_path)$size > 100 && file.info(faa_path)$size > 100 + file.info(gff_path)$size > 100 && file.info(fna_path)$size > 100 && file.info(faa_path)$size > 100 ) { paste(gff_path, fna_path) } else { @@ -1731,7 +1759,7 @@ prepareGenomes <- function(user_bacs, base_dir = ".", method = c("ftp", "cli"), overwrite = FALSE, - evidence_mode = c("lab_only","lab_or_comp","comp_only","any"), + evidence_mode = c("lab_only", "lab_or_comp", "comp_only", "any"), verbose = TRUE) { method <- match.arg(method) evidence_mode <- match.arg(evidence_mode) @@ -1751,10 +1779,10 @@ prepareGenomes <- function(user_bacs, if (isTRUE(verbose)) message("Step 1: Filtering genomes for download by evidence: ", evidence_mode) f_out <- .filterGenomes( - base_dir = base_dir, - user_bacs = user_bacs, + base_dir = base_dir, + user_bacs = user_bacs, evidence_mode = evidence_mode, - verbose = verbose, + verbose = verbose, fallback_to_bvbrc_cache = FALSE ) if (is.null(f_out)) { @@ -1764,14 +1792,16 @@ prepareGenomes <- function(user_bacs, # A little summary of what's left after filtering (or not) paths <- .buildDBpath(base_dir = base_dir, user_bacs = user_bacs) - con <- DBI::dbConnect(duckdb::duckdb(), dbdir = paths$db_path, read_only = TRUE) + con <- DBI::dbConnect(duckdb::duckdb(), dbdir = paths$db_path, read_only = TRUE) n_filtered <- DBI::dbGetQuery(con, 'SELECT COUNT(DISTINCT "genome.genome_id") AS n FROM filtered')$n - n_meta <- if ("genome_data" %in% DBI::dbListTables(con)) DBI::dbGetQuery(con, 'SELECT COUNT(DISTINCT "genome.genome_id") AS n 
FROM genome_data')$n else NA - n_amr <- if ("amr_phenotype" %in% DBI::dbListTables(con)) DBI::dbGetQuery(con, 'SELECT COUNT(DISTINCT "genome_drug.genome_id") AS n FROM amr_phenotype')$n else NA + n_meta <- if ("genome_data" %in% DBI::dbListTables(con)) DBI::dbGetQuery(con, 'SELECT COUNT(DISTINCT "genome.genome_id") AS n FROM genome_data')$n else NA + n_amr <- if ("amr_phenotype" %in% DBI::dbListTables(con)) DBI::dbGetQuery(con, 'SELECT COUNT(DISTINCT "genome_drug.genome_id") AS n FROM amr_phenotype')$n else NA DBI::dbDisconnect(con, shutdown = TRUE) if (isTRUE(verbose)) { - message(sprintf("Evidence filter summary: filtered=%d | genomes with AMR=%s | genomes with genome_data=%s", - n_filtered, ifelse(is.na(n_amr), "NA", n_amr), ifelse(is.na(n_meta), "NA", n_meta))) + message(sprintf( + "Evidence filter summary: filtered=%d | genomes with AMR=%s | genomes with genome_data=%s", + n_filtered, ifelse(is.na(n_amr), "NA", n_amr), ifelse(is.na(n_meta), "NA", n_meta) + )) } if (isTRUE(verbose)) message("Step 2: Downloading genomes from BV-BRC (", method, ")")