diff --git a/src/errors.rs b/src/errors.rs index 6108a2f..2c3fc6b 100644 --- a/src/errors.rs +++ b/src/errors.rs @@ -22,6 +22,8 @@ pub enum TranslationError { NonAsciiChar(char), #[error("bad nucleotide: {:?}", .0)] BadNucleotide(char), + #[error("bad amino acid: {:?}", .0)] + BadAminoAcid(char), #[error("unexpected ambiguous nucleotide: {:?}", .0)] UnexpectedAmbiguousNucleotide(char), #[error("not a ncbi translation table: {}", .0)] diff --git a/src/fasta.rs b/src/fasta.rs index 64e7cf8..597ec49 100644 --- a/src/fasta.rs +++ b/src/fasta.rs @@ -1339,6 +1339,19 @@ mod tests { ); } + #[test] + fn test_protein_invalid_fasta() { + // Note the missing newline between records. + assert_parse_err!( + ">Virus1\nAAAA\nAAAA>Virus2\nCCCC\nCCCC\n", + FastaParser::::default(), + Located { + line_number: 3, + error: FastaParseError::ParseError(TranslationError::BadAminoAcid('>')) + } + ); + } + #[test] fn test_to_string() { let parser = FastaParser::>::default(); diff --git a/src/rust_api.rs b/src/rust_api.rs index ca8837c..a1c98b9 100644 --- a/src/rust_api.rs +++ b/src/rust_api.rs @@ -133,13 +133,18 @@ impl TryFrom<&[u8]> for ProteinSequence { type Error = TranslationError; fn try_from(value: &[u8]) -> Result { - if value.is_ascii() { + let is_seq_char = |c| matches!(c, b'*' | b' ' | b'\t') || c.is_ascii_alphabetic(); + if let Some(&bad_aa) = value.iter().find(|&&c| !is_seq_char(c)) { + if bad_aa.is_ascii() { + Err(TranslationError::BadAminoAcid(char::from(bad_aa))) + } else { + Err(TranslationError::NonAsciiByte(bad_aa)) + } + } else { let mut vec = value.to_vec(); vec.make_ascii_uppercase(); + vec.retain(|c| *c != b' ' && *c != b'\t'); Ok(Self { amino_acids: vec }) - } else { - let first_non_ascii = *value.iter().find(|b| !b.is_ascii()).unwrap(); - Err(TranslationError::NonAsciiByte(first_non_ascii)) } } } @@ -769,19 +774,21 @@ mod tests { } #[test] - fn test_empty_spaces() { - // this test will unwrap() if it cannot parse the DNA - dna("gcantacctaangtnattag "); - dna(" gcantac\tctaangtnattag "); - dna(" gca ntac ctaangtnattag \t"); - - dna_strict("gcactacctaacgtcattag "); - dna_strict(" gcactac\tctaacgtcattag "); - dna_strict(" gca ctac ctaacgtcattag \t"); - - protein("angtnattag "); - protein(" angtnattag "); - protein(" an gtnattag \t"); + fn test_empty_spaces_are_stripped() { + let expected = dna("gcantacctaangtnattag"); + assert_eq!(dna("gcantacctaangtnattag "), expected); + assert_eq!(dna(" gcantac\tctaangtnattag "), expected); + assert_eq!(dna(" gca ntac ctaangtnattag \t"), expected); + + let expected = dna_strict("gcactacctaacgtcattag"); + assert_eq!(dna_strict("gcactacctaacgtcattag "), expected); + assert_eq!(dna_strict(" gcactac\tctaacgtcattag "), expected); + assert_eq!(dna_strict(" gca ctac ctaacgtcattag \t"), expected); + + let expected = protein("angtnattag"); + assert_eq!(protein("angtnattag "), expected); + assert_eq!(protein(" angtnattag "), expected); + assert_eq!(protein(" an gtnattag \t"), expected); } #[test]