diff --git a/src/game/genetics/crossover.hpp b/src/game/genetics/crossover.hpp index 7bb3eaf..30a61a3 100644 --- a/src/game/genetics/crossover.hpp +++ b/src/game/genetics/crossover.hpp @@ -24,8 +24,7 @@ #include #include -namespace dna -{ +namespace dna { /** * Exchanges elements between two ranges, starting at a random offset. diff --git a/src/game/genetics/frame.hpp b/src/game/genetics/frame.hpp new file mode 100644 index 0000000..7bed2bb --- /dev/null +++ b/src/game/genetics/frame.hpp @@ -0,0 +1,136 @@ +/* + * Copyright (C) 2020 Christopher J. Howard + * + * This file is part of Antkeeper source code. + * + * Antkeeper source code is free software: you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation, either version 3 of the License, or + * (at your option) any later version. + * + * Antkeeper source code is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with Antkeeper source code. If not, see . + */ + +#ifndef ANTKEEPER_DNA_FRAME_HPP +#define ANTKEEPER_DNA_FRAME_HPP + +#include + +namespace dna { + +/** + * Finds the first start codon in a sequence. + * + * @param first,last Range of elements to search. + * @param n Number of elements per codon. + * @param p Binary predicate which returns `true` if a subrange of length @p n is a start codon. + * @return Iterator to the first element in the start codon, or @p last if no start codon was found. 
+ */ +template <class InputIt, class Size, class BinaryPredicate> +InputIt find_start(InputIt first, InputIt last, Size n, BinaryPredicate p) +{ + auto length = std::distance(first, last); + + if (length >= n) + { + InputIt next = first; + std::advance(next, n); + + do + { + if (p(first, next)) + return first; + + ++first; + ++next; + --length; + } + while (length >= n); + } + + return last; +} + +/** + * Searches the range `[first, last)` for a sequence of @p n elements which satisfies predicate @p p, returning an iterator to the first element of the first such sequence, or @p last if none was found. + * + * @param first,last Range of elements to search. + * @param n Number of elements in the sequence. + * @param stride Number of elements to advance between successive candidate positions. If less than one, only the first candidate is tested. + * @param p Binary predicate, called with iterators to the beginning and end of a candidate, which returns `true` if that subrange of length @p n is the sought sequence. + */ +template <class InputIt, class Size, class BinaryPredicate> +InputIt find_sequence(InputIt first, InputIt last, Size n, Size stride, BinaryPredicate p) +{ + if (auto length = std::distance(first, last); length >= n) + { + Size offset = n + stride; + InputIt next = first; + std::advance(next, n); + + do + { + if (p(first, next)) + return first; + + if (stride < 1 || length < offset) + break; + + std::advance(first, stride); + std::advance(next, stride); + length -= stride; // first moved by stride, so the remaining length shrinks by stride (not by n + stride) + } + while (1); + } + + return last; +} + +/** + * Finds the first stop codon in a sequence. + * + * @param first,last Range of elements to search. + * @param n Number of elements per codon. + * @param p Binary predicate which returns `true` if a subrange of length @p n is a stop codon. + * @return Iterator to the first element in the stop codon, or @p last if no stop codon was found. + */ +template <class InputIt, class Size, class BinaryPredicate> +InputIt find_stop(InputIt first, InputIt last, Size n, BinaryPredicate p) +{ + for (auto length = std::distance(first, last); length >= n; length -= n) + { + InputIt next = first; + std::advance(next, n); + if (p(first, next)) + return first; + first = next; + } + + return last; +} + +/** + * Finds the first open reading frame (ORF) in a range of elements. 
+ * + * @param[in,out] first,last Range of elements to search. On return, @p first points to the first element of the start codon (or equals @p last if no start codon was found), and @p last points to the first element of the first stop codon found at a multiple of @p n elements after the start codon (or is left unchanged if no start or stop codon was found). + * @param n Number of elements per codon. + * @param start_p Binary predicate which returns `true` if a subrange of length @p n is a start codon. + * @param stop_p Binary predicate which returns `true` if a subrange of length @p n is a stop codon. + */ +template <class InputIt, class Size, class BinaryPredicate1, class BinaryPredicate2> +void find_orf(InputIt& first, InputIt& last, Size n, BinaryPredicate1 start_p, BinaryPredicate2 stop_p) +{ + first = find_start(first, last, n, start_p); + if (first != last) + last = find_stop(first, last, n, stop_p); +} + +} // namespace dna + +#endif // ANTKEEPER_DNA_FRAME_HPP diff --git a/src/game/genetics/mutate.hpp b/src/game/genetics/mutate.hpp index fee49eb..6f71dd0 100644 --- a/src/game/genetics/mutate.hpp +++ b/src/game/genetics/mutate.hpp @@ -24,8 +24,7 @@ #include #include -namespace dna -{ +namespace dna { /** * Applies the given function to a randomly selected element in a range. diff --git a/src/game/genetics/nucleobase.cpp b/src/game/genetics/nucleobase.cpp new file mode 100644 index 0000000..4c33858 --- /dev/null +++ b/src/game/genetics/nucleobase.cpp @@ -0,0 +1,91 @@ +/* + * Copyright (C) 2020 Christopher J. Howard + * + * This file is part of Antkeeper source code. + * + * Antkeeper source code is free software: you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation, either version 3 of the License, or + * (at your option) any later version. + * + * Antkeeper source code is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with Antkeeper source code. If not, see . 
+ */ + +#include "nucleobase.hpp" +#include <cstdint> + +namespace dna { +namespace base { + +/** + * Decodes an IUPAC degenerate base symbol into a bit mask representing the possible bases represented by the symbol. + * + * @param symbol IUPAC degenerate base symbol. + * @return Bit mask representing the possible bases represented by the symbol (bit 0 = A, bit 1 = C, bit 2 = G, bit 3 = T or U), or 0 if the symbol is not a recognized uppercase symbol. + */ +static std::uint8_t decode(char symbol) +{ + static constexpr std::uint8_t bases[26] = + { + 0b0001, // A + 0b1110, // B + 0b0010, // C + 0b1101, // D + 0, // E + 0, // F + 0b0100, // G + 0b1011, // H + 0, // I + 0, // J + 0b1100, // K + 0, // L + 0b0011, // M + 0b1111, // N + 0, // O + 0, // P + 0, // Q + 0b0101, // R + 0b0110, // S + 0b1000, // T + 0b1000, // U + 0b0111, // V + 0b1001, // W + 0, // X + 0b1010, // Y + 0, // Z + }; + + return (symbol < 'A' || symbol > 'Z') ? 0 : bases[symbol - 'A']; +} + +char complement_rna(char symbol) +{ + static constexpr const char* complements = "UVGHZZCDZZMZKNZZZYSAABWZRZ"; // Indexed by symbol - 'A'; A pairs with U in RNA, 'Z' marks symbols with no complement + return (symbol < 'A' || symbol > 'Z') ? 'Z' : complements[symbol - 'A']; +} + +char complement_dna(char symbol) +{ + static constexpr const char* complements = "TVGHZZCDZZMZKNZZZYSAABWZRZ"; // Indexed by symbol - 'A'; A pairs with T in DNA, 'Z' marks symbols with no complement + return (symbol < 'A' || symbol > 'Z') ? 'Z' : complements[symbol - 'A']; +} + +char transcribe(char symbol) +{ + return (symbol == 'T') ? 'U' : (symbol == 'U') ? 'T' : symbol; +} + +int compare(char a, char b) +{ + std::uint8_t bases = decode(a) & decode(b); // Bases that both symbols can represent + return (bases & 1) + (bases >> 1 & 1) + (bases >> 2 & 1) + (bases >> 3 & 1); // Count the set bits of the 4-bit mask +} + +} // namespace base +} // namespace dna diff --git a/src/game/genetics/nucleobase.hpp b/src/game/genetics/nucleobase.hpp new file mode 100644 index 0000000..d99e31d --- /dev/null +++ b/src/game/genetics/nucleobase.hpp @@ -0,0 +1,62 @@ +/* + * Copyright (C) 2020 Christopher J. Howard + * + * This file is part of Antkeeper source code. 
+ * + * Antkeeper source code is free software: you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation, either version 3 of the License, or + * (at your option) any later version. + * + * Antkeeper source code is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with Antkeeper source code. If not, see . + */ + +#ifndef ANTKEEPER_DNA_NUCLEOBASE_HPP +#define ANTKEEPER_DNA_NUCLEOBASE_HPP + +namespace dna { +namespace base { + +/** + * Returns the DNA complement of an IUPAC degenerate base symbol. + * + * @param symbol IUPAC degenerate base symbol. + * @return IUPAC degenerate base symbol of DNA complement, or `Z` if @p symbol is not an uppercase IUPAC base symbol. + */ +char complement_dna(char symbol); + +/** + * Returns the RNA complement of an IUPAC degenerate base symbol. + * + * @param symbol IUPAC degenerate base symbol. + * @return IUPAC degenerate base symbol of RNA complement, or `Z` if @p symbol is not an uppercase IUPAC base symbol. + */ +char complement_rna(char symbol); + +/** + * Transcribes an IUPAC degenerate base symbol between DNA and RNA, swapping `T` for `U` or `U` for `T`. + * + * @param symbol IUPAC degenerate base symbol. + * @return `U` if @p symbol was `T`, `T` if @p symbol was `U`, or `symbol` if @p symbol was neither `T` nor `U`. + */ +char transcribe(char symbol); + +/** + * Returns the number of bases that are represented by both IUPAC degenerate base symbols. + * + * @param a First IUPAC degenerate base symbol. + * @param b Second IUPAC degenerate base symbol. + * @return Number of bases represented by both symbols, from 0 to 4; 0 if either symbol is not an uppercase IUPAC base symbol. 
+ */ +int compare(char a, char b); + +} // namespace base +} // namespace dna + +#endif // ANTKEEPER_DNA_NUCLEOBASE_HPP diff --git a/src/game/genetics/transcribe.hpp b/src/game/genetics/transcribe.hpp deleted file mode 100644 index 237c3fc..0000000 --- a/src/game/genetics/transcribe.hpp +++ /dev/null @@ -1,58 +0,0 @@ -/* - * Copyright (C) 2020 Christopher J. Howard - * - * This file is part of Antkeeper source code. - * - * Antkeeper source code is free software: you can redistribute it and/or modify - * it under the terms of the GNU General Public License as published by - * the Free Software Foundation, either version 3 of the License, or - * (at your option) any later version. - * - * Antkeeper source code is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with Antkeeper source code. If not, see . - */ - -#ifndef ANTKEEPER_DNA_TRANSCRIBE_HPP -#define ANTKEEPER_DNA_TRANSCRIBE_HPP - -#include - -namespace dna -{ - -/** - * Transcribes DNA into RNA, replacing all occurences of `T` with `U`. - * - * @param first,last Range of DNA bases to transcribe. - */ -template -void transcribe(ForwardIt first, ForwardIt last); - -/** - * Transcribes RNA back into DNA, replacing all occurences of `U` with `T`. - * - * @param first,last Range of RNA bases to transcribe. 
- */ -template -void untranscribe(ForwardIt first, ForwardIt last); - -template -void transcribe(ForwardIt first, ForwardIt last) -{ - std::replace(first, last, 'T', 'U'); -} - -template -void untranscribe(ForwardIt first, ForwardIt last) -{ - std::replace(first, last, 'U', 'T'); -} - -} // namespace dna - -#endif // ANTKEEPER_DNA_TRANSCRIBE_HPP diff --git a/src/game/genetics/translate.hpp b/src/game/genetics/translate.hpp index c071cc9..e336570 100644 --- a/src/game/genetics/translate.hpp +++ b/src/game/genetics/translate.hpp @@ -20,181 +20,28 @@ #ifndef ANTKEEPER_DNA_TRANSLATE_HPP #define ANTKEEPER_DNA_TRANSLATE_HPP -#include -#include #include -namespace dna -{ - -/// DNA translation table for standard genetic code. -constexpr char* standard_code = - "FFLLSSSSYY**CC*WLLLLPPPPHHQQRRRRIIIMTTTTNNKKSSRRVVVVAAAADDEEGGGG" // Amino acid - "---M------**--*----M---------------M----------------------------" // Start/stop - "TTTTTTTTTTTTTTTTCCCCCCCCCCCCCCCCAAAAAAAAAAAAAAAAGGGGGGGGGGGGGGGG" // Base 1 - "TTTTCCCCAAAAGGGGTTTTCCCCAAAAGGGGTTTTCCCCAAAAGGGGTTTTCCCCAAAAGGGG" // Base 2 - "TCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAG"; // Base 3 +namespace dna { /** - * Translates codons into amino acids until a stop codon is read or the end of the sequence is reached. + * Divides a range into consecutive subranges of @p n elements, then applies the given function to each subrange and stores the result in another range. * - * @param first,last Range of codons to translate. - * @param t_first Beginning of the translation table. + * @param first,last Range of elements to translate. * @param d_first Beginning of the destination range. - * @return Output iterator to the amino acid in the destination range, one past the last amino acid translated. - */ -template -OutputIt translate(InputIt1 first, InputIt1 last, InputIt2 t_first, OutputIt d_first); - -/** - * Finds the first start codon in a sequence of bases. - * - * @param first,last Range of bases to search. 
- * @param t_first Beginning of the translation table. - * @return Iterator to the first base of the first start codon in the sequence, or @p last if no start codon is found. + * @param n Number of elements by which to divide the range. + * @param binary_op Binary operation function object that will be applied to each subrange of @p n elements. + * @return Output iterator to the element past the last element translated. */ -template -ForwardIt1 find_start(ForwardIt1 first, ForwardIt1 last, ForwardIt2 t_first); - -/** - * Finds the first stop codon in a sequence of codons. - * - * @param first,last Range of codons to search. - * @param t_first Beginning of the translation table. - * @return Iterator to the first base of the first stop codon in the sequence, or @p last if no stop codon is found. - */ -template -ForwardIt1 find_stop(ForwardIt1 first, ForwardIt1 last, ForwardIt2 t_first); - -template -ForwardIt1 find_start(ForwardIt1 first, ForwardIt1 last, ForwardIt2 t_first) +template +OutputIt translate(InputIt first, InputIt last, OutputIt d_first, Size n, BinaryOperation binary_op) { - ForwardIt1 second = first; - ++second; - ForwardIt1 third = second; - ++third; - - ForwardIt2 start_first = t_first; - std::advance(start_first, 64); - ForwardIt2 base1_first = start_first; - std::advance(base1_first, 64); - ForwardIt2 base2_first = base1_first; - std::advance(base2_first, 64); - ForwardIt2 base3_first = base2_first; - std::advance(base3_first, 64); - - if (first != last && second != last) - { - while (third != last) - { - ForwardIt2 start = start_first; - ForwardIt2 base1 = base1_first; - ForwardIt2 base2 = base2_first; - ForwardIt2 base3 = base3_first; - - for (std::uint_fast8_t i = 64; i; --i) - { - if (*start != '-' && *start != '*' && *first == *base1 && *second == *base2 && *third == *base3) - return first; - - ++start; - ++base1; - ++base2; - ++base3; - } - - first = second; - second = third; - ++third; - } - } - - return last; -} - -template -ForwardIt1 
find_stop(ForwardIt1 first, ForwardIt1 last, ForwardIt2 t_first) -{ - ForwardIt1 second = first; - ++second; - ForwardIt1 third = second; - ++third; - - ForwardIt2 base1_first = t_first; - std::advance(base1_first, 128); - ForwardIt2 base2_first = base1_first; - std::advance(base2_first, 64); - ForwardIt2 base3_first = base2_first; - std::advance(base3_first, 64); - - while (first != last && second != last && third != last) - { - ForwardIt2 aa = t_first; - ForwardIt2 base1 = base1_first; - ForwardIt2 base2 = base2_first; - ForwardIt2 base3 = base3_first; - - for (std::uint_fast8_t i = 64; i; --i) - { - if (*aa == '*' && *first == *base1 && *second == *base2 && *third == *base3) - return first; - - ++aa; - ++base1; - ++base2; - ++base3; - } - - first = ++third; - second = ++third; - ++third; - } - - return last; -} - -template -OutputIt translate(InputIt1 first, InputIt1 last, InputIt2 t_first, OutputIt d_first) -{ - InputIt1 second = first; - ++second; - InputIt1 third = second; - ++third; - - InputIt2 base1_first = t_first; - std::advance(base1_first, 128); - InputIt2 base2_first = base1_first; - std::advance(base2_first, 64); - InputIt2 base3_first = base2_first; - std::advance(base3_first, 64); - - while (first != last && second != last && third != last) + for (auto length = std::distance(first, last); length >= n; length -= n) { - InputIt2 aa = t_first; - InputIt2 base1 = base1_first; - InputIt2 base2 = base2_first; - InputIt2 base3 = base3_first; - - for (std::uint_fast8_t i = 64; i; --i) - { - if (*first == *base1 && *second == *base2 && *third == *base3) - { - if (*aa == '*') - return d_first; - - *(d_first++) = *aa; - break; - } - - ++aa; - ++base1; - ++base2; - ++base3; - } - - first = ++third; - second = ++third; - ++third; + InputIt next = first; + std::advance(next, n); + *(d_first++) = binary_op(first, next); + first = next; } return d_first;