💿🐜 Antkeeper source code https://antkeeper.com
You can not select more than 25 topics Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.

360 lines
11 KiB

  1. /*
  2. * Copyright (C) 2021 Christopher J. Howard
  3. *
  4. * This file is part of Antkeeper source code.
  5. *
  6. * Antkeeper source code is free software: you can redistribute it and/or modify
  7. * it under the terms of the GNU General Public License as published by
  8. * the Free Software Foundation, either version 3 of the License, or
  9. * (at your option) any later version.
  10. *
  11. * Antkeeper source code is distributed in the hope that it will be useful,
  12. * but WITHOUT ANY WARRANTY; without even the implied warranty of
  13. * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
  14. * GNU General Public License for more details.
  15. *
  16. * You should have received a copy of the GNU General Public License
  17. * along with Antkeeper source code. If not, see <http://www.gnu.org/licenses/>.
  18. */
  19. #ifndef ANTKEEPER_GENETICS_SEQUENCE_HPP
  20. #define ANTKEEPER_GENETICS_SEQUENCE_HPP
  21. #include "base.hpp"
  22. #include "codon.hpp"
  23. #include <algorithm>
  24. #include <iterator>
  25. #include <random>
  26. namespace genetics {
  27. /// Functions and structures related to sequences of IUPAC degenerate base symbols.
  28. namespace sequence {
  29. /**
  30. * Open reading frame (ORF), defined by a start codon and stop codon, with the distance between divisible by three.
  31. *
  32. * @tparam Iterator Sequence iterator type.
  33. */
  34. template <class Iterator>
  35. struct orf
  36. {
  37. /// Iterator to the first base of the start codon.
  38. Iterator start;
  39. /// Iterator to the first base of the stop codon.
  40. Iterator stop;
  41. };
  42. /**
  43. * Exchanges elements between two ranges, starting at a random offset.
  44. *
  45. * @param first1,last1 First range of elements to crossover.
  46. * @param first2 Beginning of the second range of elements to crossover.
  47. * @param g Uniform random bit generator.
  48. * @return Iterator to the start of the crossover in the second range.
  49. */
  50. template <class ForwardIt1, class ForwardIt2, class URBG>
  51. ForwardIt2 crossover(ForwardIt1 first1, ForwardIt1 last1, ForwardIt2 first2, URBG&& g);
  52. /**
  53. * Exchanges elements between two ranges multiple times, starting at a random offset each time.
  54. *
  55. * @param first1,last1 First range of elements to crossover.
  56. * @param first2 Beginning of the second range of elements to crossover.
  57. * @param count Number of times to crossover.
  58. * @param g Uniform random bit generator.
  59. */
  60. template <class ForwardIt1, class ForwardIt2, class Size, class URBG>
  61. void crossover_n(ForwardIt1 first1, ForwardIt1 last1, ForwardIt2 first2, Size count, URBG&& g);
  62. /**
  63. * Searches a sequence for an open reading frame (ORF).
  64. *
  65. * @param first,last Range of elements to search.
  66. * @param table Genetic code translation table.
  67. * @return First ORF in the sequence, or `{last, last}` if no ORF was found.
  68. */
  69. template <class ForwardIt>
  70. orf<ForwardIt> find_orf(ForwardIt first, ForwardIt last, const codon::table& table);
  71. /**
  72. * Applies the given function to a randomly selected element in a range.
  73. *
  74. * @param first,last Range of elements to mutate.
  75. * @param unary_op Unary operation function object that will be applied.
  76. * @param g Uniform random bit generator.
  77. * @return Iterator to the mutated element.
  78. */
  79. template <class ForwardIt, class UnaryOperation, class URBG>
  80. ForwardIt mutate(ForwardIt first, ForwardIt last, UnaryOperation unary_op, URBG&& g);
  81. /**
  82. * Applies the given function to a random selection of elements in a range.
  83. *
  84. * @param first,last Range of elements to mutate.
  85. * @param count Number of elements to mutate.
  86. * @param unary_op Unary operation function object that will be applied.
  87. * @param g Uniform random bit generator.
  88. */
  89. template <class ForwardIt, class Size, class UnaryOperation, class URBG>
  90. void mutate_n(ForwardIt first, ForwardIt last, Size count, UnaryOperation unary_op, URBG&& g);
  91. /**
  92. * Searches a sequence of IUPAC base symbols for a pattern matching a search string of IUPAC degenerate base symbols.
  93. *
  94. * @param first,last Sequence of IUPAC base symbols to search.
  95. * @param s_first,s_last Search string of IUPAC degenerate base symbols.
  96. * @param stride Distance between consecutive searches.
  97. * @return Iterator to the beginning of the first subsequence matching `[s_first, s_last)` in the sequence `[first, last)`. If no such occurrence is found, @p last is returned.
  98. */
  99. template <class ForwardIt1, class ForwardIt2>
  100. ForwardIt1 search(ForwardIt1 first, ForwardIt1 last, ForwardIt2 s_first, ForwardIt2 s_last, typename std::iterator_traits<ForwardIt1>::difference_type stride);
  101. /**
  102. * Transcribes a sequence of IUPAC base symbols between DNA and RNA, swapping `T` for `U` or `U` for `T`.
  103. *
  104. * @param first,last Range of elements to transcribe.
  105. * @param d_first Beginning of the destination range.
  106. * @return Output iterator to the element past the last element transcribed.
  107. */
  108. template <class InputIt, class OutputIt>
  109. OutputIt transcribe(InputIt first, InputIt last, OutputIt d_first);
  110. /**
  111. * Translates a sequence of codons into amino acids.
  112. *
  113. * @param first,last Open reading frame.
  114. * @param d_first Beginning of destination range.
  115. * @param table Genetic code translation table.
  116. * @return Output iterator to the element past the last element translated.
  117. */
  118. template <class InputIt, class OutputIt>
  119. OutputIt translate(InputIt first, InputIt last, OutputIt d_first, const codon::table& table);
  120. /// Functions which operate on sequences of IUPAC degenerate **DNA** base symbols.
  121. namespace dna
  122. {
  123. /**
  124. * Generates the complementary sequence for a sequence of IUPAC degenerate DNA base symbols.
  125. *
  126. * @param first,last Range of elements to complement.
  127. * @param d_first Beginning of the destination range.
  128. * @return Output iterator to the element past the last element complemented.
  129. */
  130. template <class InputIt, class OutputIt>
  131. OutputIt complement(InputIt first, InputIt last, OutputIt d_first);
  132. }
  133. /// Functions which operate on sequences of IUPAC degenerate **RNA** base symbols.
  134. namespace rna
  135. {
  136. /**
  137. * Generates the complementary sequence for a sequence of IUPAC degenerate RNA base symbols.
  138. *
  139. * @param first,last Range of elements to complement.
  140. * @param d_first Beginning of the destination range.
  141. * @return Output iterator to the element past the last element complemented.
  142. */
  143. template <class InputIt, class OutputIt>
  144. OutputIt complement(InputIt first, InputIt last, OutputIt d_first);
  145. }
  146. template <class ForwardIt1, class ForwardIt2, class URBG>
  147. ForwardIt2 crossover(ForwardIt1 first1, ForwardIt1 last1, ForwardIt2 first2, URBG&& g)
  148. {
  149. typedef typename std::iterator_traits<ForwardIt1>::difference_type difference_t;
  150. std::uniform_int_distribution<difference_t> distribution(0, std::distance(first1, last1) - 1);
  151. difference_t pos = distribution(g);
  152. std::advance(first1, pos);
  153. std::advance(first2, pos);
  154. std::swap_ranges(first1, last1, first2);
  155. return first2;
  156. }
  157. template <class ForwardIt1, class ForwardIt2, class Size, class URBG>
  158. void crossover_n(ForwardIt1 first1, ForwardIt1 last1, ForwardIt2 first2, Size count, URBG&& g)
  159. {
  160. typedef typename std::iterator_traits<ForwardIt1>::difference_type difference_t;
  161. std::uniform_int_distribution<difference_t> distribution(0, std::distance(first1, last1) - 1);
  162. ForwardIt1 crossover1, crossover2;
  163. while (count)
  164. {
  165. crossover1 = first1;
  166. crossover2 = first2;
  167. difference_t pos = distribution(g);
  168. std::advance(crossover1, pos);
  169. std::advance(crossover2, pos);
  170. std::swap_ranges(crossover1, last1, crossover2);
  171. --count;
  172. }
  173. }
  174. template <class ForwardIt>
  175. orf<ForwardIt> find_orf(ForwardIt first, ForwardIt last, const codon::table& table)
  176. {
  177. ForwardIt second;
  178. ForwardIt third;
  179. orf<ForwardIt> result;
  180. auto distance = std::distance(first, last);
  181. if (distance >= 3)
  182. {
  183. second = first;
  184. ++second;
  185. third = second;
  186. ++third;
  187. do
  188. {
  189. if (codon::is_start(*first, *second, *third, table.starts))
  190. {
  191. result.start = first;
  192. distance -= 3;
  193. break;
  194. }
  195. first = second;
  196. second = third;
  197. ++third;
  198. --distance;
  199. }
  200. while (third != last);
  201. }
  202. for (; distance >= 3; distance -= 3)
  203. {
  204. first = ++third;
  205. second = ++third;
  206. ++third;
  207. if (codon::is_stop(*first, *second, *third, table.aas))
  208. {
  209. result.stop = first;
  210. return result;
  211. }
  212. }
  213. return {last, last};
  214. }
  215. template <class ForwardIt, class UnaryOperation, class URBG>
  216. ForwardIt mutate(ForwardIt first, ForwardIt last, UnaryOperation unary_op, URBG&& g)
  217. {
  218. typedef typename std::iterator_traits<ForwardIt>::difference_type difference_t;
  219. if (first == last)
  220. return first;
  221. std::uniform_int_distribution<difference_t> distribution(0, std::distance(first, last) - 1);
  222. std::advance(first, distribution(g));
  223. *first = unary_op(*first);
  224. return first;
  225. }
  226. template <class ForwardIt, class Size, class UnaryOperation, class URBG>
  227. void mutate_n(ForwardIt first, ForwardIt last, Size count, UnaryOperation unary_op, URBG&& g)
  228. {
  229. typedef typename std::iterator_traits<ForwardIt>::difference_type difference_t;
  230. if (first == last)
  231. return first;
  232. std::uniform_int_distribution<difference_t> distribution(0, std::distance(first, last) - 1);
  233. ForwardIt mutation;
  234. while (count)
  235. {
  236. mutation = first;
  237. std::advance(mutation, distribution(g));
  238. *mutation = unary_op(*mutation);
  239. --count;
  240. }
  241. }
  242. template <class ForwardIt1, class ForwardIt2>
  243. ForwardIt1 search(ForwardIt1 first, ForwardIt1 last, ForwardIt2 s_first, ForwardIt2 s_last, typename std::iterator_traits<ForwardIt1>::difference_type stride)
  244. {
  245. for (auto distance = std::distance(first, last); distance > 0; distance -= stride)
  246. {
  247. ForwardIt1 it = first;
  248. for (ForwardIt2 s_it = s_first; ; ++it, ++s_it)
  249. {
  250. if (s_it == s_last)
  251. return first;
  252. if (it == last)
  253. return last;
  254. if (!base::compare(*it, *s_it))
  255. break;
  256. }
  257. if (distance > stride)
  258. std::advance(first, stride);
  259. }
  260. return last;
  261. }
  262. template <class InputIt, class OutputIt>
  263. inline OutputIt transcribe(InputIt first, InputIt last, OutputIt d_first)
  264. {
  265. return std::transform(first, last, d_first, base::transcribe);
  266. }
  267. template <class InputIt, class OutputIt>
  268. OutputIt translate(InputIt first, InputIt last, OutputIt d_first, const codon::table& table)
  269. {
  270. auto length = std::distance(first, last);
  271. if (length >= 3)
  272. {
  273. InputIt second = first;
  274. ++second;
  275. InputIt third = second;
  276. ++third;
  277. *(d_first++) = codon::translate(*first, *second, *third, table.starts);
  278. for (length -= 3; length >= 3; length -= 3)
  279. {
  280. first = ++third;
  281. second = ++third;
  282. ++third;
  283. *(d_first++) = codon::translate(*first, *second, *third, table.aas);
  284. }
  285. }
  286. return d_first;
  287. }
  288. namespace dna
  289. {
  290. template <class InputIt, class OutputIt>
  291. inline OutputIt complement(InputIt first, InputIt last, OutputIt d_first)
  292. {
  293. return std::transform(first, last, d_first, base::dna::complement);
  294. }
  295. }
  296. namespace rna
  297. {
  298. template <class InputIt, class OutputIt>
  299. inline OutputIt complement(InputIt first, InputIt last, OutputIt d_first)
  300. {
  301. return std::transform(first, last, d_first, base::rna::complement);
  302. }
  303. }
  304. } // namespace sequence
  305. } // namespace genetics
  306. #endif // ANTKEEPER_GENETICS_SEQUENCE_HPP