use rand::Rng;
use std::io::{Error, ErrorKind};
use std::fs;
use std::fmt;
use std::cmp;
use std::io::{Write, BufReader, BufRead};
use rayon::prelude::*;
use std::sync::{Arc, Mutex};
pub const NUCLEOTIDE: [char;4] = ['A', 'T', 'C', 'G'];
type Index = usize;
enum Nucleotide {
A,
T,
C,
G,
}
impl Nucleotide {
fn to_char(&self) -> char {
match self {
Nucleotide::A => 'A',
Nucleotide::T => 'T',
Nucleotide::C => 'C',
Nucleotide::G => 'G',
}
}
fn index_to_nt(index: usize) -> Result<Nucleotide, Error> {
match index {
0 => Ok(Nucleotide::A),
1 => Ok(Nucleotide::T),
2 => Ok(Nucleotide::C),
3 => Ok(Nucleotide::G),
_ => Err(Error::new(ErrorKind::Other, format!["Nucleotide not found at {}.", &index])),
}
}
}
#[derive(Debug)]
pub enum LORF {
One([Index; 2]),
Many(Vec<[Index; 2]>),
}
#[derive(Debug)]
pub struct Sequence {
pub seq: String,
}
impl fmt::Display for Sequence {
fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
write!(f, "{}", self.seq)
}
}
impl Sequence {
pub fn new(seq: String) -> Sequence {
Sequence{ seq: seq.to_uppercase() }
}
fn check(&self) -> bool {
true
}
pub fn find_at_index(&self, index: usize) -> Result<&str, Error> {
match self.seq.get(index..index+1) {
Some(n) => {
return Ok(n);
},
None => {
return Err(Error::new(ErrorKind::Other, format!["Nucleotide not found at {}.", &index]));
}
}
}
pub fn return_reading_frames(&self) -> Vec<Vec<&str>> {
let mut reading_frame = vec![Vec::new(), Vec::new(), Vec::new()];
for i in 0..3 {
if i > 0 {
reading_frame[i].push(&self.seq[0..i]);
}
let mut cut_seq = &self.seq[i..];
while !cut_seq.is_empty() {
let (codon, remaining_seq) = cut_seq.split_at(cmp::min(3, cut_seq.len()));
reading_frame[i].push(codon);
cut_seq = remaining_seq;
}
}
reading_frame
}
fn return_start_stop_positions(codon_list: Vec<&str>) -> (Vec<usize>, Vec<usize>){
let mut start_positons = Vec::new();
let mut stop_positions = Vec::new();
for (index, codon) in codon_list.into_iter().enumerate() {
match codon {
"ATG" => start_positons.push(index),
"TAA" | "TAG" | "TGA" => stop_positions.push(index),
_ => continue
}
}
(start_positons, stop_positions)
}
#[inline]
pub fn find_lorf(&self) -> LORF {
let reading_frames = self.return_reading_frames();
let mut lorf_list = vec![[0, 0]];
for frame in reading_frames {
let (start_positons, stop_positions) = Sequence::return_start_stop_positions(frame);
for start_index in &start_positons {
for (i, stop_index) in stop_positions.iter().enumerate() {
if start_index >= stop_index { continue }
if i > 0 {
let prev_stop_index = &stop_positions[i-1];
if start_index < prev_stop_index && prev_stop_index < stop_index { continue }
}
let orf_length = stop_index - start_index;
let lorf_length = lorf_list[0][1] - lorf_list[0][0];
if orf_length < lorf_length { continue }
if orf_length > lorf_length {
lorf_list = vec![[*start_index, *stop_index]];
continue;
}
if orf_length == lorf_length {
lorf_list.push([*start_index, *stop_index]);
continue;
}
}
}
}
if lorf_list.len() == 1 {
return LORF::One(lorf_list[0]);
}
else {
return LORF::Many(lorf_list);
}
}
#[inline]
pub fn concurrent_find_lorf(&self) -> LORF {
let reading_frames = self.return_reading_frames();
let lorf_mutex = Arc::new(Mutex::new(vec![[0, 0]]));
let mut handles = Vec::new();
for frame in reading_frames {
let (start_positons, stop_positions) = Sequence::return_start_stop_positions(frame);
let stop_positions = Arc::new(stop_positions);
let lorf_mutex = Arc::clone(&lorf_mutex);
let handle = std::thread::spawn(move || {
for start_index in start_positons {
for (i, stop_index) in (*stop_positions).iter().enumerate() {
if start_index >= *stop_index { continue }
if i > 0 {
let prev_stop_index = (*stop_positions)[i-1];
if start_index < prev_stop_index && prev_stop_index < *stop_index { continue }
}
let orf_length = stop_index - start_index;
let mut lorf_list = lorf_mutex.lock().unwrap();
let lorf_length = lorf_list[0][1] - lorf_list[0][0];
if orf_length < lorf_length { continue }
if orf_length > lorf_length {
*lorf_list = vec![[start_index, *stop_index]];
drop(lorf_list);
continue;
}
if orf_length == lorf_length {
(*lorf_list).push([start_index, *stop_index]);
drop(lorf_list);
continue;
}
}
}
});
handles.push(handle);
}
for handle in handles {
handle.join().unwrap();
}
let lorf_list = &*lorf_mutex.lock().unwrap();
if lorf_list.len() == 1 {
return LORF::One(lorf_list[0]);
}
else {
return LORF::Many(lorf_list.to_vec());
}
}
pub fn gen_random_seq(len: i64) -> Sequence {
let mut rng = rand::thread_rng();
let mut seq: String = String::new();
for _ in 0..len {
let i = rng.gen_range(0, 4);
seq.push(NUCLEOTIDE[i]);
}
Sequence{seq}
}
}
#[derive(Debug)]
pub struct FastaRecord {
pub header: String,
pub sequence: Sequence,
}
impl FastaRecord {
pub fn new(header: String, sequence: Sequence) -> FastaRecord {
FastaRecord{header, sequence}
}
}
impl fmt::Display for FastaRecord {
fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
write!(f, "Header: {}\nSequence: {}", self.header, self.sequence)
}
}
pub struct FASTA {
pub name: String,
pub content: Vec<FastaRecord>,
}
impl fmt::Display for FASTA {
fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
write!(f, "File Name: {}\n", self.name);
for record in &self.content {
write!(f, "Header: {}\nSequence: {}\n\n", record.header, record.sequence);
}
Ok(())
}
}
impl FASTA {
fn new(name: String, content: Vec<FastaRecord>) -> FASTA {
FASTA{name, content}
}
pub fn read_fasta(path: &str) -> FASTA {
let data = fs::read_to_string(path).unwrap();
let data: Vec<&str> = data.split('>').collect();
let mut records: Vec<FastaRecord> = Vec::new();
for entry in data {
if entry.is_empty() {continue}
let mut entry: Vec<&str> = entry.split("\n").collect();
let header = entry.remove(0);
let mut sequence: String = entry.into_iter().collect();
sequence = sequence.replace("\n", "").replace("\r", "");
let sequence = Sequence::new(sequence);
records.push(FastaRecord::new(header.to_string(), sequence));
}
FASTA::new(path.to_string(), records)
}
pub fn slow_read_fasta(path: &str) -> FASTA {
let file = fs::File::open(path).expect("path to file not found");
let reader = BufReader::new(file);
let mut records = Vec::new();
let mut temp_header = "".to_string();
let mut temp_seq = "".to_string();
for (index, line) in reader.lines().enumerate() {
let line = line.unwrap();
if line.is_empty() {continue}
if line.contains('>') {
if index > 0 {
records.push(FastaRecord::new(temp_header, Sequence::new(temp_seq.to_owned())));
}
temp_header = line;
continue;
}
temp_seq.push_str(&line);
}
records.push(FastaRecord::new(temp_header, Sequence::new(temp_seq.to_owned())));
FASTA::new(path.to_string(), records)
}
pub fn rayon_read_fasta(path: &str) -> FASTA {
let data = fs::read_to_string(path).unwrap();
let data: Vec<&str> = data.split('>').collect();
let mut records: Vec<FastaRecord> = Vec::new();
for entry in data {
if entry.is_empty() {continue}
let mut entry: Vec<&str> = entry.par_lines().collect();
let header = entry.remove(0);
let mut sequence: String = entry.into_iter().collect();
sequence = sequence.replace("\n", "").replace("\r", "");
let sequence = Sequence::new(sequence);
records.push(FastaRecord::new(header.to_string(), sequence));
}
FASTA::new(path.to_string(), records)
}
}
#[cfg(test)]
mod tests {
use super::*;
#[test]
#[ignore]
fn test_sequence_struct() {
let sequence = Sequence::gen_random_seq(1000);
assert_eq!(sequence.seq.len(), 1000);
println!("Ok");
assert![sequence.find_at_index(10).is_ok()];
println!("Ok");
assert![sequence.find_at_index(2000).is_err()];
println!("Ok");
println!("{}", sequence);
sequence.return_reading_frames();
}
#[test]
fn lorf() {
let sequence = Sequence::new("ATGGGAATGTGA".to_string());
let lorf = sequence.find_lorf();
println!("{:?}", lorf);
match lorf {
LORF::One(value) => assert!(value == [0, 3]),
_ => panic!("at the disco"),
}
let lorf_concurrent = sequence.concurrent_find_lorf();
println!("{:?}", lorf_concurrent);
match lorf_concurrent {
LORF::One(value) => assert!(value == [0, 3]),
_ => panic!("at the disco"),
}
}
#[test]
fn compare_lorf_methods() {
let long_sequence = Sequence::gen_random_seq(10000);
let lorf = long_sequence.find_lorf();
println!("{:?}", lorf);
let lorf_concurrent = long_sequence.concurrent_find_lorf();
println!("{:?}", lorf_concurrent);
}
#[test]
fn test_rayon_fasta() {
let fasta = FASTA::rayon_read_fasta("data/haha-1.fasta");
println!("{}", fasta);
}
}