|
| 1 | +use crate::utils::*; |
| 2 | +use anyhow::{Ok, Result}; |
| 3 | +use bio::io::fastq; |
| 4 | +use log::*; |
| 5 | + |
/// Returns the reverse complement of a nucleotide sequence.
///
/// The input is walked from its last byte to its first and each base is
/// replaced by its complement (`A`<->`T`, `C`<->`G`). Letter case is
/// preserved, so lowercase (soft-masked) bases are complemented as well.
/// Any other byte, such as the unknown base `N`, is copied through unchanged.
fn reverse_complement(seq: &[u8]) -> Vec<u8> {
    seq.iter()
        .rev()
        .map(|&base| match base {
            b'A' => b'T',
            b'T' => b'A',
            b'C' => b'G',
            b'G' => b'C',
            b'a' => b't',
            b't' => b'a',
            b'c' => b'g',
            b'g' => b'c',
            // Unknown base like N: keep it as it is.
            other => other,
        })
        .collect()
}
| 18 | + |
| 19 | +pub fn join_overlap( |
| 20 | + read1: &str, |
| 21 | + read2: &str, |
| 22 | + max_mismatch_rate: f64, |
| 23 | + min_overlap_len: usize, |
| 24 | + overlap_merge: Option<&String>, |
| 25 | + nonoverlap_pe: Option<&String>, |
| 26 | + compression_level: u32, |
| 27 | + stdout_type: char, |
| 28 | +) -> Result<()> { |
| 29 | + |
| 30 | + let reader1 = fastq::Reader::new(file_reader(Some(read1))?); |
| 31 | + let reader2 = fastq::Reader::new(file_reader(Some(read2))?); |
| 32 | + |
| 33 | + let mut count_join = 0usize; |
| 34 | + let mut count_total = 0usize; |
| 35 | + let mut writer_single = fastq::Writer::new(file_writer(overlap_merge, compression_level, stdout_type)?); |
| 36 | + let mut nuoverlap_writer = fastq::Writer::new(file_writer(nonoverlap_pe, compression_level, stdout_type)?); |
| 37 | + |
| 38 | + for (rec1,rec2) in reader1.records().map_while(Result::ok).zip(reader2.records().map_while(Result::ok)) { |
| 39 | + count_total += 1; |
| 40 | + let max_overlap_len = rec1.seq().len().min(rec2.seq().len()); |
| 41 | + let rec2_seq_rev = reverse_complement(rec2.seq()); |
| 42 | + |
| 43 | + let mut fine_overlap_len = 0; |
| 44 | + if min_overlap_len <= max_overlap_len { |
| 45 | + for overlap_len in (min_overlap_len..=max_overlap_len).rev() { |
| 46 | + assert!(rec1.seq().len() >= overlap_len && rec2_seq_rev.len() >= overlap_len); |
| 47 | + // overlap region in PE reads |
| 48 | + let over1 = &rec1.seq()[rec1.seq().len()-overlap_len..]; |
| 49 | + let over2 = &rec2_seq_rev[..overlap_len]; |
| 50 | + |
| 51 | + let mismatch = over1.iter().zip(over2.iter()).filter(|(x,y)| x != y).count(); |
| 52 | + let mismatch_rate = mismatch as f64 / overlap_len as f64; |
| 53 | + |
| 54 | + if max_mismatch_rate < mismatch_rate { continue; } // mismatch count too much |
| 55 | + if overlap_len < min_overlap_len { continue; } // overlap length is too short |
| 56 | + // pe reads overlaped |
| 57 | + fine_overlap_len = overlap_len; |
| 58 | + break; |
| 59 | + } |
| 60 | + } |
| 61 | + |
| 62 | + // build longer single read |
| 63 | + if fine_overlap_len > 0{ |
| 64 | + count_join += 1; |
| 65 | + let mut single_seq = vec![]; |
| 66 | + let mut single_qual = vec![]; |
| 67 | + single_seq.extend_from_slice(&rec1.seq()[..rec1.seq().len()-fine_overlap_len]); |
| 68 | + single_qual.extend_from_slice(&rec1.qual()[..rec1.qual().len()-fine_overlap_len]); |
| 69 | + |
| 70 | + let overlap_r1_qual = &rec1.qual()[rec1.qual().len()-fine_overlap_len..]; |
| 71 | + let overlap_r1_seq = &rec1.seq()[rec1.seq().len()-fine_overlap_len..]; |
| 72 | + |
| 73 | + let rec2_qual_rev = rec2.qual().iter().copied().rev().collect::<Vec<u8>>(); |
| 74 | + let overlap_r2_qual = &rec2_qual_rev[..fine_overlap_len]; |
| 75 | + let overlap_r2_seq = &rec2_seq_rev[..fine_overlap_len]; |
| 76 | + |
| 77 | + for i in 0..fine_overlap_len { |
| 78 | + // Prefer R1 if quality is equal |
| 79 | + if overlap_r1_qual[i] >= overlap_r2_qual[i] { |
| 80 | + single_seq.push(overlap_r1_seq[i]); |
| 81 | + single_qual.push(overlap_r1_qual[i]); |
| 82 | + } else { |
| 83 | + single_seq.push(overlap_r2_seq[i]); |
| 84 | + single_qual.push(overlap_r2_qual[i]); |
| 85 | + }; |
| 86 | + } |
| 87 | + single_seq.extend_from_slice(&rec2_seq_rev[fine_overlap_len..]); |
| 88 | + single_qual.extend_from_slice(&rec2_qual_rev[fine_overlap_len..]); |
| 89 | + |
| 90 | + let read_long = fastq::Record::with_attrs(rec1.id(), rec1.desc(), &single_seq, &single_qual); |
| 91 | + writer_single.write_record(&read_long)?; |
| 92 | + } else { |
| 93 | + // no overlap |
| 94 | + if nonoverlap_pe.is_some() { |
| 95 | + nuoverlap_writer.write_record(&rec1)?; |
| 96 | + nuoverlap_writer.write_record(&rec2)?; |
| 97 | + } |
| 98 | + |
| 99 | + } |
| 100 | + } |
| 101 | + writer_single.flush()?; |
| 102 | + nuoverlap_writer.flush()?; |
| 103 | + |
| 104 | + let rate = count_join as f64 / count_total as f64; |
| 105 | + info!("total pe reads overlaped and joined number: {}", count_join); |
| 106 | + info!("total pe reads number: {}", count_total); |
| 107 | + info!("pe reads overlap rate: {:.2}%", rate*100.0); |
| 108 | + |
| 109 | + Ok(()) |
| 110 | +} |
| 111 | + |
0 commit comments