Skip to content

Commit d202d61

Browse files
committed
add subcommnad join for merge overlap pe reads
1 parent 013dc65 commit d202d61

File tree

4 files changed

+156
-2
lines changed

4 files changed

+156
-2
lines changed

src/cli/join.rs

Lines changed: 111 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,111 @@
1+
use crate::utils::*;
2+
use anyhow::{Ok, Result};
3+
use bio::io::fastq;
4+
use log::*;
5+
6+
fn reverse_complement(seq: &[u8]) -> Vec<u8> {
7+
seq.iter()
8+
.rev()
9+
.map(|b| match b {
10+
b'A' => b'T',
11+
b'T' => b'A',
12+
b'C' => b'G',
13+
b'G' => b'C',
14+
_ => *b, // unknow base like N
15+
})
16+
.collect()
17+
}
18+
19+
pub fn join_overlap(
20+
read1: &str,
21+
read2: &str,
22+
max_mismatch_rate: f64,
23+
min_overlap_len: usize,
24+
overlap_merge: Option<&String>,
25+
nonoverlap_pe: Option<&String>,
26+
compression_level: u32,
27+
stdout_type: char,
28+
) -> Result<()> {
29+
30+
let reader1 = fastq::Reader::new(file_reader(Some(read1))?);
31+
let reader2 = fastq::Reader::new(file_reader(Some(read2))?);
32+
33+
let mut count_join = 0usize;
34+
let mut count_total = 0usize;
35+
let mut writer_single = fastq::Writer::new(file_writer(overlap_merge, compression_level, stdout_type)?);
36+
let mut nuoverlap_writer = fastq::Writer::new(file_writer(nonoverlap_pe, compression_level, stdout_type)?);
37+
38+
for (rec1,rec2) in reader1.records().map_while(Result::ok).zip(reader2.records().map_while(Result::ok)) {
39+
count_total += 1;
40+
let max_overlap_len = rec1.seq().len().min(rec2.seq().len());
41+
let rec2_seq_rev = reverse_complement(rec2.seq());
42+
43+
let mut fine_overlap_len = 0;
44+
if min_overlap_len <= max_overlap_len {
45+
for overlap_len in (min_overlap_len..=max_overlap_len).rev() {
46+
assert!(rec1.seq().len() >= overlap_len && rec2_seq_rev.len() >= overlap_len);
47+
// overlap region in PE reads
48+
let over1 = &rec1.seq()[rec1.seq().len()-overlap_len..];
49+
let over2 = &rec2_seq_rev[..overlap_len];
50+
51+
let mismatch = over1.iter().zip(over2.iter()).filter(|(x,y)| x != y).count();
52+
let mismatch_rate = mismatch as f64 / overlap_len as f64;
53+
54+
if max_mismatch_rate < mismatch_rate { continue; } // mismatch count too much
55+
if overlap_len < min_overlap_len { continue; } // overlap length is too short
56+
// pe reads overlaped
57+
fine_overlap_len = overlap_len;
58+
break;
59+
}
60+
}
61+
62+
// build longer single read
63+
if fine_overlap_len > 0{
64+
count_join += 1;
65+
let mut single_seq = vec![];
66+
let mut single_qual = vec![];
67+
single_seq.extend_from_slice(&rec1.seq()[..rec1.seq().len()-fine_overlap_len]);
68+
single_qual.extend_from_slice(&rec1.qual()[..rec1.qual().len()-fine_overlap_len]);
69+
70+
let overlap_r1_qual = &rec1.qual()[rec1.qual().len()-fine_overlap_len..];
71+
let overlap_r1_seq = &rec1.seq()[rec1.seq().len()-fine_overlap_len..];
72+
73+
let rec2_qual_rev = rec2.qual().iter().copied().rev().collect::<Vec<u8>>();
74+
let overlap_r2_qual = &rec2_qual_rev[..fine_overlap_len];
75+
let overlap_r2_seq = &rec2_seq_rev[..fine_overlap_len];
76+
77+
for i in 0..fine_overlap_len {
78+
// Prefer R1 if quality is equal
79+
if overlap_r1_qual[i] >= overlap_r2_qual[i] {
80+
single_seq.push(overlap_r1_seq[i]);
81+
single_qual.push(overlap_r1_qual[i]);
82+
} else {
83+
single_seq.push(overlap_r2_seq[i]);
84+
single_qual.push(overlap_r2_qual[i]);
85+
};
86+
}
87+
single_seq.extend_from_slice(&rec2_seq_rev[fine_overlap_len..]);
88+
single_qual.extend_from_slice(&rec2_qual_rev[fine_overlap_len..]);
89+
90+
let read_long = fastq::Record::with_attrs(rec1.id(), rec1.desc(), &single_seq, &single_qual);
91+
writer_single.write_record(&read_long)?;
92+
} else {
93+
// no overlap
94+
if nonoverlap_pe.is_some() {
95+
nuoverlap_writer.write_record(&rec1)?;
96+
nuoverlap_writer.write_record(&rec2)?;
97+
}
98+
99+
}
100+
}
101+
writer_single.flush()?;
102+
nuoverlap_writer.flush()?;
103+
104+
let rate = count_join as f64 / count_total as f64;
105+
info!("total pe reads overlaped and joined number: {}", count_join);
106+
info!("total pe reads number: {}", count_total);
107+
info!("pe reads overlap rate: {:.2}%", rate*100.0);
108+
109+
Ok(())
110+
}
111+

src/cli/mod.rs

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -9,6 +9,7 @@ pub mod fq2sam;
99
pub mod fqscore;
1010
pub mod gcplot;
1111
pub mod grep;
12+
pub mod join;
1213
pub mod kmer;
1314
pub mod length;
1415
pub mod mask;

src/command.rs

Lines changed: 30 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -227,6 +227,36 @@ pub enum Subcli {
227227
#[arg(short = 'r', long = "out2", value_name = "FILE")]
228228
out2: String,
229229
},
230+
/// join paired end reads that are overlapping into a single longer read
231+
#[command(before_help=r"Note:
232+
1. if overlapping regions are detected, low quality bases are corrected by high quality paired bases.
233+
2. if a base is corrected, the quality value is also corrected.
234+
3. only paired end reads such as the following will be detected and merged, overlap mode:
235+
r1: GCAAGCGTTAATCGGAATTTATGGGCGTAAAGCGCACGCAGGA
236+
|\|||||||||||||||||||||||\
237+
TCTATGGGCGTAAAGCGCACGCAGGCATGCTGGGCGTAAAGCGCACGCAGGC r2: reverse complement
238+
")]
239+
join {
240+
/// input read1 fastq file
241+
#[arg(short = '1', long = "read1", value_name = "FILE")]
242+
read1: String,
243+
/// input read2 fastq file
244+
#[arg(short = '2', long = "read2", value_name = "FILE")]
245+
read2: String,
246+
/// minimum overlap length in PE reads
247+
#[arg(short = 'l', long = "length", default_value_t=30)]
248+
length: usize,
249+
/// maximum mismatch rate count in overlap region
250+
#[arg(short = 'm', long = "miss", default_value_t=0.1)]
251+
miss: f64,
252+
/// output joinde long fastq file name or write to stdout, file ending in .gz/.bz2/.xz will be compressed automatically
253+
#[arg(short = 'o', long = "output")]
254+
output: Option<String>,
255+
/// output interleaved fastq file name for non-overlap pe reads
256+
#[arg(short = 'n', long = "non-overlap", value_name = "FILE")]
257+
non: Option<String>,
258+
259+
},
230260
/// print fastq records in a range
231261
range {
232262
/// input fastq file, or read from stdin

src/main.rs

Lines changed: 14 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -11,11 +11,11 @@ mod utils;
1111
use command::*;
1212
mod cli;
1313
use cli::{
14-
barcode::*, check::*, concat::*, cutadapter::cut_adapter, filter::*, flatten::*, fq2fa::*,
14+
barcode::*, check::*, concat::*, cutadapter::*, filter::*, flatten::*, fq2fa::*,
1515
fq2sam::*, fqscore::*, gcplot::*, grep::*, kmer::*, length::*, mask::*, merge::*, plot::*,
1616
range::*, remove::*, rename::*, reverse::*, search::*, select::*, shuffle::*, size::*,
1717
slide::*, sort::*, split::*, split2::*, stats::*, subfq::*, tail::*, top::*, trimfq::*,
18-
view::*,
18+
view::*, join::*,
1919
};
2020

2121
fn main() -> Result<(), Error> {
@@ -557,6 +557,18 @@ fn main() -> Result<(), Error> {
557557
arg.stdout_type,
558558
)?;
559559
}
560+
Subcli::join { read1, read2, length, miss, output, non } => {
561+
join_overlap(
562+
&read1,
563+
&read2,
564+
miss,
565+
length,
566+
output.as_ref(),
567+
non.as_ref(),
568+
arg.compression_level,
569+
arg.stdout_type
570+
)?;
571+
}
560572
Subcli::kmer {
561573
input,
562574
size,

0 commit comments

Comments
 (0)