Skip to content

Commit 3aed49f

Browse files
committed
feat: [aln] add program (@pg) entry to header
1 parent 0086c3c commit 3aed49f

File tree

1 file changed

+143
-2
lines changed

1 file changed

+143
-2
lines changed

src/alignment.rs

+143-2
Original file line numberDiff line numberDiff line change
@@ -1,3 +1,4 @@
1+
use std::borrow::Cow;
12
use std::cmp::{Ordering, Reverse};
23
use std::collections::BinaryHeap;
34
use std::collections::HashSet;
@@ -10,11 +11,14 @@ use rand::prelude::SliceRandom;
1011
use rand::{random, Rng, SeedableRng};
1112
use rust_htslib::bam;
1213
use rust_htslib::bam::ext::BamRecordExtensions;
13-
use rust_htslib::bam::{Format, Read};
14+
use rust_htslib::bam::header::HeaderRecord;
15+
use rust_htslib::bam::{Format, Header, Read};
1416

1517
use crate::cli::check_path_exists;
1618
use crate::Runner;
1719

20+
/// Program name written to the `PN` tag of the `@PG` header record this tool adds; it is also
/// the base from which that record's (possibly suffixed) `ID` is derived.
const RASUSA: &str = "rasusa";
21+
1822
#[derive(Debug, Parser)]
1923
#[command(author, version, about)]
2024
pub struct Alignment {
@@ -64,7 +68,11 @@ impl Runner for Alignment {
6468

6569
let mut reader =
6670
bam::IndexedReader::from_path(&self.aln).context("Failed to read alignment file")?;
67-
let header = bam::Header::from_template(reader.header());
71+
let mut header = bam::Header::from_template(reader.header());
72+
73+
// add rasusa program command line to header
74+
let program_record = self.program_entry(&header);
75+
header.push_record(&program_record);
6876

6977
let input_fmt = match infer_format_from_path(&self.aln) {
7078
Some(fmt) => fmt,
@@ -241,6 +249,55 @@ impl Runner for Alignment {
241249
}
242250
}
243251

252+
impl Alignment {
253+
/// Generates a rasusa program entry from a SAM header
254+
fn program_entry(&self, header: &Header) -> HeaderRecord {
255+
let (program_id, previous_pgid) = make_program_id_unique(header, RASUSA);
256+
257+
let mut record = HeaderRecord::new(b"PG");
258+
record.push_tag(b"ID", program_id);
259+
record.push_tag(b"PN", RASUSA);
260+
if let Some(pp) = previous_pgid {
261+
record.push_tag(b"PP", pp);
262+
}
263+
record.push_tag(b"VN", env!("CARGO_PKG_VERSION"));
264+
let cl = std::env::args().collect::<Vec<String>>().join(" ");
265+
record.push_tag(b"CL", cl);
266+
267+
record
268+
}
269+
}
270+
271+
/// Makes a program ID unique by looking for existing program records with the same ID and adding
272+
/// a suffix to the ID if necessary. Also returns the program ID of the last program in the header
273+
fn make_program_id_unique<'a>(
274+
header: &Header,
275+
program_id: &'a str,
276+
) -> (Cow<'a, str>, Option<String>) {
277+
let header_map = header.to_hashmap();
278+
let mut last_pg_id = None;
279+
let mut occurrences_of_id = 0;
280+
for (key, value) in header_map.iter() {
281+
if key == "PG" {
282+
for record in value {
283+
if let Some(id) = record.get("ID") {
284+
last_pg_id = Some(id.to_string());
285+
let id_before_last_dot = id.rfind('.').map(|i| &id[..i]).unwrap_or(id);
286+
if id_before_last_dot == program_id {
287+
occurrences_of_id += 1;
288+
}
289+
}
290+
}
291+
}
292+
}
293+
if occurrences_of_id == 0 {
294+
(Cow::Borrowed(program_id), last_pg_id)
295+
} else {
296+
let new_id = format!("{}.{}", program_id, occurrences_of_id);
297+
(Cow::Owned(new_id), last_pg_id)
298+
}
299+
}
300+
244301
/// Sorts the vector with a custom order where equal keys are randomly ordered.
245302
fn random_sort<T, K: Ord + Copy>(vec: &mut [T], key_extractor: fn(&T) -> K, mut rng: impl Rng) {
246303
vec.sort_by(|a, b| random_compare(key_extractor(a), key_extractor(b), &mut rng));
@@ -284,6 +341,8 @@ mod tests {
284341
use super::*;
285342
use assert_cmd::Command;
286343
use rand::prelude::StdRng;
344+
use rust_htslib::bam::HeaderView;
345+
287346
const SUB: &str = "aln";
288347

289348
#[test]
@@ -407,4 +466,86 @@ mod tests {
407466

408467
cmd.args(passed_args).assert().success();
409468
}
469+
470+
#[test]
fn test_make_program_id_unique_no_program() {
    // Three @PG records (minimap2, samtools, samtools.1); none of them is a rasusa entry.
    let sam_header: &[u8] = b"@HD\tVN:1.6\tSO:coordinate\n\
        @SQ\tSN:chromosome\tLN:5399960\n\
        @PG\tID:minimap2\tPN:minimap2\tVN:2.26-r1175\tCL:minimap2 -aL --cs --MD -t 4 -x map-ont KPC2__202310.5x.fq.gz\n\
        @PG\tID:samtools\tPN:samtools\tPP:minimap2\tVN:1.19.2\tCL:samtools sort -@ 4 -o KPC2__202310.5x.bam\n\
        @PG\tID:samtools.1\tPN:samtools\tPP:samtools\tVN:1.19\tCL:samtools view -s 0.5 -o test.bam KPC2__202310.5x.bam";
    let view = HeaderView::from_bytes(sam_header);
    let header = Header::from_template(&view);

    let (unique_id, previous) = make_program_id_unique(&header, "rasusa");

    // The ID is free, so it comes back untouched; the previous program is the last @PG entry.
    assert_eq!(unique_id, Cow::Borrowed("rasusa"));
    assert_eq!(previous.as_deref(), Some("samtools.1"));
}
486+
487+
#[test]
fn test_make_program_id_unique_one_program_occurrence() {
    // `minimap2` appears exactly once among the @PG records.
    let sam_header: &[u8] = b"@HD\tVN:1.6\tSO:coordinate\n\
        @SQ\tSN:chromosome\tLN:5399960\n\
        @PG\tID:minimap2\tPN:minimap2\tVN:2.26-r1175\tCL:minimap2 -aL --cs --MD -t 4 -x map-ont KPC2__202310.5x.fq.gz\n\
        @PG\tID:samtools\tPN:samtools\tPP:minimap2\tVN:1.19.2\tCL:samtools sort -@ 4 -o KPC2__202310.5x.bam\n\
        @PG\tID:samtools.1\tPN:samtools\tPP:samtools\tVN:1.19\tCL:samtools view -s 0.5 -o test.bam KPC2__202310.5x.bam";
    let view = HeaderView::from_bytes(sam_header);
    let header = Header::from_template(&view);

    let (unique_id, previous) = make_program_id_unique(&header, "minimap2");

    // One existing occurrence means the new ID gets the suffix `.1`.
    assert_eq!(unique_id, Cow::Borrowed("minimap2.1"));
    assert_eq!(previous.as_deref(), Some("samtools.1"));
}
503+
504+
#[test]
fn test_make_program_id_unique_two_program_occurrences() {
    // `samtools` and `samtools.1` both count as occurrences of `samtools`.
    let sam_header: &[u8] = b"@HD\tVN:1.6\tSO:coordinate\n\
        @SQ\tSN:chromosome\tLN:5399960\n\
        @PG\tID:minimap2\tPN:minimap2\tVN:2.26-r1175\tCL:minimap2 -aL --cs --MD -t 4 -x map-ont KPC2__202310.5x.fq.gz\n\
        @PG\tID:samtools\tPN:samtools\tPP:minimap2\tVN:1.19.2\tCL:samtools sort -@ 4 -o KPC2__202310.5x.bam\n\
        @PG\tID:samtools.1\tPN:samtools\tPP:samtools\tVN:1.19\tCL:samtools view -s 0.5 -o test.bam KPC2__202310.5x.bam";
    let view = HeaderView::from_bytes(sam_header);
    let header = Header::from_template(&view);

    let (unique_id, previous) = make_program_id_unique(&header, "samtools");

    // Two existing occurrences mean the new ID gets the suffix `.2`.
    assert_eq!(unique_id, Cow::Borrowed("samtools.2"));
    assert_eq!(previous.as_deref(), Some("samtools.1"));
}
520+
521+
#[test]
fn test_make_program_id_unique_no_programs() {
    // Header with @HD and @SQ lines only; there is no @PG record at all.
    let sam_header: &[u8] = b"@HD\tVN:1.6\tSO:coordinate\n\
        @SQ\tSN:chromosome\tLN:5399960";
    let view = HeaderView::from_bytes(sam_header);
    let header = Header::from_template(&view);

    let (unique_id, previous) = make_program_id_unique(&header, "samtools");

    // Nothing to clash with and no previous program to point at.
    assert_eq!(unique_id, Cow::Borrowed("samtools"));
    assert_eq!(previous, None);
}
533+
534+
#[test]
fn test_make_program_id_unique_program_id_startswith_same_substring() {
    // `samtoolsfoo` and `samtoolsfoo.1` merely share a prefix with `samtools`; only the exact
    // `samtools` record should be counted as an occurrence.
    let sam_header: &[u8] = b"@HD\tVN:1.6\tSO:coordinate\n\
        @SQ\tSN:chromosome\tLN:5399960\n\
        @PG\tID:minimap2\tPN:minimap2\tVN:2.26-r1175\tCL:minimap2 -aL --cs --MD -t 4 -x map-ont KPC2__202310.5x.fq.gz\n\
        @PG\tID:samtoolsfoo\tPN:samtools\tPP:minimap2\tVN:1.19.2\tCL:samtools sort -@ 4 -o KPC2__202310.5x.bam\n\
        @PG\tID:samtools\tPN:samtools\tPP:minimap2\tVN:1.19.2\tCL:samtools sort -@ 4 -o KPC2__202310.5x.bam\n\
        @PG\tID:samtoolsfoo.1\tPN:samtools\tPP:samtools\tVN:1.19\tCL:samtools view -s 0.5 -o test.bam KPC2__202310.5x.bam";
    let view = HeaderView::from_bytes(sam_header);
    let header = Header::from_template(&view);

    let (unique_id, previous) = make_program_id_unique(&header, "samtools");

    assert_eq!(unique_id, Cow::Borrowed("samtools.1"));
    assert_eq!(previous.as_deref(), Some("samtoolsfoo.1"));
}
410551
}

0 commit comments

Comments
 (0)