1
+ use std:: borrow:: Cow ;
1
2
use std:: cmp:: { Ordering , Reverse } ;
2
3
use std:: collections:: BinaryHeap ;
3
4
use std:: collections:: HashSet ;
@@ -10,11 +11,14 @@ use rand::prelude::SliceRandom;
10
11
use rand:: { random, Rng , SeedableRng } ;
11
12
use rust_htslib:: bam;
12
13
use rust_htslib:: bam:: ext:: BamRecordExtensions ;
13
- use rust_htslib:: bam:: { Format , Read } ;
14
+ use rust_htslib:: bam:: header:: HeaderRecord ;
15
+ use rust_htslib:: bam:: { Format , Header , Read } ;
14
16
15
17
use crate :: cli:: check_path_exists;
16
18
use crate :: Runner ;
17
19
20
+ const RASUSA : & str = "rasusa" ;
21
+
18
22
#[ derive( Debug , Parser ) ]
19
23
#[ command( author, version, about) ]
20
24
pub struct Alignment {
@@ -64,7 +68,11 @@ impl Runner for Alignment {
64
68
65
69
let mut reader =
66
70
bam:: IndexedReader :: from_path ( & self . aln ) . context ( "Failed to read alignment file" ) ?;
67
- let header = bam:: Header :: from_template ( reader. header ( ) ) ;
71
+ let mut header = bam:: Header :: from_template ( reader. header ( ) ) ;
72
+
73
+ // add rasusa program command line to header
74
+ let program_record = self . program_entry ( & header) ;
75
+ header. push_record ( & program_record) ;
68
76
69
77
let input_fmt = match infer_format_from_path ( & self . aln ) {
70
78
Some ( fmt) => fmt,
@@ -241,6 +249,55 @@ impl Runner for Alignment {
241
249
}
242
250
}
243
251
252
+ impl Alignment {
253
+ /// Generates a rasusa program entry from a SAM header
254
+ fn program_entry ( & self , header : & Header ) -> HeaderRecord {
255
+ let ( program_id, previous_pgid) = make_program_id_unique ( header, RASUSA ) ;
256
+
257
+ let mut record = HeaderRecord :: new ( b"PG" ) ;
258
+ record. push_tag ( b"ID" , program_id) ;
259
+ record. push_tag ( b"PN" , RASUSA ) ;
260
+ if let Some ( pp) = previous_pgid {
261
+ record. push_tag ( b"PP" , pp) ;
262
+ }
263
+ record. push_tag ( b"VN" , env ! ( "CARGO_PKG_VERSION" ) ) ;
264
+ let cl = std:: env:: args ( ) . collect :: < Vec < String > > ( ) . join ( " " ) ;
265
+ record. push_tag ( b"CL" , cl) ;
266
+
267
+ record
268
+ }
269
+ }
270
+
271
+ /// Makes a program ID unique by looking for existing program records with the same ID and adding
272
+ /// a suffix to the ID if necessary. Also returns the program ID of the last program in the header
273
+ fn make_program_id_unique < ' a > (
274
+ header : & Header ,
275
+ program_id : & ' a str ,
276
+ ) -> ( Cow < ' a , str > , Option < String > ) {
277
+ let header_map = header. to_hashmap ( ) ;
278
+ let mut last_pg_id = None ;
279
+ let mut occurrences_of_id = 0 ;
280
+ for ( key, value) in header_map. iter ( ) {
281
+ if key == "PG" {
282
+ for record in value {
283
+ if let Some ( id) = record. get ( "ID" ) {
284
+ last_pg_id = Some ( id. to_string ( ) ) ;
285
+ let id_before_last_dot = id. rfind ( '.' ) . map ( |i| & id[ ..i] ) . unwrap_or ( id) ;
286
+ if id_before_last_dot == program_id {
287
+ occurrences_of_id += 1 ;
288
+ }
289
+ }
290
+ }
291
+ }
292
+ }
293
+ if occurrences_of_id == 0 {
294
+ ( Cow :: Borrowed ( program_id) , last_pg_id)
295
+ } else {
296
+ let new_id = format ! ( "{}.{}" , program_id, occurrences_of_id) ;
297
+ ( Cow :: Owned ( new_id) , last_pg_id)
298
+ }
299
+ }
300
+
244
301
/// Sorts the vector with a custom order where equal keys are randomly ordered.
245
302
fn random_sort < T , K : Ord + Copy > ( vec : & mut [ T ] , key_extractor : fn ( & T ) -> K , mut rng : impl Rng ) {
246
303
vec. sort_by ( |a, b| random_compare ( key_extractor ( a) , key_extractor ( b) , & mut rng) ) ;
@@ -284,6 +341,8 @@ mod tests {
284
341
use super :: * ;
285
342
use assert_cmd:: Command ;
286
343
use rand:: prelude:: StdRng ;
344
+ use rust_htslib:: bam:: HeaderView ;
345
+
287
346
const SUB : & str = "aln" ;
288
347
289
348
#[ test]
@@ -407,4 +466,86 @@ mod tests {
407
466
408
467
cmd. args ( passed_args) . assert ( ) . success ( ) ;
409
468
}
469
+
470
+ #[ test]
471
+ fn test_make_program_id_unique_no_program ( ) {
472
+ let template = HeaderView :: from_bytes ( b"@HD\t VN:1.6\t SO:coordinate
473
+ @SQ\t SN:chromosome\t LN:5399960
474
+ @PG\t ID:minimap2\t PN:minimap2\t VN:2.26-r1175\t CL:minimap2 -aL --cs --MD -t 4 -x map-ont KPC2__202310.5x.fq.gz
475
+ @PG\t ID:samtools\t PN:samtools\t PP:minimap2\t VN:1.19.2\t CL:samtools sort -@ 4 -o KPC2__202310.5x.bam
476
+ @PG\t ID:samtools.1\t PN:samtools\t PP:samtools\t VN:1.19\t CL:samtools view -s 0.5 -o test.bam KPC2__202310.5x.bam" ) ;
477
+ let header = Header :: from_template ( & template) ;
478
+ let program_id = "rasusa" ;
479
+ let actual = make_program_id_unique ( & header, program_id) ;
480
+ let expected = (
481
+ Cow :: < str > :: Borrowed ( program_id) ,
482
+ Some ( "samtools.1" . to_string ( ) ) ,
483
+ ) ;
484
+ assert_eq ! ( actual, expected) ;
485
+ }
486
+
487
+ #[ test]
488
+ fn test_make_program_id_unique_one_program_occurrence ( ) {
489
+ let template = HeaderView :: from_bytes ( b"@HD\t VN:1.6\t SO:coordinate
490
+ @SQ\t SN:chromosome\t LN:5399960
491
+ @PG\t ID:minimap2\t PN:minimap2\t VN:2.26-r1175\t CL:minimap2 -aL --cs --MD -t 4 -x map-ont KPC2__202310.5x.fq.gz
492
+ @PG\t ID:samtools\t PN:samtools\t PP:minimap2\t VN:1.19.2\t CL:samtools sort -@ 4 -o KPC2__202310.5x.bam
493
+ @PG\t ID:samtools.1\t PN:samtools\t PP:samtools\t VN:1.19\t CL:samtools view -s 0.5 -o test.bam KPC2__202310.5x.bam" ) ;
494
+ let header = Header :: from_template ( & template) ;
495
+ let program_id = "minimap2" ;
496
+ let actual = make_program_id_unique ( & header, program_id) ;
497
+ let expected = (
498
+ Cow :: < str > :: Owned ( "minimap2.1" . to_string ( ) ) ,
499
+ Some ( "samtools.1" . to_string ( ) ) ,
500
+ ) ;
501
+ assert_eq ! ( actual, expected) ;
502
+ }
503
+
504
+ #[ test]
505
+ fn test_make_program_id_unique_two_program_occurrences ( ) {
506
+ let template = HeaderView :: from_bytes ( b"@HD\t VN:1.6\t SO:coordinate
507
+ @SQ\t SN:chromosome\t LN:5399960
508
+ @PG\t ID:minimap2\t PN:minimap2\t VN:2.26-r1175\t CL:minimap2 -aL --cs --MD -t 4 -x map-ont KPC2__202310.5x.fq.gz
509
+ @PG\t ID:samtools\t PN:samtools\t PP:minimap2\t VN:1.19.2\t CL:samtools sort -@ 4 -o KPC2__202310.5x.bam
510
+ @PG\t ID:samtools.1\t PN:samtools\t PP:samtools\t VN:1.19\t CL:samtools view -s 0.5 -o test.bam KPC2__202310.5x.bam" ) ;
511
+ let header = Header :: from_template ( & template) ;
512
+ let program_id = "samtools" ;
513
+ let actual = make_program_id_unique ( & header, program_id) ;
514
+ let expected = (
515
+ Cow :: < str > :: Owned ( "samtools.2" . to_string ( ) ) ,
516
+ Some ( "samtools.1" . to_string ( ) ) ,
517
+ ) ;
518
+ assert_eq ! ( actual, expected) ;
519
+ }
520
+
521
+ #[ test]
522
+ fn test_make_program_id_unique_no_programs ( ) {
523
+ let template = HeaderView :: from_bytes (
524
+ b"@HD\t VN:1.6\t SO:coordinate
525
+ @SQ\t SN:chromosome\t LN:5399960" ,
526
+ ) ;
527
+ let header = Header :: from_template ( & template) ;
528
+ let program_id = "samtools" ;
529
+ let actual = make_program_id_unique ( & header, program_id) ;
530
+ let expected = ( Cow :: Borrowed ( "samtools" ) , None ) ;
531
+ assert_eq ! ( actual, expected) ;
532
+ }
533
+
534
+ #[ test]
535
+ fn test_make_program_id_unique_program_id_startswith_same_substring ( ) {
536
+ let template = HeaderView :: from_bytes ( b"@HD\t VN:1.6\t SO:coordinate
537
+ @SQ\t SN:chromosome\t LN:5399960
538
+ @PG\t ID:minimap2\t PN:minimap2\t VN:2.26-r1175\t CL:minimap2 -aL --cs --MD -t 4 -x map-ont KPC2__202310.5x.fq.gz
539
+ @PG\t ID:samtoolsfoo\t PN:samtools\t PP:minimap2\t VN:1.19.2\t CL:samtools sort -@ 4 -o KPC2__202310.5x.bam
540
+ @PG\t ID:samtools\t PN:samtools\t PP:minimap2\t VN:1.19.2\t CL:samtools sort -@ 4 -o KPC2__202310.5x.bam
541
+ @PG\t ID:samtoolsfoo.1\t PN:samtools\t PP:samtools\t VN:1.19\t CL:samtools view -s 0.5 -o test.bam KPC2__202310.5x.bam" ) ;
542
+ let header = Header :: from_template ( & template) ;
543
+ let program_id = "samtools" ;
544
+ let actual = make_program_id_unique ( & header, program_id) ;
545
+ let expected = (
546
+ Cow :: < str > :: Owned ( "samtools.1" . to_string ( ) ) ,
547
+ Some ( "samtoolsfoo.1" . to_string ( ) ) ,
548
+ ) ;
549
+ assert_eq ! ( actual, expected) ;
550
+ }
410
551
}
0 commit comments