Source file bam.ml
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
module Result = Biocaml_result
open CFStream
open Or_error
let check b msg =
if b then Ok ()
else error_string msg
let checkf b format = Printf.ksprintf (check b) format
let check_buf ~buf ~pos ~len =
check (String.length buf >= pos + len) "Buffer too short"
let get_8_0 =
let mask = Int32.of_int_exn 0xff in
fun x -> Int32.bit_and mask x |> Int32.to_int_exn
let get_16_0 =
let mask = Int32.of_int_exn 0xffff in
fun x -> Int32.bit_and mask x |> Int32.to_int_exn
let get_16_8 x =
get_8_0 (Int32.shift_right x 8)
let get_32_4 x =
Int32.shift_right x 4 |> Int32.to_int_exn
let get_4_0 =
let mask = Int32.of_int_exn 0xf in
fun x -> Int32.bit_and mask x |> Int32.to_int_exn
module BP = struct
let unpack_unsigned_8 ~buf ~pos = Char.to_int (String.get buf pos)
let unpack_signed_8 ~buf ~pos =
let n = unpack_unsigned_8 ~buf ~pos in
if n >= 0x80 then -(0x100 - n) else n
let unpack_signed_32_little_endian ~buf ~pos =
let b1 =
Int32.shift_left
(Int32.of_int_exn (Caml.Char.code (String.get buf (pos + 3))))
24
in
let b2 = Char.to_int (String.get buf (pos + 2)) lsl 16 in
let b3 = Char.to_int (String.get buf (pos + 1)) lsl 8 in
let b4 = Char.to_int (String.get buf pos) in
Int32.bit_or b1 (Int32.of_int_exn (b2 lor b3 lor b4))
let unpack_unsigned_16_little_endian ~buf ~pos =
let b1 = Char.to_int (String.get buf (pos + 1)) lsl 8 in
let b2 = Char.to_int (String.get buf pos) in
b1 lor b2
let unpack_signed_16_little_endian ~buf ~pos =
let n = unpack_unsigned_16_little_endian ~buf ~pos in
if n >= 0x8000 then -(0x10000 - n) else n
let unpack_unsigned_32_little_endian ~buf ~pos =
let b1 = Char.to_int (String.get buf (pos + 0)) |> Int64.of_int_exn in
let b2 = Char.to_int (String.get buf (pos + 1)) in
let b3 = Char.to_int (String.get buf (pos + 2)) in
let b4 = Char.to_int (String.get buf (pos + 3)) in
Int64.bit_or
(Int64.shift_left b1 24)
(Int64.of_int_exn (b2 lor b3 lor b4))
let unpack_signed_64_little_endian ~buf ~pos =
Int64.bit_or
(Int64.bit_or
(Int64.shift_left
(Int64.of_int_exn (
((Char.to_int (String.get buf (pos + 7)) lsl 16))
lor (Char.to_int (String.get buf (pos + 6)) lsl 8)
lor Char.to_int (String.get buf (pos + 5))))
40)
(Int64.shift_left
(Int64.of_int_exn
((Char.to_int (String.get buf (pos + 4)) lsl 16)
lor (Char.to_int (String.get buf (pos + 3)) lsl 8)
lor Char.to_int (String.get buf (pos + 2))))
16))
(Int64.of_int_exn (
(Char.to_int (String.get buf (pos + 1)) lsl 8) lor Char.to_int (String.get buf pos)))
let unpack_float_little_endian ~buf ~pos =
Int64.float_of_bits (unpack_signed_64_little_endian ~buf ~pos)
let pack_signed_32_little_endian bits =
let buf = Bytes.create 4 in
Binary_packing.pack_signed_32 ~byte_order:`Little_endian bits ~buf ~pos:0 ;
Bytes.unsafe_to_string ~no_mutation_while_string_reachable:buf
let pack_u32_in_int64_little_endian v =
let buf = Bytes.create 4 in
Bytes.set buf 0 (Char.unsafe_of_int Int64.(to_int_exn (bit_and 0xFFL v))) ;
Bytes.set buf 3 (Char.unsafe_of_int Int64.(to_int_exn (bit_and 0xFFL (shift_right_logical v 24)))) ;
Bytes.unsafe_set buf 1
(Char.unsafe_of_int Int64.(to_int_exn (bit_and 0xFFL (shift_right_logical v 8)))) ;
Bytes.unsafe_set buf 2
(Char.unsafe_of_int Int64.(to_int_exn (bit_and 0xFFL (shift_right_logical v 16)))) ;
Bytes.unsafe_to_string ~no_mutation_while_string_reachable:buf
end
let int64_is_neg n =
Int64.(bit_and 0x8000000000000000L n <> zero)
let int64_fits_u32 n =
Int64.(bit_and 0xFFFFFFFF00000000L n = zero)
let int64_fits_u31 n =
Int64.(bit_and 0xFFFFFFFF80000000L n = zero)
let int64_fits_s32 n =
if int64_is_neg n then
int64_fits_u31 (Int64.bit_not n)
else
int64_fits_u31 n
let result_list_init n ~f =
let rec aux i accu =
if i < 0 then Ok accu
else (
match f i with
| Ok y -> aux (i - 1) (y :: accu)
| Error _ as e -> e
)
in
aux (n - 1) []
open Header
type alignment = Sam.alignment
module Alignment0 = struct
type t = {
ref_id : int ;
pos : int ;
bin_mq_nl : int32 ;
flag_nc : int32 ;
next_ref_id : int ;
pnext : int ;
tlen : int ;
read_name : string ;
cigar : string ;
seq : string ;
qual : string ;
optional : string ;
} [@@deriving sexp]
let option ~none x =
if Poly.(x = none) then None
else Some x
let ref_id al = option ~none:(- 1) al.ref_id
let qname al = option ~none:"*" al.read_name
let flags al =
Int32.shift_right al.flag_nc 16
|> Int32.to_int_exn
|> Sam.Flags.of_int
let rname al =
try Ok (Option.map (ref_id al) ~f:(fun id -> (Array.get header.ref_seq id).Sam.name))
with _ -> error_string "Bam.Alignment0.rname: unknown ref_id"
let pos al = option ~none:(- 1) al.pos
let mapq al = option ~none:255 (get_16_8 al.bin_mq_nl)
let cigar_op_of_s32 x : Sam.cigar_op Or_error.t =
let open Sam in
let op_len = get_32_4 x in
match get_4_0 x with
| 0 -> cigar_op_alignment_match op_len
| 1 -> cigar_op_insertion op_len
| 2 -> cigar_op_deletion op_len
| 3 -> cigar_op_skipped op_len
| 4 -> cigar_op_soft_clipping op_len
| 5 -> cigar_op_hard_clipping op_len
| 6 -> cigar_op_padding op_len
| 7 -> cigar_op_seq_match op_len
| 8 -> cigar_op_seq_mismatch op_len
| _ -> assert false
let cigar al =
result_list_init (String.length al.cigar / 4) ~f:(fun i ->
let s32 = BP.unpack_signed_32_little_endian ~buf:al.cigar ~pos:(i * 4) in
cigar_op_of_s32 s32
)
let rnext al =
match al.next_ref_id with
| -1 -> return None
| i -> Sam.parse_rnext header.ref_seq.(i).Sam.name
let pnext al = option ~none:(- 1) al.pnext
let tlen al = option ~none:0 al.tlen
let l_seq al = String.length al.qual
let char_of_seq_code = function
| 0 -> '='
| 1 -> 'A'
| 2 -> 'C'
| 3 -> 'M'
| 4 -> 'G'
| 5 -> 'R'
| 6 -> 'S'
| 7 -> 'V'
| 8 -> 'T'
| 9 -> 'W'
| 10 -> 'Y'
| 11 -> 'H'
| 12 -> 'K'
| 13 -> 'D'
| 14 -> 'B'
| 15 -> 'N'
| l -> failwithf "letter not in [0, 15]: %d" l ()
let seq al =
let n = String.length al.seq in
if n = 0 then None
else
let l_seq = l_seq al in
let r = Bytes.make l_seq ' ' in
for i = 0 to n - 1 do
let c = int_of_char al.seq.[i] in
Bytes.set r (2 * i) (char_of_seq_code (c lsr 4)) ;
if 2 * i + 1 < l_seq then Bytes.set r (2 * i + 1) (char_of_seq_code (c land 0xf)) ;
done ;
Some (Bytes.unsafe_to_string ~no_mutation_while_string_reachable:r)
let qual al =
let shift = String.map ~f:Char.(fun c -> of_int_exn (to_int c + 33)) in
match shift al.qual with
| qual33 -> Sam.parse_qual qual33
| exception Failure _ ->
Or_error.error
"Bam.Alignement0.qual: incorrect quality score"
al.qual
sexp_of_string
(** Extracts string in buf starting from pos and finishing with a
NULL character, and returns the position just after it. Returns
an error if no NULL character is encountered before the end of
the string. *)
let parse_cstring buf pos =
let rec aux i =
if i < String.length buf then
if Char.(String.get buf i = '\000') then return i
else aux (i + 1)
else error_string "Unfinished NULL terminated string"
in
aux pos >>= fun pos' ->
return (String.sub buf ~pos ~len:(pos' - pos), pos' + 1)
let cCsSiIf_size = function
| 'c'
| 'C' -> return 1
| 's' | 'S' -> return 2
| 'i' | 'I'
| 'f' -> return 4
| _ -> error_string "Incorrect numeric optional field type identifier"
let parse_cCsSiIf buf pos typ =
cCsSiIf_size typ >>= fun len ->
check_buf ~buf ~pos ~len >>= fun () ->
match typ with
| 'c' ->
let i = Int64.of_int_exn (BP.unpack_signed_8 ~buf ~pos) in
return (Sam.optional_field_value_i i, len)
| 'C' ->
let i = Int64.of_int_exn (BP.unpack_unsigned_8 ~buf ~pos) in
return (Sam.optional_field_value_i i, len)
| 's' ->
let i = Int64.of_int_exn (BP.unpack_signed_16_little_endian ~buf ~pos) in
return (Sam.optional_field_value_i i, len)
| 'S' ->
let i = Int64.of_int_exn (BP.unpack_unsigned_16_little_endian ~buf ~pos) in
return (Sam.optional_field_value_i i, len)
| 'i' ->
let i = BP.unpack_signed_32_little_endian ~buf ~pos in
return (Sam.optional_field_value_i (Int64.of_int32 i), len)
| 'I' ->
let i = BP.unpack_unsigned_32_little_endian ~buf ~pos in
return (Sam.optional_field_value_i i, len)
| 'f' ->
let f = BP.unpack_float_little_endian ~buf ~pos in
return (Sam.optional_field_value_f f, len)
| _ -> error_string "Incorrect numeric optional field type identifier"
let parse_optional_field_value buf pos = function
| 'A' ->
check_buf ~buf ~pos ~len:1 >>= fun () ->
Sam.optional_field_value_A (String.get buf pos) >>= fun v ->
return (v, 1)
| 'c' | 'C' | 's' | 'S' | 'i' | 'I' | 'f' as typ ->
parse_cCsSiIf buf pos typ
| 'Z' ->
parse_cstring buf pos >>= fun (s, pos') ->
Sam.optional_field_value_Z s >>= fun value ->
return (value, pos' - pos)
| 'H' ->
parse_cstring buf pos >>= fun (s, pos') ->
Sam.optional_field_value_H s >>= fun value ->
return (value, pos' - pos)
| 'B' ->
check_buf ~buf ~pos ~len:5 >>= fun () ->
let typ = String.get buf 0 in
let n = BP.unpack_signed_32_little_endian ~buf ~pos:(pos + 1) in
(
match Int32.to_int n with
| Some n ->
cCsSiIf_size typ >>= fun elt_size ->
check_buf ~buf ~pos:(pos + 5) ~len:(n * elt_size) >>= fun () ->
let elts = List.init n ~f:(fun i -> String.sub buf ~pos:(pos + 5 + i * elt_size) ~len:elt_size) in
let bytes_read = 5 + elt_size * n in
Sam.optional_field_value_B typ elts >>= fun value ->
return (value, bytes_read)
| None ->
error_string "Too many elements in B-type optional field"
)
| c -> error "Incorrect optional field type identifier" c [%sexp_of: char]
let parse_optional_field buf pos =
check_buf ~buf ~pos ~len:3 >>= fun () ->
let tag = String.sub buf ~pos ~len:2 in
let field_type = buf.[pos + 2] in
parse_optional_field_value buf (pos + 3) field_type >>= fun (field_value, shift) ->
Sam.optional_field tag field_value >>= fun field ->
return (field, shift + 3)
let parse_optional_fields buf =
let rec loop buf pos accu =
if pos = String.length buf then return (List.rev accu)
else
parse_optional_field buf pos >>= fun (field, used_chars) ->
loop buf (pos + used_chars) (field :: accu)
in
loop buf 0 []
let optional_fields al =
tag (parse_optional_fields al.optional) ~tag:"Bam.Alignment0.optional_fields"
let decode al =
flags al >>= fun flags ->
rname al header >>= fun rname ->
cigar al >>= fun cigar ->
rnext al header >>= fun rnext ->
qual al >>= fun qual ->
optional_fields al >>= fun optional_fields ->
Sam.alignment
?qname:(qname al) ~flags ?rname ?pos:(pos al) ?mapq:(mapq al)
~cigar ?rnext ?pnext:(pnext al) ?tlen:(tlen al)
?seq:(seq al) ~qual ~optional_fields
()
let find_ref_id ref_name =
let open Or_error in
match Array.findi header.ref_seq ~f:(fun _ rs -> String.(rs.Sam.name = ref_name)) with
| Some (i, _) -> Ok i
| None -> error_string "Bam: unknown reference id"
let string_of_cigar_ops cigar_ops =
let buf = Bytes.create (List.length cigar_ops * 4) in
let write ith i32 =
let pos = ith * 4 in
Binary_packing.pack_signed_32 ~byte_order:`Little_endian ~buf ~pos i32 in
let open Int32 in
List.iteri cigar_ops ~f:(fun idx op ->
let _, i = match op with
| `Alignment_match i -> 0l, i
| `Insertion i -> 1l, i
| `Deletion i -> 2l, i
| `Skipped i -> 3l, i
| `Soft_clipping i -> 4l, i
| `Hard_clipping i -> 5l, i
| `Padding i -> 6l, i
| `Seq_match i -> 7l, i
| `Seq_mismatch i -> 8l, i
in
write idx (bit_or 0l (of_int_exn Stdlib.(i lsl 4)))
) ;
Bytes.unsafe_to_string ~no_mutation_while_string_reachable:buf
let sizeof al =
let l_seq = l_seq al in
8 * 4
+ String.length al.read_name + 1
+ String.length al.cigar
+ (l_seq + 1) / 2
+ l_seq
+ String.length al.optional
let reg2bin st ed =
match st, ed with
| b, e when b lsr 14 = e lsr 14 ->
((1 lsl 15) - 1) / 7 + (st lsr 14)
| b, e when b lsr 17 = e lsr 17 ->
((1 lsl 12) - 1) / 7 + (st lsr 17)
| b, e when b lsr 20 = e lsr 20 ->
((1 lsl 9) - 1) / 7 + (st lsr 20)
| b, e when b lsr 23 = e lsr 23 ->
((1 lsl 6) - 1) / 7 + (st lsr 23)
| b, e when b lsr 26 = e lsr 26 ->
((1 lsl 3) - 1) / 7 + (st lsr 26)
| _ -> 0
let string_of_optional_fields opt_fields =
let field_value_encoding = function
| `B (typ, xs) ->
Ok ('B',
sprintf "%c%s" typ (String.concat ~sep:"" xs))
| `A c ->
Ok ('A', Char.to_string c)
| `f f ->
let bits = Int32.bits_of_float f in
Ok ('f', BP.pack_signed_32_little_endian bits)
| `i i ->
if int64_fits_u32 i then (
Ok ('I', BP.pack_u32_in_int64_little_endian i)
)
else if int64_fits_s32 i then (
Ok ('i', BP.pack_signed_32_little_endian (Int32.of_int64_exn i))
)
else error "Sam integer cannot be encoded in BAM format" i Int64.sexp_of_t
| `H s -> (
let r = ref [] in
String.iter s ~f:(fun c ->
r := sprintf "%02x" (Char.to_int c) :: !r
) ;
Ok ('H',
String.concat ~sep:"" (List.rev !r) ^ "\000")
)
| `Z s ->
Ok ('Z', s ^ "\000")
in
let open Or_error.Monad_infix in
List.map opt_fields ~f:(fun opt_field ->
field_value_encoding opt_field.Sam.value >>= fun (c, s) ->
Ok (sprintf "%s%c%s" opt_field.Sam.tag c s)
)
|> Or_error.all
>>| String.concat ~sep:""
let int32 i ~ub var =
if i < ub then
match Int32.of_int i with
| Some i -> return i
| None -> error_string "invalid conversion to int32"
else
errorf "invalid conversion to int32 (%s than %d)" var ub
let encode_bin_mq_nl ~bin ~mapq ~l_read_name =
let open Int32 in
int32 bin ~ub:65536 "bin" >>= fun bin ->
int32 mapq ~ub:256 "mapq" >>= fun mapq ->
int32 l_read_name ~ub:256 "l_read_name" >>= fun l_read_name ->
return (
bit_or
(shift_left bin 16)
(bit_or (shift_left mapq 8) l_read_name)
)
let encode_flag_nc ~flags ~n_cigar_ops =
let open Int32 in
int32 flags ~ub:65536 "flags" >>= fun flags ->
int32 n_cigar_ops ~ub:65536 "n_cigar_ops" >>= fun n_cigar_ops ->
return (bit_or (shift_left flags 16) n_cigar_ops)
let encode al =
begin match al.Sam.rname with
| Some rname -> find_ref_id header rname
| None -> Ok (-1)
end
>>= fun ref_id ->
let read_name = Option.value ~default:"*" al.Sam.qname in
let seq = Option.value ~default:"*" al.Sam.seq in
let pos = (Option.value ~default:0 al.Sam.pos) - 1 in
let bin = reg2bin pos (pos + String.(length seq)) in
let mapq = Option.value ~default:255 al.Sam.mapq in
let l_read_name = String.length read_name + 1 in
encode_bin_mq_nl ~bin ~mapq ~l_read_name
>>= fun bin_mq_nl ->
let flags = (al.Sam.flags :> int) in
let n_cigar_ops = List.length al.Sam.cigar in
encode_flag_nc ~flags ~n_cigar_ops
>>= fun flag_nc ->
begin match al.Sam.rnext with
| Some `Equal_to_RNAME -> Ok ref_id
| Some (`Value s) -> find_ref_id header s
| None -> Ok (-1)
end
>>= fun next_ref_id ->
let pnext = Option.value ~default:0 al.Sam.pnext - 1 in
let tlen = Option.value ~default:0 al.Sam.tlen in
let cigar = string_of_cigar_ops al.Sam.cigar in
Result.List.map al.Sam.qual ~f:(Phred_score.to_char ~offset:`Offset33)
>>| String.of_char_list
>>= fun qual ->
string_of_optional_fields al.Sam.optional_fields >>= fun optional ->
Ok {
ref_id; pos; bin_mq_nl; flag_nc; cigar;
next_ref_id; pnext; tlen; seq; qual; optional;
read_name
}
end
let magic_string = "BAM\001"
let input_s32_as_int iz =
Int32.to_int_exn (Bgzf.input_s32 iz)
let iz =
try
let magic = Bgzf.input_string iz 4 in
check String.(magic = magic_string) "Incorrect magic string, not a BAM file" >>= fun () ->
let l_text = input_s32_as_int iz in
check (l_text >= 0) "Incorrect size of plain text in BAM header" >>= fun () ->
let text = Bgzf.input_string iz l_text in
Sam.parse_header text
with End_of_file -> error_string "EOF while reading BAM header"
let read_one_reference_information iz =
try
let l_name = input_s32_as_int iz in
check (l_name > 0) "Incorrect encoding of reference sequence name in BAM header" >>= fun () ->
let name = Bgzf.input_string iz (l_name - 1) in
let (_ : char) = Bgzf.input_char iz in
let length = input_s32_as_int iz in
Sam.ref_seq ~name ~length ()
with End_of_file -> error_string "EOF while reading BAM reference information"
let read_reference_information iz =
let rec loop accu n =
if n = 0 then Ok (List.rev accu)
else
match read_one_reference_information iz with
| Ok refseq -> loop (refseq :: accu) (n - 1)
| Error _ as e -> e
in
try
let n_ref = input_s32_as_int iz in
loop [] n_ref
>>| Array.of_list
with End_of_file -> error_string "EOF while reading BAM reference information"
let read_alignment_aux iz block_size =
try
let ref_id = input_s32_as_int iz in
begin match Bgzf.input_s32 iz |> Int32.to_int with
| Some 2147483647
| None -> error_string "A read has a position greater than 2^31"
| Some n ->
if n < -1 then errorf "A read has a negative position %d" n
else return n
end
>>= fun pos ->
let bin_mq_nl = Bgzf.input_s32 iz in
let l_read_name = get_8_0 bin_mq_nl in
checkf (l_read_name > 0) "Alignment with l_read_name = %d" l_read_name >>= fun () ->
let flag_nc = Bgzf.input_s32 iz in
let n_cigar_ops = get_16_0 flag_nc in
let l_seq = input_s32_as_int iz in
check (l_seq >= 0) "Incorrect sequence length in alignment" >>= fun () ->
let next_ref_id = input_s32_as_int iz in
begin match Bgzf.input_s32 iz |> Int32.to_int with
| Some 2147483647
| None -> error_string "A read has a position > than 2^31"
| Some n ->
if n < -1 then errorf "A read has a negative next position %d" n
else return n
end
>>= fun pnext ->
let tlen = input_s32_as_int iz in
let read_name =
let r = Bgzf.input_string iz (l_read_name - 1) in
let (_ : char) = Bgzf.input_char iz in
r
in
let cigar = Bgzf.input_string iz (n_cigar_ops * 4) in
let seq = Bgzf.input_string iz ((l_seq + 1) / 2) in
let qual = Bgzf.input_string iz l_seq in
let remaining = block_size - 32 - l_read_name - 4 * n_cigar_ops - ((l_seq + 1) / 2) - l_seq in
let optional = Bgzf.input_string iz remaining in
return {
Alignment0.ref_id ; read_name ; flag_nc ; pos ; bin_mq_nl ; cigar ;
next_ref_id ; pnext ; tlen ; seq ; qual ; optional
}
with End_of_file -> error_string "EOF while reading alignment"
let read_alignment iz =
try
let block_size = input_s32_as_int iz in
Some (read_alignment_aux iz block_size)
with End_of_file -> None
let read_alignment_stream iz =
Stream.from (fun _ -> read_alignment iz)
let iz =
read_sam_header iz >>= fun ->
read_reference_information iz >>= fun ref_seq ->
Ok { sam_header ; ref_seq }
let read0 ic =
let iz = Bgzf.of_in_channel ic in
read_header iz >>= fun ->
return (header, read_alignment_stream iz)
let with_file0 fn ~f =
In_channel.with_file ~binary:true fn ~f:(fun ic ->
read0 ic >>= fun (, alignments) ->
f header alignments
)
let h oz =
let open Sam in
let buf = Buffer.create 1024 in
let add_line x =
Buffer.add_string buf x ;
Buffer.add_char buf '\n'
in
Option.iter h.version ~f:(fun version ->
let hl = header_line ~version ?sort_order:h.sort_order () |> ok_exn in
add_line (print_header_line hl)
) ;
List.iter h.ref_seqs ~f:(fun x ->
add_line (print_ref_seq x)
) ;
List.iter h.read_groups ~f:(fun x ->
add_line (print_read_group x)
) ;
List.iter h.programs ~f:(fun x ->
add_line (print_program x)
) ;
List.iter h.comments ~f:(fun x ->
Buffer.add_string buf "@CO\t" ;
add_line x
) ;
List.iter h.others ~f:(fun x ->
add_line (print_other x)
) ;
Bgzf.output_s32 oz (Int32.of_int_exn (Buffer.length buf)) ;
Bgzf.output_string oz (Buffer.contents buf)
let output_null_terminated_string oz s =
Bgzf.output_string oz s ;
Bgzf.output_char oz '\000'
let write_reference_sequences h oz =
let open Sam in
Bgzf.output_s32 oz (Int32.of_int_exn (Array.length h.ref_seq)) ;
Array.iter h.ref_seq ~f:(fun rs ->
Bgzf.output_s32 oz (Int32.of_int_exn (String.length rs.name + 1)) ;
output_null_terminated_string oz rs.name ;
Bgzf.output_s32 oz (Int32.of_int_exn rs.length) ;
)
let oz =
Bgzf.output_string oz magic_string ;
write_plain_SAM_header header.sam_header oz ;
write_reference_sequences header oz
let write_alignment oz al =
let open Alignment0 in
Bgzf.output_s32 oz (Int32.of_int_exn (sizeof al)) ;
Bgzf.output_s32 oz (Int32.of_int_exn al.ref_id) ;
Bgzf.output_s32 oz (Int32.of_int_exn al.pos) ;
Bgzf.output_s32 oz al.bin_mq_nl ;
Bgzf.output_s32 oz al.flag_nc ;
Bgzf.output_s32 oz (Int32.of_int_exn (l_seq al)) ;
Bgzf.output_s32 oz (Int32.of_int_exn al.next_ref_id) ;
Bgzf.output_s32 oz (Int32.of_int_exn al.pnext) ;
Bgzf.output_s32 oz (Int32.of_int_exn al.tlen) ;
output_null_terminated_string oz al.read_name ;
Bgzf.output_string oz al.cigar ;
Bgzf.output_string oz al.seq ;
Bgzf.output_string oz al.qual ;
Bgzf.output_string oz al.optional
let write0 alignments oc =
let oz = Bgzf.of_out_channel oc in
write_header header oz ;
Stream.iter alignments ~f:(write_alignment oz) ;
Bgzf.dispose_out oz
let bind f x = Or_error.bind x ~f
let read ic =
read0 ic >>= fun (, xs) ->
Ok (header, Stream.map xs ~f:(bind (fun r -> Alignment0.decode r header)))
let with_file fn ~f = with_file0 fn ~f:(fun xs ->
f header (Stream.map xs ~f:(bind (fun r -> Alignment0.decode r header)))
)
let write h xs oc =
let module M = struct exception E of Error.t end in
let xs = Stream.map xs ~f:(fun al ->
match Alignment0.encode al h with
| Ok r -> r
| Error e -> raise (M.E e)
)
in
try write0 h xs oc ; Ok ()
with M.E e -> Error e