package biocaml

  1. Overview
  2. Docs
Legend:
Page
Library
Module
Module type
Parameter
Class
Class type
Source

Source file bgzf.ml

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
(***********************************************************************)
(*                                                                     *)
(*                         The CamlZip library                         *)
(*                                                                     *)
(*            Xavier Leroy, projet Cristal, INRIA Rocquencourt         *)
(*                                                                     *)
(*  Copyright 2001 Institut National de Recherche en Informatique et   *)
(*  en Automatique.  All rights reserved.  This file is distributed    *)
(*  under the terms of the GNU Library General Public License, with    *)
(*  the special exception on linking described in file LICENSE.        *)
(*                                                                     *)
(***********************************************************************)

(* Various parts of this module, as well as its global structure are
   adapted from the gzip module in the camlzip library. *)

let max_block_size = 0x10000
let max_isize = 0xff00

(* Justification for the above constants *)
(* let compressBound = *)
(*   let open Ctypes in *)
(*   let open Foreign in *)
(*   foreign "compressBound" (int @-> returning int) *)

(* # compressBound 0xff00;; *)
(* - : int = 65311 *)
(* The size of the header is 16, footer is 8, so data + header + footer < 0x10000 *)

exception Error of string

type in_channel = {
  ic : Stdlib.in_channel ; (* Underlying channel *)
  in_bufz : bytes ; (* Compressed block *)
  in_buf : bytes ;   (* Uncompressed block *)
  mutable in_block_offset : Int64.t ; (* Offset of the current block *)
  mutable in_pos : int ; (* Position in the current block *)
  mutable in_avail : int ; (* Number of available characters in the current block, can be less than [max_block_size] *)
  mutable in_eof : bool ; (* Flag indicating we reached the end of the file *)
  mutable in_stream : Zlib.stream;
}

let of_in_channel ic = {
  ic ;
  in_bufz = Bytes.make max_block_size '\000' ;
  in_buf = Bytes.make max_block_size '\000' ;
  in_block_offset = Int64.zero ;
  in_pos = 0 ;
  in_avail = 0 ;
  in_stream = Zlib.inflate_init false ;
  in_eof = false
}

let open_in fn = of_in_channel (Stdlib.open_in_bin fn)

let dispose_in iz =
  iz.in_eof <- true ;
  Zlib.inflate_end iz.in_stream

let close_in iz =
  dispose_in iz ;
  In_channel.close iz.ic

let input_byte t = Caml.input_byte t

let input_u16 ic =
  let b1 = input_byte ic in
  let b2 = input_byte ic in
  b1 + b2 lsl 8

let input_s32 ic =
  let b1 = input_byte ic in
  let b2 = input_byte ic in
  let b3 = input_byte ic in
  let b4 = input_byte ic in
  let open Int32 in
  bit_or (of_int_exn b1)
    (bit_or (shift_left (of_int_exn b2) 8)
       (bit_or (shift_left (of_int_exn b3) 16)
          (shift_left (of_int_exn b4) 24)))

(* Raises End_of_file iff there is no more block to read *)
let read_header iz =
  match In_channel.input_byte iz.ic with
  | None -> iz.in_eof <- true ; raise End_of_file
  | Some id1 ->
    try
      let id2 = input_byte iz.ic in
      if id1 <> 0x1F || id2 <> 0x8B then raise (Error "bad magic number, not a bgzf file") ;
      let cm = input_byte iz.ic in
      if cm <> 8 then raise(Error "unknown compression method") ;
      let flags = input_byte iz.ic in
      if flags <> 0x04 then raise(Error("bad flags, not a bgzf file"));
      for _ = 1 to 6 do ignore (input_byte iz.ic : int) done;
      let xlen = input_u16 iz.ic in
      let si1 = input_byte iz.ic in
      let si2 = input_byte iz.ic in
      let slen = input_u16 iz.ic in
      if si1 <> 66 || si2 <> 67 || slen <> 2 then raise (Error "bad extra subfield") ;
      let bsize = input_u16 iz.ic in
      for _ = 1 to xlen - 6 do ignore (input_byte iz.ic : int) done ;
      bsize - xlen - 19
    with End_of_file -> raise (Error "premature end of file, not a bgzf file")

let read_block iz =
  let rec loop posz lenz pos len crc size =
    let (finished, used_in, used_out) =
      try
        Zlib.inflate iz.in_stream iz.in_bufz posz lenz iz.in_buf pos len Zlib.Z_SYNC_FLUSH
      with Zlib.Error (_, _) ->
        raise (Error "error during decompression") in
    let posz = posz + used_in in
    let lenz = lenz - used_in in
    let crc = Zlib.update_crc crc iz.in_buf pos used_out in
    let size = size + used_out in
    if finished then crc, size
    else loop posz lenz (pos + used_out) (len - used_out) crc size
  in
  try
    iz.in_block_offset <- In_channel.pos iz.ic ;
    let cdata_size = read_header iz in (* read_header raises End_of_file iff there is no more block to read *)
    try
      Stdlib.really_input iz.ic iz.in_bufz 0 cdata_size ;
      let ref_crc = input_s32 iz.ic in
      let ref_size = input_s32 iz.ic |> Int32.to_int_exn in
      Zlib.inflate_end iz.in_stream ;
      iz.in_stream <- Zlib.inflate_init false ;
      let crc, size = loop 0 cdata_size 0 max_block_size Int32.zero 0 in
      if Int32.(crc <> ref_crc) then raise (Error "CRC mismatch, data corrupted") ;
      if size <> ref_size then raise (Error "size mismatch, data corrupted") ;
      iz.in_pos <- 0 ;
      iz.in_avail <- size
    with End_of_file -> raise (Error "premature end of file, not a bgzf file")
  with End_of_file -> iz.in_eof <- true


let input iz buf pos len =
  let n = Bytes.length buf in
  if pos < 0 || len < 0 || pos + len > n then raise (Invalid_argument "Bgzf.input") ;
  if iz.in_eof then 0
  else (
    let rec loop pos len read =
      if len = 0 then read
      else (
        if iz.in_pos = iz.in_avail then read_block iz ;
        if iz.in_eof then read
        else (
          let n = min (iz.in_avail - iz.in_pos) len in
          Stdlib.Bytes.blit iz.in_buf iz.in_pos buf pos n ;
          iz.in_pos <- iz.in_pos + n ;
          loop (pos + n) (len - n) (read + n)
        )
      )
    in
    loop pos len 0
  )

let rec really_input iz buf pos len =
  if len <= 0 then ()
  else (
    let n = input iz buf pos len in
    if n = 0 then raise End_of_file
    else really_input iz buf (pos + n) (len - n)
  )

let input_string iz n =
  if n < 0 then raise (Invalid_argument "Bgzf.input_string iz n: n should be non negative") ;
  let r = Bytes.make n '@' in
  really_input iz r 0 n ;
  Bytes.unsafe_to_string ~no_mutation_while_string_reachable:r

let input_char =
  let buf = Bytes.create 1 in
  fun iz ->
    if input iz buf 0 1 = 0 then raise End_of_file
    else Bytes.get buf 0

let input_u8 iz =
  Char.to_int (input_char iz)

(* input_s* functions adapted from Batteries BatIO module *)
let input_s8 iz =
  let b = input_u8 iz in
  if b land 128 <> 0 then b - 256
  else b

let input_u16 iz =
  let b1 = input_u8 iz in
  let b2 = input_u8 iz in
  b1 lor (b2 lsl 8)

let input_s16 iz =
  let i = input_u16 iz in
  if i land 32768 <> 0 then i - 65536
  else i

let input_s32 iz =
  let b1 = input_u8 iz in
  let b2 = input_u8 iz in
  let b3 = input_u8 iz in
  let b4 = input_u8 iz in
  Int32.bit_or (Int32.of_int_exn b1)
    (Int32.bit_or (Int32.shift_left (Int32.of_int_exn b2) 8)
      (Int32.bit_or (Int32.shift_left (Int32.of_int_exn b3) 16)
                   (Int32.shift_left (Int32.of_int_exn b4) 24)))

let seek_in iz i =
  let coffset = Int64.shift_right i 16 in
  let uoffset = Int64.(to_int_exn (bit_and 0xFFFFL i)) in
  In_channel.seek iz.ic coffset ;
  iz.in_block_offset <- coffset ;
  iz.in_eof <- false ;
  if uoffset = 0 then (
    iz.in_pos <- 0 ;
    iz.in_avail <- 0
  )
  else (
    read_block iz ;
    iz.in_pos <- iz.in_pos + uoffset
  )

let virtual_offset iz =
  if iz.in_pos = iz.in_avail then
    Int64.(shift_left (In_channel.pos iz.ic) 16)
  else
    Int64.(shift_left iz.in_block_offset 16 + of_int_exn iz.in_pos)

let with_file_in fn ~f =
  let iz = open_in fn in
  let r =
    try `Ok (f iz)
    with e -> `Error e
  in
  close_in iz ;
  match r with
  | `Ok y -> y
  | `Error exn -> raise exn

exception Unparser_error of string

type out_channel = {
  out_chan : Stdlib.out_channel ;
  out_ubuffer : bytes ;
  out_cbuffer : bytes ;
  mutable out_pos : int ; (* position in out_ubuffer *)
  out_level : int ;
}

let output_int16 oc n =
  Out_channel.output_byte oc n ;
  Out_channel.output_byte oc (n lsr 8)

let output_int32 oc n =
  let r = ref n in
  for _ = 1 to 4 do
    Out_channel.output_byte oc (Int32.to_int_exn !r);
    r := Int32.shift_right_logical !r 8
  done

let write_block oc buf len ~isize ~crc32 =
  let xlen = 6 in
  let bsize = 20 + xlen + len in
  assert (bsize < 0x10000) ;
  Out_channel.output_byte oc 0x1F;                  (* ID1 *)
  Out_channel.output_byte oc 0x8B;                  (* ID2 *)
  Out_channel.output_byte oc 8;                     (* compression method *)
  Out_channel.output_byte oc 4;                     (* flags *)
  for _ = 1 to 4 do
    Out_channel.output_byte oc 0                    (* mtime *)
  done ;
  Out_channel.output_byte oc 0;                     (* xflags *)
  Out_channel.output_byte oc 0xFF;                  (* OS (unknown) *)
  output_int16 oc xlen ;                (* XLEN *)
  Out_channel.output_byte oc 0x42 ;                 (* SI1 *)
  Out_channel.output_byte oc 0x43 ;                 (* SI2 *)
  output_int16 oc 2 ;                   (* SLEN *)
  output_int16 oc (bsize - 1);          (* BSIZE - 1*)
  Caml.output oc buf 0 len ;                 (* DATA *)
  output_int32 oc crc32 ;               (* CRC32 *)
  output_int32 oc isize                 (* ISIZE *)

let of_out_channel ?(level = 6) oc =
  if level < 1 || level > 9 then raise (invalid_arg "Bgzf: bad compression level") ;
  {
    out_chan = oc;
    out_ubuffer = Bytes.create max_isize ;
    out_cbuffer = Bytes.create max_block_size ;
    out_pos = 0 ;
    out_level = level ;
  }

let open_out ?(level = 6) filename =
  of_out_channel ~level (Stdlib.open_out_bin filename)


let push_block oz =
  let stream = Zlib.deflate_init oz.out_level false in
  let (_, used_in, used_out) =
    try
      Zlib.deflate
        stream
        oz.out_ubuffer 0 oz.out_pos
        oz.out_cbuffer 0 (Bytes.length oz.out_cbuffer)
        Zlib.Z_FINISH
    with Zlib.Error(_, _) -> raise (Unparser_error("error during compression"))
  in
  assert (used_in = oz.out_pos) ;
  let crc32 = Zlib.update_crc Int32.zero oz.out_ubuffer 0 used_in in
  Zlib.deflate_end stream ;
  write_block oz.out_chan oz.out_cbuffer used_out
    ~isize:(Int32.of_int_exn used_in) ~crc32 ;
  oz.out_pos <- 0

let rec output ~length ~blit oz buf ~pos ~len =
  if pos < 0 || len < 0 || pos + len > length buf then invalid_arg "Bgzf.output";
  (* If output buffer is full, flush it *)
  if oz.out_pos = Bytes.length oz.out_ubuffer then push_block oz ;
  let available = Bytes.length oz.out_ubuffer - oz.out_pos in
  let ncopy = min len available in
  blit buf pos oz.out_ubuffer oz.out_pos ncopy ;
  oz.out_pos <- oz.out_pos + ncopy ;
  let remaining = len - ncopy in
  if remaining > 0 then output ~length ~blit oz buf ~pos:(pos + ncopy) ~len:remaining

let output_from_string = output ~length:String.length ~blit:Caml.Bytes.blit_string
let output = output ~length:Bytes.length ~blit:Caml.Bytes.blit

let output_char =
  let buf = Bytes.make 1 ' ' in
  fun oz c ->
    Bytes.set buf 0 c ;
    output oz buf ~pos:0 ~len:1

(* output_* functions adapted from Batteries BatIO module *)
let output_u8 oz n =
  (* if n < 0 || n > 0xFF then raise (Invalid_argument "Bgzf.output_u8") ; *)
  output_char oz (Char.unsafe_of_int (n land 0xFF))

let output_s8 oz n =
  if n < -0x80 || n > 0x7F then raise (Invalid_argument "Bgzf.output_s8") ;
  if n < 0 then
    output_u8 oz (n + 256)
  else
    output_u8 oz n

let output_u16 oz n =
  output_u8 oz n ;
  output_u8 oz (n lsr 8)

let output_s16 oz n =
  if n < -0x8000 || n > 0x7FFF then raise (Invalid_argument "Bgzf.output_s16") ;
  if n < 0 then
    output_u16 oz (65536 + n)
  else
    output_u16 oz n

let output_s32 oz n =
  let base = Int32.to_int_exn n in
  let big = Int32.to_int_exn (Int32.shift_right_logical n 24) in
  output_u8 oz base ;
  output_u8 oz (base lsr 8) ;
  output_u8 oz (base lsr 16) ;
  output_u8 oz big

let output_string oz s =
  output_from_string oz s ~pos:0 ~len:(String.length s)

let bgzf_eof = "\x1f\x8b\x08\x04\x00\x00\x00\x00\x00\xff\x06\x00BC\x02\x00\x1b\x00\x03\x00\x00\x00\x00\x00\x00\x00\x00\x00"

let dispose_out oz =
  if oz.out_pos > 0 then push_block oz ;
  Stdlib.output_string oz.out_chan bgzf_eof

let close_out oz =
  dispose_out oz ;
  Stdlib.close_out oz.out_chan

let with_file_out ?level fn ~f =
  let oz = open_out ?level fn in
  let r =
    try `Ok (f oz)
    with e -> `Error e
  in
  close_out oz ;
  match r with
  | `Ok y -> y
  | `Error exn -> raise exn
OCaml

Innovation. Community. Security.