package docteur

  1. Overview
  2. Docs
Legend:
Page
Library
Module
Module type
Parameter
Class
Class type
Source

Source file analyze.ml

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
let src = Logs.Src.create "docteur.analyze"

module Log = (val Logs.src_log src : Logs.LOG)

module SHA1 = struct
  include Digestif.SHA1

  let feed ctx ?off ?len bs = feed_bigstring ctx ?off ?len bs
  let null = digest_string ""
  let length = digest_size
  let compare a b = String.compare (to_raw_string a) (to_raw_string b)
end

module Lwt_scheduler = struct
  module Mutex = struct
    type 'a fiber = 'a Lwt.t
    type t = Lwt_mutex.t

    let create () = Lwt_mutex.create ()
    let lock t = Lwt_mutex.lock t
    let unlock t = Lwt_mutex.unlock t
  end

  module Condition = struct
    type 'a fiber = 'a Lwt.t
    type mutex = Mutex.t
    type t = unit Lwt_condition.t

    let create () = Lwt_condition.create ()
    let wait t mutex = Lwt_condition.wait ~mutex t
    let signal t = Lwt_condition.signal t ()
    let broadcast t = Lwt_condition.broadcast t ()
  end

  type 'a t = 'a Lwt.t

  let bind x f = Lwt.bind x f
  let return x = Lwt.return x
  let parallel_map ~f lst = Lwt_list.map_p f lst
  let parallel_iter ~f lst = Lwt_list.iter_p f lst

  let detach f =
    let th, wk = Lwt.wait () in
    Lwt.async (fun () ->
        let res = f () in
        Lwt.wakeup_later wk res ;
        Lwt.return_unit) ;
    th
end

let failwith fmt = Fmt.kstr failwith fmt

module Scheduler = Carton.Make (Lwt)
module Verify = Carton.Dec.Verify (SHA1) (Scheduler) (Lwt_scheduler)
module First_pass = Carton.Dec.Fp (SHA1)
open Scheduler

let scheduler =
  let open Lwt in
  let open Scheduler in
  {
    Carton.bind = (fun x f -> inj (bind (prj x) (fun x -> prj (f x))));
    return = (fun x -> inj (return x));
  }

let replace tbl k v =
  match Hashtbl.find_opt tbl k with
  | Some v' -> if v' < v then Hashtbl.replace tbl k v
  | _ -> Hashtbl.add tbl k v

let digest ~kind ?(off = 0) ?len buf =
  let len =
    match len with Some len -> len | None -> Bigstringaf.length buf - off in
  let ctx = SHA1.empty in
  let ctx =
    match kind with
    | `A -> SHA1.feed_string ctx (Fmt.str "commit %d\000" len)
    | `B -> SHA1.feed_string ctx (Fmt.str "tree %d\000" len)
    | `C -> SHA1.feed_string ctx (Fmt.str "blob %d\000" len)
    | `D -> SHA1.feed_string ctx (Fmt.str "tag %d\000" len) in
  let ctx = SHA1.feed_bigstring ctx ~off ~len buf in
  SHA1.get ctx

let first_pass ~read ~block_size ~get_block fd =
  Log.debug (fun m -> m "Start to analyze the given PACK file.") ;
  let ( >>= ) = scheduler.bind in
  let return = scheduler.return in

  let oc = De.bigstring_create De.io_buffer_size in
  let zw = De.make_window ~bits:15 in
  let tp = ref (Bigstringaf.create (Int64.to_int block_size)) in
  let allocate _ = zw in
  First_pass.check_header scheduler read fd >>= fun (max, _, _) ->
  let decoder = First_pass.decoder ~o:oc ~allocate `Manual in
  let children = Hashtbl.create 0x100 in
  let where = Hashtbl.create 0x100 in
  let weight = Hashtbl.create 0x100 in
  let length = Hashtbl.create 0x100 in
  let carbon = Hashtbl.create 0x100 in
  let matrix = Array.make max Verify.unresolved_node in
  let sector = ref 1 in

  let rec go decoder =
    match First_pass.decode decoder with
    | `Await decoder -> (
        Log.debug (fun m -> m "`Await") ;
        let offset = Int64.mul (Int64.of_int !sector) block_size in
        match get_block fd offset !tp 0 (Int64.to_int block_size) with
        | Ok () ->
            incr sector ;
            go (First_pass.src decoder !tp 0 (Int64.to_int block_size))
        | _ -> failwith "Block: analyze(): Cannot read ~sector:%d" !sector)
    | `Peek decoder -> (
        Log.debug (fun m -> m "`Peek") ;
        let offset = Int64.mul (Int64.of_int !sector) block_size in
        let keep = First_pass.src_rem decoder in
        let tp' = Bigstringaf.create (keep + Int64.to_int block_size) in
        Bigstringaf.blit !tp ~src_off:0 tp' ~dst_off:0 ~len:keep ;
        match get_block fd offset tp' keep (Int64.to_int block_size) with
        | Ok () ->
            incr sector ;
            tp := tp' ;
            go (First_pass.src decoder tp' 0 (keep + Int64.to_int block_size))
        | _ -> failwith "Block: analyze(): Cannot read ~sector:%d" !sector)
    | `Entry ({ First_pass.kind = Base _; offset; size; consumed; _ }, decoder)
      ->
        Log.debug (fun m -> m "[+] base object") ;
        let offset = Int64.add offset (Int64.of_int (SHA1.length + 8)) in
        let n = First_pass.count decoder - 1 in
        Hashtbl.add weight offset size ;
        Hashtbl.add length offset size ;
        Hashtbl.add carbon offset consumed ;
        Hashtbl.add where offset n ;
        matrix.(n) <- Verify.unresolved_base ~cursor:offset ;
        go decoder
    | `Entry
        ( {
            First_pass.kind = Ofs { sub = s; source; target };
            offset;
            size;
            consumed;
            _;
          },
          decoder ) ->
        Log.debug (fun m -> m "[+] ofs object") ;
        let offset = Int64.add offset (Int64.of_int (SHA1.length + 8)) in
        let n = First_pass.count decoder - 1 in
        replace weight Int64.(sub offset (Int64.of_int s)) source ;
        replace weight offset target ;
        Hashtbl.add length offset size ;
        Hashtbl.add carbon offset consumed ;
        Hashtbl.add where offset n ;
        (try
           let vs = Hashtbl.find children (`Ofs Int64.(sub offset (of_int s))) in
           Hashtbl.replace children
             (`Ofs Int64.(sub offset (of_int s)))
             (offset :: vs)
         with _ ->
           Hashtbl.add children (`Ofs Int64.(sub offset (of_int s))) [ offset ]) ;
        go decoder
    | `Entry
        ( {
            First_pass.kind = Ref { ptr; target; source };
            offset;
            size;
            consumed;
            _;
          },
          decoder ) ->
        Log.debug (fun m -> m "[+] ref object") ;
        let offset = Int64.add offset (Int64.of_int (SHA1.length + 8)) in
        let n = First_pass.count decoder - 1 in
        replace weight offset (Stdlib.max target source) ;
        Hashtbl.add length offset size ;
        Hashtbl.add carbon offset consumed ;
        Hashtbl.add where offset n ;
        (try
           let vs = Hashtbl.find children (`Ref ptr) in
           Hashtbl.replace children (`Ref ptr) (offset :: vs)
         with _ -> Hashtbl.add children (`Ref ptr) [ offset ]) ;
        go decoder
    | `End _hash ->
        let where ~cursor = Hashtbl.find where cursor in
        let children ~cursor ~uid =
          match
            ( Hashtbl.find_opt children (`Ofs cursor),
              Hashtbl.find_opt children (`Ref uid) )
          with
          | Some a, Some b -> List.sort_uniq compare (a @ b)
          | Some x, None | None, Some x -> x
          | None, None -> [] in
        let weight ~cursor = Hashtbl.find weight cursor in
        let oracle = { Carton.Dec.where; children; digest; weight } in
        return (matrix, oracle)
    | `Malformed err -> failwith "Block: analyze(): %s" err in
  match get_block fd 0L !tp 0 (Int64.to_int block_size) with
  | Ok () ->
      let decoder =
        First_pass.src decoder !tp (SHA1.length + 8)
          (Int64.to_int block_size - SHA1.length - 8) in
      go decoder
  | _ -> failwith "Block: analyze(): Cannot read ~sector:%d" 0

let map fd ~block_size ~get_block ~pos len =
  assert (len <= Int64.to_int block_size) ;
  assert (Int64.logand pos (Int64.pred block_size) = 0L) ;
  let len = Int64.to_int block_size in
  let res = Bigstringaf.create len in
  match get_block fd pos res 0 len with Ok () -> res | Error _ -> assert false

type a_and_b = [ `A | `B ]

module Commit = Git.Commit.Make (Git.Hash.Make (SHA1))
module Tree = Git.Tree.Make (Git.Hash.Make (SHA1))

let load ~block_size ~get_block pack uid =
  let open Rresult in
  let map = map ~block_size ~get_block in
  let path = Carton.Dec.path_of_uid ~map pack uid in
  match Carton.Dec.kind_of_path path with
  | `C -> R.ok `Blob
  | `D -> R.ok `Tag
  | #a_and_b as kind -> (
      let cursor = List.hd (Carton.Dec.path_to_list path) in
      let weight =
        Carton.Dec.weight_of_offset ~map pack ~weight:Carton.Dec.null cursor
      in
      let raw = Carton.Dec.make_raw ~weight in
      let v = Carton.Dec.of_offset_with_path ~map pack ~path raw ~cursor in
      match kind with
      | `A ->
          let parser = Encore.to_angstrom Commit.format in
          Angstrom.parse_bigstring ~consume:All parser
            (Bigstringaf.sub (Carton.Dec.raw v) ~off:0 ~len:(Carton.Dec.len v))
          |> R.reword_error (fun _ -> R.msgf "Invalid commit (%a)" SHA1.pp uid)
          >>| fun v -> `Commit v
      | `B ->
          let parser = Encore.to_angstrom Tree.format in
          Angstrom.parse_bigstring ~consume:All parser
            (Bigstringaf.sub (Carton.Dec.raw v) ~off:0 ~len:(Carton.Dec.len v))
          |> R.reword_error (fun _ -> R.msgf "Invalid tree (%a)" SHA1.pp uid)
          >>| fun v -> `Tree v)

let rec fold ~block_size ~get_block pack directories files path hash =
  let open Rresult in
  load ~block_size ~get_block pack hash >>= function
  | `Tree tree ->
      let f a { Git.Tree.name; node; perm } =
        match (a, perm) with
        | (Error _ as err), _ -> err
        | Ok _, `Dir ->
            let path = Fpath.(path / name) in
            Art.insert directories (Art.key (Fpath.to_string path)) node ;
            fold ~block_size ~get_block pack directories files path node
        | Ok _, (`Everybody | `Normal) ->
            let path = Fpath.(path / name) in
            Art.insert files (Art.key (Fpath.to_string path)) node ;
            R.ok ()
        | (Ok _ as v), _ -> v in
      List.fold_left f (R.ok ()) (Git.Tree.to_list tree)
  | `Commit commit ->
      fold ~block_size ~get_block pack directories files path
        (Commit.tree commit)
  | `Blob | `Tag -> R.ok ()

type 'fd buffers = {
  z : Bigstringaf.t;
  allocate : int -> De.window;
  w : 'fd Carton.Dec.W.t;
}

let unpack fd ~read ~block_size ~get_block commit =
  let open Lwt.Infix in
  let map = map ~block_size ~get_block in
  Log.debug (fun m -> m "Start to analyze the PACK file.") ;
  first_pass ~read ~block_size ~get_block fd |> prj >>= fun (matrix, oracle) ->
  let z = De.bigstring_create De.io_buffer_size in
  let allocate bits = De.make_window ~bits in
  let never _ = assert false in
  let pack =
    Carton.Dec.make ~sector:block_size fd ~allocate ~z ~uid_ln:SHA1.length
      ~uid_rw:SHA1.of_raw_string never in
  Verify.verify ~threads:1 pack ~map ~oracle ~verbose:ignore ~matrix
  >>= fun () ->
  match Array.for_all Verify.is_resolved matrix with
  | false -> Lwt.return_error `Invalid_store
  | true -> (
      let index = Hashtbl.create (Array.length matrix) in
      let iter v =
        let offset = Verify.offset_of_status v in
        let hash = Verify.uid_of_status v in
        Hashtbl.add index hash offset in
      Array.iter iter matrix ;
      let pack =
        Carton.Dec.make ~sector:block_size fd ~allocate ~z ~uid_ln:SHA1.length
          ~uid_rw:SHA1.of_raw_string (Hashtbl.find index) in
      let directories = Art.make () in
      let files = Art.make () in
      match
        fold ~block_size ~get_block pack directories files (Fpath.v "/") commit
      with
      | Ok () ->
          let buffers =
            Lwt_pool.create 4 @@ fun () ->
            let z = Bigstringaf.create De.io_buffer_size in
            let w = De.make_window ~bits:15 in
            let allocate _ = w in
            let w = Carton.Dec.W.make ~sector:block_size fd in
            Lwt.return { z; allocate; w } in
          Lwt.return_ok (buffers, pack, directories, files)
      | Error _ as err -> Lwt.return err)

let read fd ~get_block offset bs =
  let rec go offset = function
    | [] -> ()
    | x :: r ->
    match get_block fd offset x 0 (Bigstringaf.length x) with
    | Ok () -> go (Int64.add offset (Int64.of_int (Bigstringaf.length x))) r
    | Error _ -> failwith "Block: iter(): Cannot read at %Ld" offset in
  go offset bs

let rec split ~block_size index off acc =
  if off = Bigstringaf.length index
  then List.rev acc
  else
    let block = Bigstringaf.sub index ~off ~len:(Int64.to_int block_size) in
    split ~block_size index (off + Int64.to_int block_size) (block :: acc)

let iter fd ~block_size ~capacity ~get_block commit cursor =
  let index = Bigstringaf.create (Int64.to_int (Int64.sub capacity cursor)) in
  let blocks = split ~block_size index 0 [] in
  read fd ~get_block cursor blocks ;
  let index =
    Carton.Dec.Idx.make index ~uid_ln:SHA1.digest_size
      ~uid_rw:SHA1.to_raw_string ~uid_wr:SHA1.of_raw_string in
  let z = Bigstringaf.create De.io_buffer_size in
  let zw = De.make_window ~bits:15 in
  let allocate _ = zw in
  let find uid =
    match Carton.Dec.Idx.find index uid with
    | Some (_, offset) -> Int64.add (Int64.of_int (SHA1.digest_size + 8)) offset
    | None -> failwith "%a does not exist" SHA1.pp uid in
  let pack =
    Carton.Dec.make ~sector:block_size fd ~allocate ~z ~uid_ln:SHA1.length
      ~uid_rw:SHA1.of_raw_string find in
  let directories = Art.make () in
  let files = Art.make () in
  match
    fold ~block_size ~get_block pack directories files (Fpath.v "/") commit
  with
  | Ok () ->
      let buffers =
        Lwt_pool.create 4 @@ fun () ->
        let z = Bigstringaf.create De.io_buffer_size in
        let w = De.make_window ~bits:15 in
        let allocate _ = w in
        let w = Carton.Dec.W.make ~sector:block_size fd in
        Lwt.return { z; allocate; w } in
      Lwt.return_ok (buffers, pack, directories, files)
  | Error _ as err -> Lwt.return err
OCaml

Innovation. Community. Security.