package biocaml

  1. Overview
  2. Docs
Legend:
Page
Library
Module
Module type
Parameter
Class
Class type
Source

Source file pwm.ml

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92

open CFStream

type count_matrix = int array array
type background = float array
type t = float array array

let int_of_char = function
  | 'a' | 'A' -> 0
  | 'c' | 'C' -> 1
  | 'g' | 'G' -> 2
  | 't' | 'T' -> 3
  | _ -> 4

let flat_background () = Caml.Array.make 4 0.25

let background_of_sequence seq pc =
  let counts = Caml.Array.make 4 0
  and n = ref 0 in
  for i = 0 to String.length seq - 1 do
    let k = int_of_char seq.[i] in
    if k < 4 then (
      counts.(k) <- counts.(k) + 1 ;
      incr n
    )
  done ;
  Array.map ~f:(fun c -> (float c +. pc) /. (float !n +. 4. *. pc)) counts

let reverse_complement a =
  let open Array in
  let n = length a in
  init n ~f:(fun i ->
      let r = copy a.(n - 1 - i) in
      swap r 0 3 ;
      swap r 1 2 ;
      r
    )


let make mat bg =
  let open Array in
  map mat ~f:(fun p ->
      let n = fold ~f:( + ) ~init:0 p in
      let r =
        mapi p ~f:(fun i x ->
            log ((float x +. bg.(i)) /. float n /. bg.(i))
          )
      in
      let n_case =
        Stream.Infix.(0 --^ (Array.length p))
        |> Stream.fold ~f:(fun accu i -> accu +. bg.(i) *. r.(i)) ~init:0. in
      append r [| n_case |])

let tandem ?(orientation = `direct) ~spacer mat1 mat2 bg =
  Array.concat [
    (if Poly.(orientation = `everted) then reverse_complement else Fun.id) (make mat1 bg) ;
    Array.init spacer ~f:(fun _ -> Caml.Array.make 5 0.) ;
    (if Poly.(orientation = `inverted) then reverse_complement else Fun.id) (make mat2 bg)
  ]


let gen_scan f init mat seq tol =
  let r = ref init
  and n = String.length seq
  and m = Array.length mat in
  let seq = Array.init n ~f:(fun i -> int_of_char seq.[i]) in
  for i = n - m downto 0 do
    let score = ref 0. in
    for j = 0 to m - 1 do
      score := !score +. Array.(unsafe_get (unsafe_get mat j) (unsafe_get seq (i + j)))
    done ;
    if Float.(!score > tol)
    then r := f i !score !r
  done ;
  !r

let scan = gen_scan (fun pos score l -> (pos, score) :: l) []

let best_hit mat seq =
  let (pos, _) as r =
    gen_scan (fun p1 s1 ((_, s2) as r2) -> if Float.(s1 > s2) then (p1, s1) else r2) (-1, Float.neg_infinity) mat seq Float.neg_infinity
  in
  if pos < 0 then raise (Invalid_argument "Pwm.best_hit: sequence shorter than the matrix")
  else r


external stub_fast_scan : t -> int array -> float -> (int * float) list = "biocaml_pwm_scan"

let fast_scan mat seq tol =
  let n = String.length seq in
  let seq = Array.init n ~f:(fun i -> int_of_char seq.[i]) in
  stub_fast_scan mat seq tol
OCaml

Innovation. Community. Security.