Source file jaspar.ml
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
open CFStream
(* Path-joining infix operator: [dir / file] is [Filename.concat dir file].
   Defining it shadows the integer-division operator for the rest of this
   file. *)
let ( / ) dir file = Filename.concat dir file
(** Collection a motif belongs to. The textual names accepted for each
    constructor are listed in [collection_of_string]. *)
type collection =
  | Core
  | Phylofacts
  | CNE
  | PBM
  | PBM_HOMEO
  | PBM_HLH
  | FAM
  | SPLICE
  | POLII
(** A motif assembled by [load] from the [MATRIX.txt], [MATRIX_DATA.txt] and
    [MATRIX_ANNOTATION.txt] files of a directory. *)
type motif = {
  id : string ;
  (** Key of the motif: first field of its [MATRIX.txt] line. *)
  jaspar_id : string ;
  (** Third field of the [MATRIX.txt] line. *)
  collection : collection ;
  (** Second field of the [MATRIX.txt] line, decoded by
      [collection_of_string]. *)
  factor_name : string ;
  (** Fifth field of the [MATRIX.txt] line. *)
  factor_class : string ;
  (** Value of the ["class"] annotation (mandatory). *)
  family : string option ;
  (** Value of the ["family"] annotation, if any. *)
  comment : string option ;
  (** Value of the ["comment"] annotation; a value of ["-"] is turned into
      [None]. *)
  medline : string ;
  (** Value of the ["medline"] annotation (mandatory). *)
  matrix : int array array ;
  (** Counts read from [MATRIX_DATA.txt]: one inner array per matrix column
      (ordered by column index), each holding the counts of that column
      ordered by base name. *)
}
(** [collection_of_string s] converts the upper-case collection name [s]
    (e.g. ["CORE"], ["PBM_HOMEO"]) into the matching [collection] value.
    The comparison is exact: no case folding or trimming is performed.
    @raise Failure if [s] is not one of the recognised names. *)
let collection_of_string = function
  | "CNE" -> CNE
  | "FAM" -> FAM
  | "PHYLOFACTS" -> Phylofacts
  | "CORE" -> Core
  | "PBM" -> PBM
  | "PBM_HOMEO" -> PBM_HOMEO
  | "PBM_HLH" -> PBM_HLH
  | "SPLICE" -> SPLICE
  | "POLII" -> POLII
  (* The message is prefixed with the module name so that the origin of the
     failure can be identified from the exception text alone. *)
  | s -> failwithf "Jaspar.collection_of_string: unknown collection %s" s ()
(** [fold_data_file name ~init ~f] opens the file [name], splits each of its
    lines on tab characters and folds [f] over the resulting field lists,
    starting from [init]. The whole fold runs inside
    [In_channel.with_file], i.e. while the channel is open. *)
let fold_data_file name ~init ~f =
  (* A line is a private string, hence the explicit coercion before
     splitting. *)
  let fields_of_line line = String.split ~on:'\t' (line : Line.t :> string) in
  let fold_channel ic =
    Stream.fold
      (Lines.of_channel ic)
      ~init
      ~f:(fun acc line -> f acc (fields_of_line line))
  in
  In_channel.with_file name ~f:fold_channel
(** [load_matrix fn] reads [MATRIX.txt] in directory [fn] and returns a map
    from the first field of each line to an object exposing the
    [collection], [jaspar_id] and [factor_name] of that line (fields 2, 3
    and 5; field 4 is ignored). When two lines share the same first field
    the later one wins.

    The collection name is only decoded when the [collection] method is
    invoked, so an unknown name makes that call fail, not [load_matrix]
    itself. A line that does not have exactly five tab-separated fields
    triggers [assert false]. *)
let load_matrix fn =
  let add_entry table = function
    | [ db_id ; collection ; jaspar_id ; _ ; factor_name ] ->
      let entry =
        object
          method collection = collection_of_string collection
          method jaspar_id = jaspar_id
          method factor_name = factor_name
        end
      in
      String.Map.set table ~key:db_id ~data:entry
    | _ -> assert false
  in
  fold_data_file (fn / "MATRIX.txt") ~init:String.Map.empty ~f:add_entry
(** [load_matrix_data fn] reads [MATRIX_DATA.txt] in directory [fn], skipping
    its first line, and returns a map from matrix id to count matrix.

    Every remaining line must consist of four tab-separated fields: id,
    base, column index and count ([assert false] otherwise). Lines are
    grouped by id; within a matrix they are grouped by column index, and
    within a column the counts are ordered by base. Counts are read with
    [Float.of_string] and converted with [Int.of_float]. *)
let load_matrix_data fn =
  (* One parsed line. The count is kept as text here and only converted
     when the column vector is built. *)
  let row_of_fields = function
    | [ id ; base ; col ; count ] ->
      let col = int_of_string col in
      object
        method id = id
        method base = base
        method col = col
        method count = count
      end
    | _ -> assert false
  in
  (* Counts of a single column, ordered by base. *)
  let column_counts rows =
    let by_base r1 r2 = String.compare r1#base r2#base in
    let count_of row = Int.of_float (Float.of_string row#count) in
    Array.of_list (List.map (List.sort rows ~compare:by_base) ~f:count_of)
  in
  (* All rows of one matrix (a non-empty list) -> (id, columns). *)
  let matrix_of_rows rows =
    let id = (List.hd_exn rows)#id in
    let columns =
      rows
      |> List.sort ~compare:(fun r1 r2 -> Int.compare r1#col r2#col)
      |> List.group ~break:(fun r1 r2 -> r1#col <> r2#col)
      |> List.map ~f:column_counts
    in
    id, Array.of_list columns
  in
  (* Raw field lists are ordered and grouped on their first field, the
     matrix id. *)
  let compare_ids fields1 fields2 =
    Option.compare String.compare (List.hd fields1) (List.hd fields2)
  in
  let matrices_of_channel ic =
    Lines.of_channel ic
    |> Stream.skip ~n:1
    |> Stream.map ~f:(Line.split ~on:'\t')
    |> Stream.to_list
    |> List.sort ~compare:compare_ids
    |> List.group ~break:(fun fields1 fields2 -> compare_ids fields1 fields2 <> 0)
    (* Every line is parsed before any matrix is assembled. *)
    |> List.map ~f:(List.map ~f:row_of_fields)
    |> List.map ~f:matrix_of_rows
  in
  In_channel.with_file (fn / "MATRIX_DATA.txt") ~f:matrices_of_channel
  |> String.Map.of_alist_exn
(* Pairs of strings equipped with comparison support, built from Core's
   [Tuple] functors. Used in this file as the key type of annotation
   lookups, where the pair is (id, field name). *)
module SS = struct
include Tuple.Make(String)(String)
include Tuple.Comparable(String)(String)
end
(* Maps keyed by [SS.t] values; in this file the keys are (id, field name)
   pairs and the values are annotation strings. *)
module SSM = Map.Make(SS)
(** [load_annotation fn] reads [MATRIX_ANNOTATION.txt] in directory [fn] and
    returns a map from [(id, field)] pairs to the annotation value, taken
    from the first three tab-separated fields of each line (extra fields
    are ignored). A later line with the same pair replaces the earlier
    binding. A line with fewer than three fields triggers [assert false]. *)
let load_annotation fn =
  let record table = function
    | id :: field :: data :: _ -> SSM.set table ~key:(id, field) ~data
    | _ -> assert false
  in
  fold_data_file (fn / "MATRIX_ANNOTATION.txt") ~init:SSM.empty ~f:record
(** [load fn] builds the list of motifs described by the [MATRIX.txt],
    [MATRIX_DATA.txt] and [MATRIX_ANNOTATION.txt] files of directory [fn],
    one motif per entry of [MATRIX.txt], in the order given by
    [String.Map.data].

    The ["class"] and ["medline"] annotations and the count matrix are
    looked up with [find_exn], so a motif lacking any of them makes [load]
    raise. ["family"] and ["comment"] are optional, and a ["comment"] equal
    to ["-"] is reported as [None]. *)
let load fn =
  let matrix = load_matrix fn in
  let matrix_data = load_matrix_data fn in
  let annotations = load_annotation fn in
  (* Annotation lookups for a given motif key. *)
  let optional key field = SSM.find annotations (key, field) in
  let required key field = SSM.find_exn annotations (key, field) in
  let drop_dash = function
    | Some "-" -> None
    | other -> other
  in
  let motif_of_entry ~key ~data = {
    id = key ;
    jaspar_id = data#jaspar_id ;
    collection = data#collection ;
    factor_name = data#factor_name ;
    factor_class = required key "class" ;
    comment = drop_dash (optional key "comment") ;
    family = optional key "family" ;
    medline = required key "medline" ;
    matrix = String.Map.find_exn matrix_data key ;
  }
  in
  String.Map.mapi matrix ~f:motif_of_entry
  |> String.Map.data