Legend:
Page
Library
Module
Module type
Parameter
Class
Class type
Source
Source file gff.ml
(* https://github.com/The-Sequence-Ontology/Specifications/blob/master/gff3.md *)

(** One feature line: the nine tab-separated columns of a GFF/GTF row.
    Columns holding ["."] in the file are represented as [None]
    (for [strand], as [`Not_stranded]). *)
type record = {
  seqname : string ;
  source : string option ;
  feature : string option ;
  start_pos : int ;
  stop_pos : int ;
  score : float option ;
  strand : [`Plus | `Minus | `Not_stranded | `Unknown] ;
  phase : int option ;
  attributes : (string * string list) list ;
}

(** A line of the file: either a comment (text after the leading ['#'])
    or a feature record. *)
type item = [ `Comment of string | `Record of record ]

(** Record constructor with optional columns; [strand] defaults to
    [`Unknown] and [attributes] to the empty list. *)
let record ?source ?feature ?score ?(strand = `Unknown) ?phase ?(attributes = []) seqname start_pos stop_pos = {
  seqname ;
  source ;
  feature ;
  start_pos ;
  stop_pos ;
  score ;
  strand ;
  phase ;
  attributes ;
}

(** [fail s] is [Error (`Msg s)]. *)
let fail s = Error (`Msg s)

(** Printf-style variant of {!fail}. *)
let failf fmt = Printf.ksprintf fail fmt

(** Parses a float, returning [Error (`Msg _)] on malformed input.
    Both [Failure] and [Invalid_argument] are caught, because the
    exception raised by [Float.of_string] depends on the standard library
    in scope (Base/Core report conversion errors with
    [Invalid_argument]). *)
let parse_float s =
  try Ok (Float.of_string s)
  with Failure msg | Invalid_argument msg -> fail msg

(** Parses an integer, returning [Error (`Msg _)] on malformed input
    (same exception handling as {!parse_float}). *)
let parse_int s =
  try Ok (Int.of_string s)
  with Failure msg | Invalid_argument msg -> fail msg

(** [parse_opt f s] is [None] for the placeholder ["."], [Some (f s)]
    otherwise. *)
let parse_opt f = function
  | "." -> None
  | s -> Some (f s)

(** Same as {!parse_opt} for a parser [f] that returns a result. *)
let parse_opt' f = function
  | "." -> Ok None
  | s -> Result.(f s >>| Option.some)

(** Decodes the strand column. *)
let parse_strand = function
  | "." -> Ok `Not_stranded
  | "?" -> Ok `Unknown
  | "+" -> Ok `Plus
  | "-" -> Ok `Minus
  | _ -> Error (`Msg "Incorrect strand character")

(** [parse_tag pos buf] reads a GFF3 attribute tag starting at [pos] and
    running up to the next ['=']. Returns the position just after the
    ['='] together with the tag. *)
let parse_tag pos buf =
  match String.index_from buf pos '=' with
  | None -> fail "Tag without a value"
  | Some k -> Ok (k + 1, String.sub buf ~pos ~len:(k - pos))

let%test "Gff.parse_tag" =
  Caml.(parse_tag 0 "gene_id=foo" = Ok (8, "gene_id"))

(** [lfind_mapi ?pos s ~f] scans [s] left to right from [pos] (default 0)
    and returns the first [Some _] produced by [f index char], or [None]
    if there is none. *)
let lfind_mapi ?(pos = 0) s ~f =
  let n = String.length s in
  let rec loop i =
    if i < n then
      match f i s.[i] with
      | None -> loop (i + 1)
      | Some y -> Some y
    else None
  in
  loop pos

(** [parse_value_list pos buf acc] reads comma-separated values starting
    at [pos], stopping at the first [';'] or at the end of [buf].
    Returns the position where parsing should resume (just after the
    [';'], or [String.length buf]) and the values in order of
    appearance, preceded by the reversed contents of [acc]. *)
let rec parse_value_list pos buf acc =
  let comma_or_semi_colon i = function
    | ',' -> Some (i, `Comma)
    | ';' -> Some (i, `Semi_colon)
    | _ -> None
  in
  match lfind_mapi ~pos buf ~f:comma_or_semi_colon with
  | None ->
    let n = String.length buf in
    let value = String.sub buf ~pos ~len:(n - pos) in
    n, List.rev (value :: acc)
  | Some (k, `Comma) ->
    let value = String.sub buf ~pos ~len:(k - pos) in
    parse_value_list (k + 1) buf (value :: acc)
  | Some (k, `Semi_colon) ->
    let value = String.sub buf ~pos ~len:(k - pos) in
    k + 1, List.rev (value :: acc)

let test_parse_value buf y =
  let n, x = parse_value_list 0 buf [] in
  Caml.((n, x) = (String.length buf, y))

let%test "parse_value_list1" =
  test_parse_value
    "region_id=chr1:3008683-3009183"
    [ "region_id=chr1:3008683-3009183" ]

let%test "parse_value_list2" =
  test_parse_value "3,4" [ "3" ; "4" ]

(* Worker: alternates between reading a tag and its value list until the
   buffer is exhausted. *)
let rec parse_gff3_attributes pos buf acc =
  let open Result in
  if pos >= String.length buf then Ok (List.rev acc)
  else
    parse_tag pos buf >>= fun (pos, tag) ->
    let pos, values = parse_value_list pos buf [] in
    let acc = (tag, values) :: acc in
    parse_gff3_attributes pos buf acc

(** Parses a GFF3 attribute column of the form [tag=v1,v2;tag2=v3].
    NOTE(review): values are returned as found in the file; no
    percent-decoding is applied here. *)
let parse_gff3_attributes buf =
  parse_gff3_attributes 0 buf []

let%test "Gff.parse_gff3_attributes" =
  Caml.(
    parse_gff3_attributes "a=2,3;b=4"
    = Ok [ ("a", [ "2" ; "3" ]) ; ("b", [ "4" ]) ]
  )

(** Parses a GTF attribute column of the form [key "value"; key2 value2;].
    The buffer is first split into tokens (bare tokens, double-quoted
    strings and semicolons), then tokens are grouped into
    [key value] pairs separated by semicolons. *)
let parse_gtf_attributes buf =
  let open Result.Monad_infix in
  let rec tokenize acc p =
    if p >= String.length buf then Ok (List.rev acc)
    else
      match buf.[p] with
      | '\t' -> fail "Unexpected tab character"
      | '\n' -> fail "Unexpected EOL character"
      | ' ' -> tokenize acc (p + 1)
      | ';' -> tokenize (`SEMICOLON :: acc) (p + 1)
      | '"' ->
        next_quote (p + 1) >>= fun q ->
        let len = q - p - 1 in
        tokenize (`QUOTED (p + 1, len) :: acc) (q + 1)
      | _ ->
        token_end p >>= fun q ->
        tokenize (`TOKEN (p, q - p + 1) :: acc) (q + 1)
  (* position of the closing double quote at or after [p] *)
  and next_quote p =
    if p >= String.length buf then
      fail "Reached end of string but expected dquote"
    else
      match buf.[p] with
      | '"' -> Ok p
      | _ -> next_quote (p + 1)
  (* position of the last character of the bare token containing [p] *)
  and token_end p =
    if p >= String.length buf then Ok (p - 1)
    else
      match buf.[p] with
      (* A bare token also ends at ';', so that an unquoted value such as
         [exon_number 1;] does not swallow its terminator. [tokenize]
         never calls this on a ';', hence the token is never empty. *)
      | ' ' | ';' -> Ok (p - 1)
      | _ -> token_end (p + 1)
  in
  let rec attribute acc = function
    | `TOKEN (p, q) :: (`QUOTED (r, s) | `TOKEN (r, s)) :: rest ->
      let att = String.sub buf ~pos:p ~len:q, [ String.sub buf ~pos:r ~len:s ] in
      attribute_tail (att :: acc) rest
    | _ -> failf "Cannot parse attributes: %s" buf
  and attribute_tail acc = function
    | []
    | [ `SEMICOLON ] -> Ok (List.rev acc)
    | `SEMICOLON :: rest -> attribute acc rest
    | _ -> failf "Cannot parse attributes: %s" buf
  in
  tokenize [] 0 >>= fun tokens ->
  attribute [] tokens

let%test "Gff.parse_gtf_attributes1" =
  Caml.(
    parse_gtf_attributes {|gene_id "FBgn0031081"|}
    = Ok [ ("gene_id", [ "FBgn0031081" ]) ]
  )

let%test "Gff.parse_gtf_attributes" =
  Caml.(
    parse_gtf_attributes {|gene_id "FBgn0031081"; gene_symbol "Nep3"; transcript_id "FBtr0070000"; transcript_symbol "Nep3-RA";|}
    = Ok [
      ("gene_id", [ "FBgn0031081" ]) ;
      ("gene_symbol", [ "Nep3" ]) ;
      ("transcript_id", [ "FBtr0070000" ]) ;
      ("transcript_symbol", [ "Nep3-RA" ]) ;
    ]
  )

let%test "Gff.parse_gtf_attributes_unquoted" =
  Caml.(
    parse_gtf_attributes {|gene_id "g1"; exon_number 1;|}
    = Ok [ ("gene_id", [ "g1" ]) ; ("exon_number", [ "1" ]) ]
  )

let%test "Gff.parse_float_invalid" =
  match parse_float "not_a_float" with
  | Error (`Msg _) -> true
  | Ok _ -> false

(** Builds a record from the nine columns of a feature line, using
    [parse_attributes] to decode the last one. Fails if the number of
    columns is not nine or if a column cannot be decoded. *)
let parse_fields parse_attributes = function
  | [ seqname ; source ; feature ; start_pos ; stop_pos ; score ; strand ; phase ; attributes ] ->
    let open Result in
    parse_int start_pos >>= fun start_pos ->
    parse_int stop_pos >>= fun stop_pos ->
    parse_opt' parse_int phase >>= fun phase ->
    parse_opt' parse_float score >>= fun score ->
    parse_strand strand >>= fun strand ->
    parse_attributes attributes >>= fun attributes ->
    Ok {
      seqname ;
      source = parse_opt Fn.id source ;
      feature = parse_opt Fn.id feature ;
      start_pos ;
      stop_pos ;
      score ;
      strand ;
      phase ;
      attributes ;
    }
  | _ -> fail "Incorrect number of fields"

(** Decodes one line: a line starting with ['#'] becomes a [`Comment]
    (without the ['#']), any other non-empty line is split on tabs and
    parsed as a [`Record]. *)
let item_of_line parse_attributes line =
  match (line : Line.t :> string) with
  | "" -> fail "Empty line"
  | line ->
    if Char.(line.[0] = '#') then
      Ok (`Comment (String.sub line ~pos:1 ~len:(String.length line - 1)))
    else
      let open Result in
      let fields = String.split ~on:'\t' line in
      parse_fields parse_attributes fields >>| fun r ->
      `Record r

let gff3_item_of_line line =
  item_of_line parse_gff3_attributes line

let gtf_item_of_line line =
  item_of_line parse_gtf_attributes line

(** Serialises an item. With [`three], the source column and the
    attribute keys/values go through [Uri.pct_encode]; with [`two], the
    source column and the attribute values are written as quoted OCaml
    string literals (["%S"]). *)
let line_of_item version = function
  | `Comment c -> Line.of_string_unsafe ("#" ^ c)
  | `Record t ->
    let escape =
      match version with
      | `three -> (fun s -> Uri.pct_encode s)
      | `two -> sprintf "%S"
    in
    let optescape o = Option.value_map ~default:"." o ~f:escape in
    String.concat ~sep:"\t" [
      t.seqname ;
      optescape t.source ;
      Option.value ~default:"." t.feature ;
      Int.to_string t.start_pos ;
      Int.to_string t.stop_pos ;
      Option.value_map ~default:"." ~f:(sprintf "%g") t.score ;
      (match t.strand with
       | `Plus -> "+"
       | `Minus -> "-"
       | `Not_stranded -> "."
       | `Unknown -> "?") ;
      Option.value_map ~default:"." ~f:(sprintf "%d") t.phase ;
      String.concat ~sep:";" (
        List.map t.attributes ~f:(fun (k, v) ->
            match version with
            | `three ->
              sprintf "%s=%s"
                (Uri.pct_encode k)
                (List.map v ~f:Uri.pct_encode |> String.concat ~sep:",")
            | `two ->
              sprintf "%s %s" k
                (List.map v ~f:escape |> String.concat ~sep:",")
          )
      ) ;
    ]
    |> Line.of_string_unsafe