Legend:
Page
Library
Module
Module type
Parameter
Class
Class type
Source
Source file utf8_string.ml
123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538539540541542543544545546547548549550551552553554555556557558559560561562563564565566567568569570571572573574575576(* Copyright (C) 2015-2016 Bloomberg Finance L.P.
*
* This program is free software: you can redistribute it and/or modify
* it under the terms of the GNU Lesser General Public License as published by
* the Free Software Foundation, either version 3 of the License, or
* (at your option) any later version.
*
* In addition to the permissions granted to you by the LGPL, you may combine
* or link a "work that uses the Library" with a publicly distributed version
* of this file to produce a combined library or application, then distribute
* that combined work under the terms of your choosing, with no requirement
* to comply with the obligations normally placed on you by section 4 of the
* LGPL version 3 (or the corresponding section of a later version of the LGPL
* should you choose to use a later version).
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public License
* along with this program; if not, write to the Free Software
* Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. *)openPpxlibletvalid_hexx=matchxwith'0'..'9'|'a'..'f'|'A'..'F'->true|_->falseletmerge_loc(l:location)(r:location)=ifl.loc_ghostthenrelseifr.loc_ghostthenlelsematch(l,r)with|{loc_start;_},{loc_end;_}(* TODO: improve*)->{loc_start;loc_end;loc_ghost=false}moduleUtf8_string=structtypeerror=|Invalid_code_point|Unterminated_backslash|Invalid_hex_escape|Invalid_unicode_escapeletpp_errorfmterr=Format.pp_print_stringfmt@@matcherrwith|Invalid_code_point->"Invalid code point"|Unterminated_backslash->"\\ ended unexpectedly"|Invalid_hex_escape->"Invalid \\x escape"|Invalid_unicode_escape->"Invalid \\u escape"typeexn+=Errorofint(* offset *)*errorleterror~locerror=raise(Error(loc,error))(* let error ~loc ~pos error = *)(* [%expr *)(* [%ocaml.error *)(* [%e *)(* Ast_helper.Exp.constant *)(* (Pconst_string (Format.asprintf "%a" pp_error error, loc, None))]]] *)(* Note the [loc] really should be the utf8-offset, it has nothing to do with our
escaping mechanism
*)(* we can not just print new line in ES5
seems we don't need
escape "\b" "\f"
we need escape "\n" "\r" since
ocaml multiple-line allows [\n]
visual input while es5 string
does not*)letreccheck_and_transform(loc:int)(buf:Buffer.t)(s:string)(byte_offset:int)(s_len:int)=ifbyte_offset=s_lenthen()elseletcurrent_char=s.[byte_offset]inmatchAst_utf8_string.classifycurrent_charwith|Single92(* '\\' *)->escape_code(loc+1)bufs(byte_offset+1)s_len|Single34->Buffer.add_stringbuf"\\\"";check_and_transform(loc+1)bufs(byte_offset+1)s_len|Single10->Buffer.add_stringbuf"\\n";check_and_transform(loc+1)bufs(byte_offset+1)s_len|Single13->Buffer.add_stringbuf"\\r";check_and_transform(loc+1)bufs(byte_offset+1)s_len|Single_->Buffer.add_charbufcurrent_char;check_and_transform(loc+1)bufs(byte_offset+1)s_len|Invalid|Cont_->error~locInvalid_code_point|Leading(n,_)->leti'=Ast_utf8_string.nexts~remaining:nbyte_offsetinifi'<0thenerror~locInvalid_code_pointelse(fork=byte_offsettoi'doBuffer.add_charbufs.[k]done;check_and_transform(loc+1)bufs(i'+1)s_len)(* we share the same escape sequence with js *)andescape_codelocbufsoffsets_len=ifoffset>=s_lenthenerror~locUnterminated_backslashelseBuffer.add_charbuf'\\';letcur_char=s.[offset]inmatchcur_charwith|'\\'|'b'|'t'|'n'|'v'|'f'|'r'|'0'|'$'->Buffer.add_charbufcur_char;check_and_transform(loc+1)bufs(offset+1)s_len|'u'->Buffer.add_charbufcur_char;unicode(loc+1)bufs(offset+1)s_len|'x'->Buffer.add_charbufcur_char;two_hex(loc+1)bufs(offset+1)s_len|_->(* Regular characters, like `a` in `\a`,
* are valid escape sequences *)Buffer.add_charbufcur_char;check_and_transform(loc+1)bufs(offset+1)s_lenandtwo_hexlocbufsoffsets_len=ifoffset+1>=s_lenthenerror~locInvalid_hex_escape;(*Location.raise_errorf ~loc "\\x need at least two chars";*)leta,b=(s.[offset],s.[offset+1])inifvalid_hexa&&valid_hexbthen(Buffer.add_charbufa;Buffer.add_charbufb;check_and_transform(loc+2)bufs(offset+2)s_len)elseerror~locInvalid_hex_escape(*Location.raise_errorf ~loc "%c%c is not a valid hex code" a b*)andunicodelocbufsoffsets_len=ifoffset+3>=s_lenthenerror~locInvalid_unicode_escape(*Location.raise_errorf ~loc "\\u need at least four chars"*);leta0,a1,a2,a3=(s.[offset],s.[offset+1],s.[offset+2],s.[offset+3])inifvalid_hexa0&&valid_hexa1&&valid_hexa2&&valid_hexa3then(Buffer.add_charbufa0;Buffer.add_charbufa1;Buffer.add_charbufa2;Buffer.add_charbufa3;check_and_transform(loc+4)bufs(offset+4)s_len)elseerror~locInvalid_unicode_escape(*Location.raise_errorf ~loc "%c%c%c%c is not a valid unicode point"
a0 a1 a2 a3 *)(* http://www.2ality.com/2015/01/es6-strings.html
console.log('\uD83D\uDE80'); (* ES6*)
console.log('\u{1F680}');
*)lettransform_tests=lets_len=String.lengthsinletbuf=Buffer.create(s_len*2)incheck_and_transform0bufs0s_len;Buffer.contentsbuflettransforms=lets_len=String.lengthsinletbuf=Buffer.create(s_len*2)incheck_and_transform0bufs0s_len;Buffer.contentsbufendmoduleInterp=structtypeerror=|Invalid_code_point|Unterminated_backslash|Invalid_escape_codeofchar|Invalid_hex_escape|Invalid_unicode_escape|Unterminated_variable|Unmatched_paren|Invalid_syntax_of_varofstringtypekind=String|Varofint*int(* [Var (loffset, roffset)]
For parens it used to be (2,-1)
for non-parens it used to be (1,0)
*)(* Note the position is about code point *)typepos={lnum:int;offset:int;byte_bol:int;(* Note it actually needs to be in sync with OCaml's lexing semantics *)}typesegment={start:pos;finish:pos;kind:kind;content:string}typesegments=segmentlisttypecxt={mutablesegment_start:pos;buf:Buffer.t;s_len:int;mutablesegments:segments;mutablepos_bol:int;(* record the abs position of current beginning line *)mutablebyte_bol:int;mutablepos_lnum:int;(* record the line number *)}typeexn+=Errorofpos*pos*errorletpp_errorfmterr=Format.pp_print_stringfmt@@matcherrwith|Invalid_code_point->"Invalid code point"|Unterminated_backslash->"\\ ended unexpectedly"|Invalid_escape_codec->"Invalid escape code: "^String.make1c|Invalid_hex_escape->"Invalid \\x escape"|Invalid_unicode_escape->"Invalid \\u escape"|Unterminated_variable->"$ unterminated"|Unmatched_paren->"Unmatched paren"|Invalid_syntax_of_vars->"`"^s^"' is not a valid syntax of interpolated identifer"letvalid_lead_identifier_charx=matchxwith'a'..'z'|'_'->true|_->falseletvalid_identifier_charx=matchxwith|'a'..'z'|'A'..'Z'|'0'..'9'|'_'|'\''->true|_->false(* Invariant: [valid_lead_identifier] has to be [valid_identifier] *)letvalid_identifier=letfor_all_from=letrecunsafe_for_all_ranges~start~finishp=start>finish||p(String.unsafe_getsstart)&&unsafe_for_all_ranges~start:(start+1)~finishpinfunsstartp->letlen=String.lengthsinifstart<0theninvalid_arg"for_all_from"elseunsafe_for_all_ranges~start~finish:(len-1)pinfuns->lets_len=String.lengthsinifs_len=0thenfalseelsevalid_lead_identifier_chars.[0]&&for_all_froms1valid_identifier_char(* let is_space x =
match x with
| ' ' | '\n' | '\t' -> true
| _ -> false *)(*
FIXME: multiple line offset
if there is no line offset. Note {|{j||} border will never trigger a new line
*)letupdate_positionborder({lnum;offset;byte_bol}:pos)(pos:Lexing.position)=iflnum=0then{poswithpos_cnum=pos.pos_cnum+border+offset}(* When no newline, the column number is [border + offset] *)else{poswithpos_lnum=pos.pos_lnum+lnum;pos_bol=pos.pos_cnum+border+byte_bol;pos_cnum=pos.pos_cnum+border+byte_bol+offset;(* when newline, the column number is [offset] *)}letupdateborder(start:pos)(finish:pos)(loc:Location.t):Location.t=letstart_pos=loc.loc_startin{locwithloc_start=update_positionborderstartstart_pos;loc_end=update_positionborderfinishstart_pos;}letupdate_newline~byte_bolloccxt=cxt.pos_lnum<-cxt.pos_lnum+1;cxt.pos_bol<-loc;cxt.byte_bol<-byte_bolletpos_errorcxt~locerror=raise(Error(cxt.segment_start,{lnum=cxt.pos_lnum;offset=loc-cxt.pos_bol;byte_bol=cxt.byte_bol;},error))letadd_var_segmentcxtlocloffsetroffset=letcontent=Buffer.contentscxt.bufinBuffer.clearcxt.buf;letnext_loc={lnum=cxt.pos_lnum;offset=loc-cxt.pos_bol;byte_bol=cxt.byte_bol;}inifvalid_identifiercontentthen(cxt.segments<-{start=cxt.segment_start;finish=next_loc;kind=Var(loffset,roffset);content;}::cxt.segments;cxt.segment_start<-next_loc)elsepos_errorcxt~loc(Invalid_syntax_of_varcontent)letadd_str_segmentcxtloc=letcontent=Buffer.contentscxt.bufinBuffer.clearcxt.buf;letnext_loc={lnum=cxt.pos_lnum;offset=loc-cxt.pos_bol;byte_bol=cxt.byte_bol;}incxt.segments<-{start=cxt.segment_start;finish=next_loc;kind=String;content}::cxt.segments;cxt.segment_start<-next_locletreccheck_and_transform(loc:int)sbyte_offset({s_len;buf;_}ascxt:cxt)=ifbyte_offset=s_lenthenadd_str_segmentcxtlocelseletcurrent_char=s.[byte_offset]inmatchAst_utf8_string.classifycurrent_charwith|Single92(* '\\' *)->escape_code(loc+1)s(byte_offset+1)cxt|Single34->Buffer.add_stringbuf"\\\"";check_and_transform(loc+1)s(byte_offset+1)cxt|Single10->Buffer.add_stringbuf"\\n";letloc=loc+1inletbyte_offset=byte_offset+1inupdate_newline~byte_bol:byte_offsetloccxt;(* Note variable could not have new-line *)check_and_transformlocsbyte_offsetcxt|Single13->Buffer.add_stringbuf"\\r";check_and_transform(loc+1)s(byte_offset+1)cxt|Single36->(* $ *)add_str_segmentcxtloc;letoffset=byte_offset+1inifoffset>=s_lenthenpos_error~loccxtUnterminated_variableelseletcur_char=s.[offset]inifcur_char='('thenexpect_var_paren(loc+2)s(offset+1)cxtelseexpect_simple_var(loc+1)soffsetcxt|Single_->Buffer.add_charbufcurrent_char;check_and_transform(loc+1)s(byte_offset+1)cxt|Invalid|Cont_->pos_error~loccxtInvalid_code_point|Leading(n,_)->leti'=Ast_utf8_string.nexts~remaining:nbyte_offsetinifi'<0thenpos_errorcxt~locInvalid_code_pointelse(fork=byte_offsettoi'doBuffer.add_charbufs.[k]done;check_and_transform(loc+1)s(i'+1)cxt)(*Lets keep identifier simple, so that we could generating a function easier in the future
for example
let f = [%fn{| $x + $y = $x_add_y |}]
*)andexpect_simple_varlocsoffset({buf;s_len;_}ascxt)=letv=refoffsetinifnot(offset<s_len&&valid_lead_identifier_chars.[offset])thenpos_errorcxt~loc(Invalid_syntax_of_varString.empty)else(while!v<s_len&&valid_identifier_chars.[!v]do(* TODO*)letcur_char=s.[!v]inBuffer.add_charbufcur_char;incrvdone;letadded_length=!v-offsetinletloc=added_length+locinadd_var_segmentcxtloc10;check_and_transformlocs(added_length+offset)cxt)andexpect_var_parenlocsoffset({buf;s_len;_}ascxt)=letv=refoffsetinwhile!v<s_len&&s.[!v]<>')'doletcur_char=s.[!v]inBuffer.add_charbufcur_char;incrvdone;letadded_length=!v-offsetinletloc=added_length+1+locinif!v<s_len&&s.[!v]=')'then(add_var_segmentcxtloc2(-1);check_and_transformlocs(added_length+1+offset)cxt)elsepos_errorcxt~locUnmatched_paren(* we share the same escape sequence with js *)andescape_codelocsoffset({buf;s_len;_}ascxt)=ifoffset>=s_lenthenpos_errorcxt~locUnterminated_backslashelseBuffer.add_charbuf'\\';letcur_char=s.[offset]inmatchcur_charwith|'\\'|'b'|'t'|'n'|'v'|'f'|'r'|'0'|'$'->Buffer.add_charbufcur_char;check_and_transform(loc+1)s(offset+1)cxt|'u'->Buffer.add_charbufcur_char;unicode(loc+1)s(offset+1)cxt|'x'->Buffer.add_charbufcur_char;two_hex(loc+1)s(offset+1)cxt|_->pos_errorcxt~loc(Invalid_escape_codecur_char)andtwo_hexlocsoffset({buf;s_len;_}ascxt)=ifoffset+1>=s_lenthenpos_errorcxt~locInvalid_hex_escape;leta,b=(s.[offset],s.[offset+1])inifvalid_hexa&&valid_hexbthen(Buffer.add_charbufa;Buffer.add_charbufb;check_and_transform(loc+2)s(offset+2)cxt)elsepos_errorcxt~locInvalid_hex_escapeandunicodelocsoffset({buf;s_len;_}ascxt)=ifoffset+3>=s_lenthenpos_errorcxt~locInvalid_unicode_escape;leta0,a1,a2,a3=(s.[offset],s.[offset+1],s.[offset+2],s.[offset+3])inifvalid_hexa0&&valid_hexa1&&valid_hexa2&&valid_hexa3then(Buffer.add_charbufa0;Buffer.add_charbufa1;Buffer.add_charbufa2;Buffer.add_charbufa3;check_and_transform(loc+4)s(offset+4)cxt)elsepos_errorcxt~locInvalid_unicode_escape(* TODO: test empty var $() $ failure,
Allow identifers x.A.y *)openAst_helper(* Longident.parse "Pervasives.^" *)letconcat_ident:Longident.t=Ldot(Lident"Stdlib","^")(* FIXME: remove deps on `Pervasives` *)(* JS string concatMany *)(* Ldot (Ldot (Lident "Js", "String2"), "concat") *)(* Longident.parse "Js.String.make" *)letto_string_ident:Longident.t=Ldot(Ldot(Lident"Js","String2"),"make")letescaped_j_delimiter="*j"(* not user level syntax allowed *)letunescaped_j_delimiter="j"letunescaped_js_delimiter="js"letescaped=Someescaped_j_delimiterletborder=String.length"{j|"letauxloc(segment:segment)~to_string_ident:Parsetree.expression=matchsegmentwith|{start;finish;kind;content}->(matchkindwith|String->letloc=updateborderstartfinishlocinExp.constant(Pconst_string(content,loc,escaped))|Var(soffset,foffset)->letloc={locwithloc_start=update_position(soffset+border)startloc.loc_start;loc_end=update_position(foffset+border)finishloc.loc_start;}in[%expr[%eExp.ident~loc{loc;txt=to_string_ident}][%eExp.ident~loc{loc;txt=Lidentcontent}]])letconcat_expa_locx~(lhs:Parsetree.expression):Parsetree.expression=letloc=merge_loca_loclhs.pexp_locin[%expr[%eExp.ident{txt=concat_ident;loc}][%elhs][%eauxlocx~to_string_ident:(Longident.Ldot(Lident"Obj","magic"))]](* Invariant: the [lhs] is always of type string *)letrechandle_segmentsloc(rev_segments:segmentlist)=matchrev_segmentswith|[]->Exp.constant(Pconst_string("",loc,escaped))|[segment]->auxlocsegment~to_string_ident(* string literal *)|{content="";_}::rest->handle_segmentslocrest|a::rest->concat_exploca~lhs:(handle_segmentslocrest)lettransform_interplocs=lets_len=String.lengthsinletbuf=Buffer.create(s_len*2)inletcxt:cxt={segment_start={lnum=0;offset=0;byte_bol=0};buf;s_len;segments=[];pos_lnum=0;byte_bol=0;pos_bol=0;}incheck_and_transform0s0cxt;handle_segmentsloccxt.segmentslettransform_tests=lets_len=String.lengthsinletbuf=Buffer.create(s_len*2)inletcxt={segment_start={lnum=0;offset=0;byte_bol=0};buf;s_len;segments=[];pos_lnum=0;byte_bol=0;pos_bol=0;}incheck_and_transform0s0cxt;List.revcxt.segmentslettransform(e:Parsetree.expression)slocdelim:Parsetree.expression=ifString.equaldelimunescaped_js_delimiterthen{ewithpexp_desc=Pexp_constant(Pconst_string(Utf8_string.transforms,loc,escaped));}elseifString.equaldelimunescaped_j_delimiterthentransform_interpe.pexp_locselseelettransform(e:Parsetree.expression)slocdelim:Parsetree.expression=trytransformeslocdelimwith|Utf8_string.Error(offset,error)->[%expr[%ocaml.error[%eExp.constant(Pconst_string(Format.asprintf"Offset: %d, %a"offsetUtf8_string.pp_errorerror,loc,None))]]]|Error(start,pos,error)->letloc=updateborderstartposlocin[%expr[%ocaml.error[%eExp.constant(Pconst_string(Format.asprintf"%a"pp_errorerror,loc,None))]]]end