%% @author Bob Ippolito
%% @copyright 2007 Mochi Media, Inc.
%%
%% Permission is hereby granted, free of charge, to any person obtaining a
%% copy of this software and associated documentation files (the "Software"),
%% to deal in the Software without restriction, including without limitation
%% the rights to use, copy, modify, merge, publish, distribute, sublicense,
%% and/or sell copies of the Software, and to permit persons to whom the
%% Software is furnished to do so, subject to the following conditions:
%%
%% The above copyright notice and this permission notice shall be included in
%% all copies or substantial portions of the Software.
%%
%% THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
%% IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
%% FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
%% THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
%% LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
%% FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
%% DEALINGS IN THE SOFTWARE.

%% @doc Loosely tokenizes and generates parse trees for HTML 4.
-module(floki_mochi_html).
-export([
    tokens/1,
    parse/2
]).

-ifdef(TEST).
-export([destack/1, destack/2, is_singleton/1]).
-endif.

%% This is a macro to placate syntax highlighters.
%% $\"
-define(QUOTE, $\").
%% $\'
-define(SQUOTE, $\').
-define(ADV_COL(S, N), S#decoder{
    column = N + S#decoder.column,
    offset = N + S#decoder.offset
}).
-define(INC_COL(S), S#decoder{
    column = 1 + S#decoder.column,
    offset = 1 + S#decoder.offset
}).
-define(INC_LINE(S), S#decoder{
    column = 1,
    line = 1 + S#decoder.line,
    offset = 1 + S#decoder.offset
}).
-define(INC_CHAR(S, C),
    case C of
        $\n ->
            S#decoder{
                column = 1,
                line = 1 + S#decoder.line,
                offset = 1 + S#decoder.offset
            };
        _ ->
            S#decoder{
                column = 1 + S#decoder.column,
                offset = 1 + S#decoder.offset
            }
    end
).
-define(IS_WHITESPACE(C),
    (C =:= $\s orelse C =:= $\t orelse C =:= $\r orelse C =:= $\n)
).
-define(IS_LETTER(C),
    ((C >= $A andalso C =< $Z) orelse (C >= $a andalso C =< $z))
).
-define(IS_LITERAL_SAFE(C),
    ((C >= $A andalso C =< $Z) orelse
        (C >= $a andalso C =< $z) orelse
        (C >= $0 andalso C =< $9))
).
-define(PROBABLE_CLOSE(C),
    (C =:= $> orelse ?IS_WHITESPACE(C))
).

-record(decoder, {
    line = 1,
    column = 1,
    offset = 0
}).

%% @type html_node() = {string(), [html_attr()], [html_node() | string()]}
%% @type html_attr() = {string(), string()}
%% @type html_token() = html_data() | start_tag() | end_tag() | inline_html() | html_comment() | html_doctype()
%% @type html_data() = {data, string(), Whitespace::boolean()}
%% @type start_tag() = {start_tag, Name, [html_attr()], Singleton::boolean()}
%% @type end_tag() = {end_tag, Name}
%% @type html_comment() = {comment, Comment}
%% @type html_doctype() = {doctype, [Doctype]}
%% @type inline_html() = {'=', iolist()}

%% External API.

%% @spec parse(string() | binary(), list()) -> html_node()
%% @doc tokenize and then transform the token stream into an HTML tree.
%%
%% The following option is supported:
%%
%% <dl>
%% <dt>
%%   `attributes_as_maps`
%% </dt>
%%
%% <dd>
%%   When `true`, it configures the parser to use maps for the attributes.
%%   It is `false` by default, which means attributes are going to be
%%   represented as a list of tuples.
%% </dd>
%% </dl>
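%%
%% A rough usage sketch (added for illustration, not part of the original
%% docs). It assumes the option is passed as `{attributes_as_maps, true}` in
%% `Opts`, and that tag names, attribute names and text come back as
%% binaries, which is what the tokenizer below produces:
%% ```
%%    parse(<<"<html><p class=\"a\">Hi</p></html>">>, [{attributes_as_maps, true}])
%%    %% => {<<"html">>, #{}, [{<<"p">>, #{<<"class">> => <<"a">>}, [<<"Hi">>]}]}
%% '''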
parse(Input, Opts) ->
    parse_tokens(tokens(Input), Opts).

%% @spec parse_tokens([html_token()], list()) -> html_node()
%% @doc Transform the output of tokens(Doc) into an HTML tree.
parse_tokens(Tokens, Opts) when is_list(Tokens) andalso is_list(Opts) ->
    %% Skip over doctype, processing instructions
    [{start_tag, Tag, Attrs, false} | Rest] = find_document(Tokens, normal),
    {Tree, _} = tree(Rest, [norm({Tag, Attrs}, Opts)], Opts),
    Tree.

%% Skip ahead to the first start_tag, remembering whether an HTML5 doctype
%% was seen on the way; maybe_add_html_tag/2 then synthesizes an <html>
%% start tag for HTML5 documents whose first element is not <html>.
find_document(Tokens = [{start_tag, _Tag, _Attrs, false} | _Rest], Mode) ->
    maybe_add_html_tag(Tokens, Mode);
find_document([{doctype, [<<"html">>]} | Rest], _Mode) ->
    find_document(Rest, html5);
find_document([_T | Rest], Mode) ->
    find_document(Rest, Mode);
find_document([], _Mode) ->
    [].

maybe_add_html_tag(Tokens = [{start_tag, Tag, _Attrs, false} | _], html5) when
    Tag =/= <<"html">>
->
    [{start_tag, <<"html">>, [], false} | Tokens];
maybe_add_html_tag(Tokens, _Mode) ->
    Tokens.

%% @spec tokens(StringOrBinary) -> [html_token()]
%% @doc Transform the input UTF-8 HTML into a token stream.
tokens(Input) ->
    tokens(iolist_to_binary(Input), #decoder{}, []).

tokens(B, S = #decoder{offset = O}, Acc) ->
    case B of
        <<_:O/binary>> ->
            lists:reverse(Acc);
        _ ->
            {Tag, S1} = tokenize(B, S),
            case parse_flag(Tag) of
                script ->
                    {Tag2, S2} = tokenize_script(B, S1),
                    tokens(B, S2, [Tag2, Tag | Acc]);
                style ->
                    {Tag2, S2} = tokenize_style(B, S1),
                    tokens(B, S2, [Tag2, Tag | Acc]);
                title ->
                    {Tag2, S2} = tokenize_title(B, S1),
                    tokens(B, S2, [Tag2, Tag | Acc]);
                textarea ->
                    {Tag2, S2} = tokenize_textarea(B, S1),
                    tokens(B, S2, [Tag2, Tag | Acc]);
                none ->
                    tokens(B, S1, [Tag | Acc])
            end
    end.

parse_flag({start_tag, B, _, false}) ->
    case B of
        <<"script">> -> script;
        <<"style">> -> style;
        <<"title">> -> title;
        <<"textarea">> -> textarea;
        _ -> none
    end;
parse_flag(_) ->
    none.

tokenize_comment(Bin, S = #decoder{offset = O}) ->
    tokenize_comment(Bin, S, O).

tokenize_comment(Bin, S = #decoder{offset = O}, Start) ->
    case Bin of
        <<_:O/binary, "-->", _/binary>> ->
            Len = O - Start,
            <<_:Start/binary, Raw:Len/binary, _/binary>> = Bin,
            {{comment, Raw}, ?ADV_COL(S, 3)};
        <<_:O/binary, C, _/binary>> ->
            tokenize_comment(Bin, ?INC_CHAR(S, C), Start);
        <<_:Start/binary, Raw/binary>> ->
            {{comment, Raw}, S}
    end.

tokenize_script(Bin, S = #decoder{offset = O}) ->
    tokenize_script(Bin, S, O).

tokenize_script(Bin, S = #decoder{offset = O}, Start) ->
    case Bin of
        %% Just a look-ahead, we want the end_tag separately
        <<_:O/binary, $<, $/, SS, CC, RR, II, PP, TT, ZZ, _/binary>> when
            (SS =:= $s orelse SS =:= $S) andalso
                (CC =:= $c orelse CC =:= $C) andalso
                (RR =:= $r orelse RR =:= $R) andalso
                (II =:= $i orelse II =:= $I) andalso
                (PP =:= $p orelse PP =:= $P) andalso
                (TT =:= $t orelse TT =:= $T) andalso
                ?PROBABLE_CLOSE(ZZ)
        ->
            Len = O - Start,
            <<_:Start/binary, Raw:Len/binary, _/binary>> = Bin,
            {{data, Raw, false}, S};
        <<_:O/binary, C, _/binary>> ->
            tokenize_script(Bin, ?INC_CHAR(S, C), Start);
        <<_:Start/binary, Raw/binary>> ->
            {{data, Raw, false}, S}
    end.

tokenize_style(Bin, S = #decoder{offset = O}) ->
    tokenize_style(Bin, S, O).
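
%% Like tokenize_script/3 above, the raw-text tokenizers that follow
%% (tokenize_style/3, tokenize_title/3, tokenize_textarea/3) advance one
%% character at a time until they see a case-insensitive "</style",
%% "</title" or "</textarea" followed by whitespace or `>` (?PROBABLE_CLOSE),
%% then return everything scanned so far as a single {data, Raw, false}
%% token. The closing tag itself is left in the input so that tokens/3 emits
%% it as an ordinary end_tag token.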
tokenize_style(Bin, S = #decoder{offset = O}, Start) ->
    case Bin of
        %% Just a look-ahead, we want the end_tag separately
        <<_:O/binary, $<, $/, SS, TT, YY, LL, EE, ZZ, _/binary>> when
            (SS =:= $s orelse SS =:= $S) andalso
                (TT =:= $t orelse TT =:= $T) andalso
                (YY =:= $y orelse YY =:= $Y) andalso
                (LL =:= $l orelse LL =:= $L) andalso
                (EE =:= $e orelse EE =:= $E) andalso
                ?PROBABLE_CLOSE(ZZ)
        ->
            Len = O - Start,
            <<_:Start/binary, Raw:Len/binary, _/binary>> = Bin,
            {{data, Raw, false}, S};
        <<_:O/binary, C, _/binary>> ->
            tokenize_style(Bin, ?INC_CHAR(S, C), Start);
        <<_:Start/binary, Raw/binary>> ->
            {{data, Raw, false}, S}
    end.

tokenize_title(Bin, S = #decoder{offset = O}) ->
    tokenize_title(Bin, S, O).

tokenize_title(Bin, S = #decoder{offset = O}, Start) ->
    case Bin of
        %% Just a look-ahead, we want the end_tag separately
        <<_:O/binary, $<, $/, TT, II, TT2, LL, EE, ZZ, _/binary>> when
            (TT =:= $t orelse TT =:= $T) andalso
                (II =:= $i orelse II =:= $I) andalso
                (TT2 =:= $t orelse TT2 =:= $T) andalso
                (LL =:= $l orelse LL =:= $L) andalso
                (EE =:= $e orelse EE =:= $E) andalso
                ?PROBABLE_CLOSE(ZZ)
        ->
            Len = O - Start,
            <<_:Start/binary, Raw:Len/binary, _/binary>> = Bin,
            {{data, Raw, false}, S};
        <<_:O/binary, C, _/binary>> ->
            tokenize_title(Bin, ?INC_CHAR(S, C), Start);
        <<_:Start/binary, Raw/binary>> ->
            {{data, Raw, false}, S}
    end.

tokenize_textarea(Bin, S = #decoder{offset = O}) ->
    tokenize_textarea(Bin, S, O).

tokenize_textarea(Bin, S = #decoder{offset = O}, Start) ->
    case Bin of
        %% Just a look-ahead, we want the end_tag separately
        <<_:O/binary, $<, $/, TT, EE, XX, TT2, AA, RR, EE2, AA2, ZZ, _/binary>> when
            (TT =:= $t orelse TT =:= $T) andalso
                (EE =:= $e orelse EE =:= $E) andalso
                (XX =:= $x orelse XX =:= $X) andalso
                (TT2 =:= $t orelse TT2 =:= $T) andalso
                (AA =:= $a orelse AA =:= $A) andalso
                (RR =:= $r orelse RR =:= $R) andalso
                (EE2 =:= $e orelse EE2 =:= $E) andalso
                (AA2 =:= $a orelse AA2 =:= $A) andalso
                ?PROBABLE_CLOSE(ZZ)
        ->
            Len = O - Start,
            <<_:Start/binary, Raw:Len/binary, _/binary>> = Bin,
            {{data, Raw, false}, S};
        <<_:O/binary, C, _/binary>> ->
            tokenize_textarea(Bin, ?INC_CHAR(S, C), Start);
        <<_:Start/binary, Raw/binary>> ->
            {{data, Raw, false}, S}
    end.
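
%% Illustrative example (added for clarity; values shown as binaries, which
%% is what the tokenizer produces): because of the raw-text handling above,
%% a script body is returned as one opaque data token and its closing tag as
%% a separate end_tag, e.g.
%%
%%   tokens(<<"<script>if (1 < 2) go();</script>">>)
%%   %% => [{start_tag, <<"script">>, [], false},
%%   %%     {data, <<"if (1 < 2) go();">>, false},
%%   %%     {end_tag, <<"script">>}]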