api-v2/deps/nimble_parsec/lib/nimble_parsec.ex
2025-04-16 10:03:13 -03:00

2127 lines
65 KiB
Elixir

defmodule NimbleParsec do
# README.md is read while this module is compiled, so it is registered as an
# external resource: the compiler then recompiles the module (and refreshes
# the docs) whenever the README changes.
@external_resource "README.md"

# The module documentation is the README section found right after the first
# `<!-- MDOC !-->` marker (element 1 of the split).
@moduledoc "README.md"
           |> File.read!()
           |> String.split("<!-- MDOC !-->")
           |> Enum.fetch!(1)
# Guard-safe check used in the guards of this module: a combinator is
# represented as a list, so this expands to a plain `is_list/1` call.
defmacrop is_combinator(combinator), do: quote(do: is_list(unquote(combinator)))
@doc """
Defines a parser (and a combinator) with the given `name` and `opts`.
The parser is a function that receives two arguments, the binary
to be parsed and a set of options. You can consult the documentation
of the generated parser function for more information.
This function will also define a combinator that can be used as
`parsec(name)` when building other parsers. See `parsec/2` for
more information on invoking compiled combinators.
## Beware!
`defparsec/3` is executed during compilation. This means you can't
invoke a function defined in the same module. The following will error
because the `date` function has not yet been defined:
defmodule MyParser do
import NimbleParsec
def date do
integer(4)
|> ignore(string("-"))
|> integer(2)
|> ignore(string("-"))
|> integer(2)
end
defparsec :date, date()
end
This can be solved in different ways. You may simply
compose a long parser using variables. For example:
defmodule MyParser do
import NimbleParsec
date =
integer(4)
|> ignore(string("-"))
|> integer(2)
|> ignore(string("-"))
|> integer(2)
defparsec :date, date
end
Alternatively, you may define a `Helpers` module with many
convenience combinators, and then invoke them in your parser
module:
defmodule MyParser.Helpers do
import NimbleParsec
def date do
integer(4)
|> ignore(string("-"))
|> integer(2)
|> ignore(string("-"))
|> integer(2)
end
end
defmodule MyParser do
import NimbleParsec
import MyParser.Helpers
defparsec :date, date()
end
The approach of using helper modules is the favorite way
of composing parsers in `NimbleParsec`.
## Options
* `:inline` - when true, inlines clauses that work as redirection for
other clauses. Setting this may improve runtime performance at the
cost of increased compilation time and bytecode size
* `:debug` - when true, writes generated clauses to `:stderr` for debugging
* `:export_combinator` - make the underlying combinator function public
so it can be used as part of `parsec/1` from other modules
* `:export_metadata` - export metadata necessary to use this parser
combinator to generate inputs
"""
defmacro defparsec(name, combinator, opts \\ []) do
  # Whether the combinator is public depends on `opts`, whose value is only
  # available once the generated code runs in the caller. The decision is
  # therefore passed along as quoted code that reads the `opts` variable
  # bound by `compile/5`.
  combinator_kind = quote(do: if(opts[:export_combinator], do: :def, else: :defp))

  compile(:def, combinator_kind, name, combinator, opts)
end
@doc """
Defines a private parser (and a combinator) with the given `name` and `opts`.

It works like `defparsec/3`, except that the generated parsing function
is private.
"""
defmacro defparsecp(name, combinator, opts \\ []),
  do: compile(:defp, :defp, name, combinator, opts)
@doc """
Defines a combinator with the given `name` and `opts`.

Unlike `defparsec/3`, no entry-point parsing function is defined: only
the combinator function is generated, and it is meant to be used
through `parsec/2`.
"""
defmacro defcombinator(name, combinator, opts \\ []),
  do: compile(nil, :def, name, combinator, opts)
@doc """
Defines a combinator with the given `name` and `opts`.

Unlike `defparsecp/3`, no entry-point parsing function is defined: only
the combinator function is generated, and it is meant to be used
through `parsec/2`.
"""
defmacro defcombinatorp(name, combinator, opts \\ []),
  do: compile(nil, :defp, name, combinator, opts)
# Builds the quoted code shared by `defparsec/3`, `defparsecp/3`,
# `defcombinator/3` and `defcombinatorp/3`.
#
#   * `parser_kind` - `:def`, `:defp` or `nil` (no entry-point parser)
#   * `combinator_kind` - `:def`, `:defp`, or quoted code evaluating to one
#     of them (this is what `defparsec/3` passes)
#   * `name`, `combinator`, `opts` - the quoted macro arguments
#
# The returned AST runs in the caller module, in this order: prelude,
# entry-point parser, combinator definitions.
defp compile(parser_kind, combinator_kind, name, combinator, opts) do
# Binds `opts` and `combinator_kind` as variables in the generated code so
# the quoted blocks below (and the quoted `if` from `defparsec/3`) can read them.
prelude =
quote do
opts = unquote(opts)
combinator_kind = unquote(combinator_kind)
end
# `bind_quoted` evaluates the given values once and disables unquoting at
# macro-expansion time, so the `unquote` calls inside this block are
# unquote fragments, resolved when the generated code itself runs.
combinator =
quote bind_quoted: [
parser_kind: parser_kind,
name: name,
combinator: combinator
] do
{defs, inline} = NimbleParsec.Compiler.compile(name, combinator, opts)
NimbleParsec.Recorder.record(
__MODULE__,
parser_kind,
combinator_kind,
name,
defs,
inline,
opts
)
# With `:export_metadata`, expose the combinator (reversed and escaped) via
# `__nimble_parsec__/1`; `generate/1` reads it for remote parsecs.
if opts[:export_metadata] do
def __nimble_parsec__(unquote(name)),
do: unquote(combinator |> Enum.reverse() |> Macro.escape())
end
if inline != [] do
@compile {:inline, inline}
end
# Emit every compiled clause as public or private depending on `combinator_kind`.
if combinator_kind == :def do
for {name, args, guards, body} <- defs do
def unquote(name)(unquote_splicing(args)) when unquote(guards), do: unquote(body)
end
else
for {name, args, guards, body} <- defs do
defp unquote(name)(unquote_splicing(args)) when unquote(guards), do: unquote(body)
end
end
end
# Quoted entry-point parser, or `:ok` when `parser_kind` is nil.
parser = compile_parser(name, parser_kind)
quote do
unquote(prelude)
unquote(parser)
unquote(combinator)
end
end
# Returns the quoted definition of the entry-point parsing function for
# `name`, according to the requested kind.

# No entry point requested (used by the defcombinator macros): inject a no-op.
defp compile_parser(_name, nil) do
:ok
end
# Public entry point: the doc, spec and clause come from
# `NimbleParsec.Compiler.entry_point/1`.
defp compile_parser(name, :def) do
quote bind_quoted: [name: name] do
{doc, spec, {name, args, guards, body}} = NimbleParsec.Compiler.entry_point(name)
# Only set the generated doc when the module has no pending `@doc`,
# so a doc written by the user before the macro call is kept.
Module.get_attribute(__MODULE__, :doc) || @doc doc
@spec unquote(spec)
def unquote(name)(unquote_splicing(args)) when unquote(guards), do: unquote(body)
end
end
# Private entry point: same as above, but the generated doc is discarded
# and the function is defined with `defp`.
defp compile_parser(name, :defp) do
quote bind_quoted: [name: name] do
{_doc, spec, {name, args, guards, body}} = NimbleParsec.Compiler.entry_point(name)
@spec unquote(spec)
defp unquote(name)(unquote_splicing(args)) when unquote(guards), do: unquote(body)
end
end
# A combinator is a list of the private `combinator` entries defined below.
# The functions in this module build it by prepending new entries.
@opaque t :: [combinator]
# Modifier attached to a `:bin_segment` entry.
@type bin_modifier :: :integer | :utf8 | :utf16 | :utf32
# Ranges accepted by `ascii_char/2` and `utf8_char/2`: either supported
# (inclusive) or not supported (exclusive, tagged with `:not`).
@type range :: inclusive_range | exclusive_range
@type inclusive_range :: Range.t() | char
@type exclusive_range :: {:not, Range.t()} | {:not, char}
# Length options accepted by `integer/2`, `ascii_string/3` and `utf8_string/3`.
@type min_and_max :: {:min, non_neg_integer} | {:max, pos_integer}
# A call given to the traverse functions: remote `{module, function, args}`,
# local `{function, args}`, or a bare function name.
@type call :: mfargs | fargs | atom
@type mfargs :: {module, atom, args :: [term]}
@type fargs :: {atom, args :: [term]}
# Hints read by `generate/1`: how many times to repeat, and choice weights.
@type gen_times :: Range.t() | non_neg_integer | nil
@type gen_weights :: [pos_integer] | nil
@type opts :: Keyword.t()
# Steps to add a new combinator:
#
# 1. Update the combinator type below
# 2. Update the compiler with combinator
# 3. Update the compiler with label step
#
@typep combinator :: bound_combinator | maybe_bound_combinator | unbound_combinator
@typep bound_combinator ::
{:bin_segment, [inclusive_range], [exclusive_range], bin_modifier}
| {:string, binary}
| {:bytes, pos_integer}
| :eos
@typep maybe_bound_combinator ::
{:label, t, binary}
| {:traverse, t, :pre | :post | :constant, [mfargs]}
@typep unbound_combinator ::
{:choice, [t], gen_weights}
| {:eventually, t}
| {:lookahead, t, :positive | :negative}
| {:parsec, atom | {module, atom}}
| {:repeat, t, mfargs, gen_times}
| {:times, t, pos_integer}
@doc ~S"""
Generate a random binary from the given parsec.
Let's see an example:
import NimbleParsec
generate(choice([string("foo"), string("bar")]))
The command above will return either "foo" or "bar". `generate/1`
is often used with pre-defined parsecs. In this case, the
`:export_metadata` flag must be set:
defmodule SomeModule do
import NimbleParsec
defparsec :parse,
choice([string("foo"), string("bar")]),
export_metadata: true
end
# Reference the parsec and generate from it
NimbleParsec.parsec({SomeModule, :parse})
|> NimbleParsec.generate()
|> IO.puts()
`generate/1` can often run forever for recursive algorithms.
Read the notes below and make use of the `:gen_weights` and `:gen_times`
options of certain parsecs to control the recursion depth.
## Notes
Overall, there is no guarantee over the generated output, except
that it will generate a binary that is parseable by the parsec
itself, but even this guarantee may be broken by parsers that have
custom validations. Keep in mind the following:
* `generate/1` is not compatible with NimbleParsecs dumped via
`mix nimble_parsec.compile`;
* `parsec/2` requires the referenced parsec to set `export_metadata: true`
on its definition;
* `choice/2` will be generated evenly. You can pass `:gen_weights`
as a list of positive integer weights to balance your choices.
This is particularly important for recursive algorithms;
* `repeat/2` and `repeat_while/3` will repeat between 0 and 3 times unless
a `:gen_times` option is given to these operations. `times/3` without a `:max`
will also additionally repeat between 0 and 3 times unless `:gen_times` is given.
The `:gen_times` option can either be an integer as the number of times to
repeat or a range where a random value in the range will be picked;
* `eventually/2` always generates the eventually parsec immediately;
* `lookahead/2` and `lookahead_not/2` are simply discarded;
* Validations done in any of the traverse definitions are not taken into account
by the generator. Therefore, if a parsec does validations, the generator may
generate binaries invalid to said parsec;
"""
def generate(parsecs) do
  # Combinators are built by prepending, so they are put back in parse
  # order before being walked by `generate/3`.
  in_parse_order = Enum.reverse(parsecs)

  # There is no enclosing module at the top level, hence `nil`.
  iodata = generate(in_parse_order, nil, [])

  IO.iodata_to_binary(iodata)
end
# Walks the combinator entries (in parse order) and accumulates random
# iodata for each of them.
#
#   * first argument - the remaining entries to generate from
#   * `mod` - the module local `parsec(name)` references are resolved
#     against, or `nil` at the top level
#   * `acc` - generated chunks, most recent first
#
# Returns the generated chunks in parse order as iodata.

# A local parsec cannot be resolved without knowing its module.
defp generate([{:parsec, fun} | _parsecs], nil, _acc) when is_atom(fun) do
  raise "cannot generate parsec(#{inspect(fun)}), use a remote parsec instead"
end

# Inside a remote parsec, local references resolve to the current module.
defp generate([{:parsec, fun} | parsecs], mod, acc) when is_atom(fun) do
  generate([{:parsec, {mod, fun}} | parsecs], mod, acc)
end

# Remote parsec: generate from its exported metadata, using its own module
# for nested local references, then continue with the outer module.
defp generate([{:parsec, {mod, fun}} | outer_parsecs], outer_mod, acc) do
  gen = generate(gen_export(mod, fun), mod, [])
  generate(outer_parsecs, outer_mod, [gen | acc])
end

defp generate([{:string, string} | parsecs], mod, acc) do
  generate(parsecs, mod, [string | acc])
end

defp generate([{:bin_segment, inclusive, exclusive, modifier} | parsecs], mod, acc) do
  gen = gen_bin_segment(inclusive, exclusive)

  # `:integer` segments are emitted as the raw integer (valid iodata);
  # anything else is encoded with the segment's unicode modifier.
  gen =
    if modifier == :integer,
      do: gen,
      else: :unicode.characters_to_binary([gen], :unicode, modifier)

  generate(parsecs, mod, [gen | acc])
end

# Nothing can be generated after the end of string.
defp generate([:eos | parsecs], mod, acc) do
  if parsecs == [] do
    generate([], mod, acc)
  else
    raise ArgumentError, "found :eos not at the end of parsecs"
  end
end

# Traversals and labels are transparent: generate from the wrapped entries.
defp generate([{:traverse, t, _, _} | parsecs], mod, acc) do
  generate(t ++ parsecs, mod, acc)
end

defp generate([{:label, t, _} | parsecs], mod, acc) do
  generate(t ++ parsecs, mod, acc)
end

# `eventually` generates the wrapped parsec immediately. Without this
# clause, any combinator containing `{:eventually, t}` raised a
# FunctionClauseError during generation.
# NOTE(review): assumes the wrapped combinator is stored in parse order,
# like `:label` and `:lookahead` — confirm against `eventually/2`.
defp generate([{:eventually, t} | parsecs], mod, acc) do
  generate(t ++ parsecs, mod, acc)
end

defp generate([{:choice, choices, weights} | parsecs], mod, acc) do
  pick = if weights, do: weighted_random(choices, weights), else: list_random(choices)
  gen = generate(pick, mod, [])
  generate(parsecs, mod, [gen | acc])
end

# Lookaheads consume no input, so they are discarded.
defp generate([{:lookahead, _, _} | parsecs], mod, acc) do
  generate(parsecs, mod, acc)
end

defp generate([{:repeat, t, _, gen} | parsecs], mod, acc) do
  generate(parsecs, mod, gen_times(t, int_random(gen), mod, acc))
end

defp generate([{:times, t, max} | parsecs], mod, acc) do
  generate(parsecs, mod, gen_times(t, Enum.random(0..max), mod, acc))
end

defp generate([{:bytes, count} | parsecs], mod, acc) do
  bytes = bytes_random(count)
  generate(parsecs, mod, [bytes | acc])
end

defp generate([], _mod, acc), do: Enum.reverse(acc)
# Fetches the combinator exported by `mod` for the parsec `fun` through
# `mod.__nimble_parsec__/1`. Raises when the module cannot be loaded or
# when the metadata call fails.
defp gen_export(mod, fun) do
  if not Code.ensure_loaded?(mod) do
    raise "cannot handle parsec(#{inspect({mod, fun})}) because #{inspect(mod)} is not available"
  end

  try do
    mod.__nimble_parsec__(fun)
  rescue
    # Any failure here is reported as missing metadata.
    _ ->
      raise "cannot handle parsec(#{inspect({mod, fun})}) because #{inspect(mod)} " <>
              "did not set :export_metadata when defining #{fun}"
  end
end
# Generates from `t` exactly `n` times, prepending each result to `acc`.
defp gen_times(_t, 0, _mod, acc), do: acc

defp gen_times(t, n, mod, acc),
  do: gen_times(t, n - 1, mod, [generate(t, mod, []) | acc])
# Picks a random integer allowed by `inclusive` (any byte value when no
# inclusive range is given) and retries while the pick is listed in `exclusive`.
defp gen_bin_segment(inclusive, exclusive) do
  candidates = if inclusive == [], do: [0..255], else: inclusive
  gen = candidates |> list_random() |> int_random()

  case Enum.any?(exclusive, &exclude_bin_segment?(&1, gen)) do
    true -> gen_bin_segment(inclusive, exclusive)
    false -> gen
  end
end
# Tells whether `candidate` is rejected by a single `{:not, _}` entry, which
# holds either a range or a single integer.
defp exclude_bin_segment?({:not, _.._//_ = excluded}, candidate),
  do: candidate in excluded

defp exclude_bin_segment?({:not, excluded}, candidate) when is_integer(excluded),
  do: excluded == candidate
# Resolves a value to a concrete integer: `nil` means a random value
# between 0 and 3, a range means a random member, an integer is used as is.
defp int_random(nil), do: Enum.random(0..3)
defp int_random(_.._//_ = choices), do: Enum.random(choices)
defp int_random(fixed) when is_integer(fixed), do: fixed
# Picks a random element of `list`. `Enum.random/1` uses reservoir sampling,
# but the lists handled here are short, so a random index is drawn and
# fetched directly instead.
defp list_random(list) when is_list(list) do
  index = :rand.uniform(length(list)) - 1
  Enum.fetch!(list, index)
end
# Picks an element of `list` with a probability proportional to the weight
# found at the same position in `weights`.
defp weighted_random(list, weights) do
  total = Enum.sum(weights)
  weighted_random(list, weights, :rand.uniform(total))
end

# Walks both lists in lockstep, subtracting each weight from `chosen`
# until the draw falls within the current element's weight.
defp weighted_random([elem | rest], [weight | weights], chosen) do
  if chosen <= weight do
    elem
  else
    weighted_random(rest, weights, chosen - weight)
  end
end
# Returns `count` random bytes as a binary.
defp bytes_random(count) when is_integer(count),
  do: :crypto.strong_rand_bytes(count)
@doc ~S"""
Returns an empty combinator.

It is the starting point other combinators are chained onto.
An empty combinator cannot be compiled on its own.
"""
@spec empty() :: t
def empty(), do: []
@doc """
Invokes an already compiled combinator with name `name` in the
same module.
Every parser defined via `defparsec/3` or `defparsecp/3` can be
used as combinator. However, the `defparsec/3` and `defparsecp/3`
functions also define an entry-point parsing function, as implied
by their names. If you want to define a combinator with the sole
purpose of using it in combinator, use `defcombinatorp/3` instead.
## Use cases
`parsec/2` is useful to implement recursive definitions.
Note, while `parsec/2` can be used to compose smaller combinators,
the preferred mechanism for doing composition is via regular functions
and not via `parsec/2`. Let's see a practical example. Imagine
that you have this module:
defmodule MyParser do
import NimbleParsec
date =
integer(4)
|> ignore(string("-"))
|> integer(2)
|> ignore(string("-"))
|> integer(2)
time =
integer(2)
|> ignore(string(":"))
|> integer(2)
|> ignore(string(":"))
|> integer(2)
|> optional(string("Z"))
defparsec :datetime, date |> ignore(string("T")) |> concat(time), debug: true
end
Now imagine that you want to break `date` and `time` apart
into helper functions, as you use them in other occasions.
Generally speaking, you should **NOT** do this:
defmodule MyParser do
import NimbleParsec
defcombinatorp :date,
integer(4)
|> ignore(string("-"))
|> integer(2)
|> ignore(string("-"))
|> integer(2)
defcombinatorp :time,
integer(2)
|> ignore(string(":"))
|> integer(2)
|> ignore(string(":"))
|> integer(2)
|> optional(string("Z"))
defparsec :datetime,
parsec(:date) |> ignore(string("T")) |> concat(parsec(:time))
end
The reason why the above is not recommended is because each
`parsec/2` combinator ends-up adding a stacktrace entry during
parsing, which affects the ability of `NimbleParsec` to optimize
code. If the goal is to compose combinators, you can do so
with modules and functions:
defmodule MyParser.Helpers do
import NimbleParsec
def date do
integer(4)
|> ignore(string("-"))
|> integer(2)
|> ignore(string("-"))
|> integer(2)
end
def time do
integer(2)
|> ignore(string(":"))
|> integer(2)
|> ignore(string(":"))
|> integer(2)
|> optional(string("Z"))
end
end
defmodule MyParser do
import NimbleParsec
import MyParser.Helpers
defparsec :datetime,
date() |> ignore(string("T")) |> concat(time())
end
The implementation above will be able to compile to the most
efficient format as possible without forcing new stacktrace
entries.
The only situation where you should use `parsec/2` for composition
is when a large parser is used over and over again in a way
compilation times are high. In this sense, you can use `parsec/2`
to improve compilation time at the cost of runtime performance.
By using `parsec/2`, the tree size built at compile time will be
reduced although runtime performance is degraded as `parsec`
introduces a stacktrace entry.
## Remote combinators
You can also reference combinators in other modules by passing
a tuple with the module name and a function to `parsec/2` as follows:
defmodule RemoteCombinatorModule do
defcombinator :upcase_unicode, utf8_char([...long, list, of, unicode, chars...])
end
defmodule LocalModule do
# Parsec that depends on `:upcase_unicode`
defparsec :parsec_name,
...
|> ascii_char([?a..?z])
|> parsec({RemoteCombinatorModule, :upcase_unicode})
end
Remote combinators are useful when breaking the compilation of
large modules apart in order to use Elixir's ability to compile
modules in parallel.
## Examples
A good example of using `parsec` is with recursive parsers.
A limited but recursive XML parser could be written as follows:
defmodule SimpleXML do
import NimbleParsec
tag = ascii_string([?a..?z, ?A..?Z], min: 1)
text = ascii_string([not: ?<], min: 1)
opening_tag =
ignore(string("<"))
|> concat(tag)
|> ignore(string(">"))
closing_tag =
ignore(string("</"))
|> concat(tag)
|> ignore(string(">"))
defparsec :xml,
opening_tag
|> repeat(lookahead_not(string("</")) |> choice([parsec(:xml), text]))
|> concat(closing_tag)
|> wrap()
end
SimpleXML.xml("<foo>bar</foo>")
#=> {:ok, [["foo", "bar", "foo"]], "", %{}, {1, 0}, 14}
In the example above, `defparsec/3` has defined the entry-point
parsing function as well as a combinator which we have invoked
with `parsec(:xml)`.
In many cases, however, you want to define recursive combinators
without the entry-point parsing function. We can do this by
replacing `defparsec/3` by `defcombinatorp`:
defcombinatorp :xml,
opening_tag
|> repeat(lookahead_not(string("</")) |> choice([parsec(:xml), text]))
|> concat(closing_tag)
|> wrap()
When using `defcombinatorp`, you can no longer invoke
`SimpleXML.xml(xml)` as there is no associated parsing function.
You can only access the combinator above via `parsec/2`.
"""
@spec parsec(name :: atom) :: t
@spec parsec(t, name :: atom) :: t
@spec parsec({module, function_name :: atom}) :: t
@spec parsec(t, {module, function_name :: atom}) :: t
def parsec(combinator \\ empty(), name)

# Local combinator: only its name is recorded.
def parsec(combinator, name) when is_combinator(combinator) and is_atom(name),
  do: [{:parsec, name} | combinator]

# Remote combinator: the `{module, function}` pair is recorded as given.
def parsec(combinator, {module, function} = remote)
    when is_combinator(combinator) and is_atom(module) and is_atom(function),
    do: [{:parsec, remote} | combinator]
@doc ~S"""
Defines a single ASCII codepoint in the given ranges.
`ranges` is a list containing one of:
* a `min..max` range expressing supported codepoints
* a `codepoint` integer expressing a supported codepoint
* `{:not, min..max}` expressing not supported codepoints
* `{:not, codepoint}` expressing a not supported codepoint
## Examples
defmodule MyParser do
import NimbleParsec
defparsec :digit_and_lowercase,
empty()
|> ascii_char([?0..?9])
|> ascii_char([?a..?z])
end
MyParser.digit_and_lowercase("1a")
#=> {:ok, [?1, ?a], "", %{}, {1, 0}, 2}
MyParser.digit_and_lowercase("a1")
#=> {:error, "expected ASCII character in the range '0' to '9', followed by ASCII character in the range 'a' to 'z'", "a1", %{}, {1, 0}, 0}
"""
@spec ascii_char([range]) :: t
@spec ascii_char(t, [range]) :: t
def ascii_char(combinator \\ empty(), ranges)
    when is_combinator(combinator) and is_list(ranges) do
  # The ranges are split into supported and not supported ones, then matched
  # as a plain integer segment.
  {supported, not_supported} = split_ranges!(ranges, "ascii_char")

  bin_segment(combinator, supported, not_supported, :integer)
end
@doc ~S"""
Defines a single UTF-8 codepoint in the given ranges.
`ranges` is a list containing one of:
* a `min..max` range expressing supported codepoints
* a `codepoint` integer expressing a supported codepoint
* `{:not, min..max}` expressing not supported codepoints
* `{:not, codepoint}` expressing a not supported codepoint
## Examples
defmodule MyParser do
import NimbleParsec
defparsec :digit_and_utf8,
empty()
|> utf8_char([?0..?9])
|> utf8_char([])
end
MyParser.digit_and_utf8("1é")
#=> {:ok, [?1, ?é], "", %{}, {1, 0}, 2}
MyParser.digit_and_utf8("a1")
#=> {:error, "expected utf8 codepoint in the range '0' to '9', followed by utf8 codepoint", "a1", %{}, {1, 0}, 0}
"""
@spec utf8_char([range]) :: t
@spec utf8_char(t, [range]) :: t
def utf8_char(combinator \\ empty(), ranges)
    when is_combinator(combinator) and is_list(ranges) do
  # The ranges are split into supported and not supported ones, then matched
  # as a `:utf8` segment.
  {supported, not_supported} = split_ranges!(ranges, "utf8_char")

  bin_segment(combinator, supported, not_supported, :utf8)
end
@doc ~S"""
Adds a label to the combinator to be used in error reports.
## Examples
defmodule MyParser do
import NimbleParsec
defparsec :digit_and_lowercase,
empty()
|> ascii_char([?0..?9])
|> ascii_char([?a..?z])
|> label("digit followed by lowercase letter")
end
MyParser.digit_and_lowercase("1a")
#=> {:ok, [?1, ?a], "", %{}, {1, 0}, 2}
MyParser.digit_and_lowercase("a1")
#=> {:error, "expected a digit followed by lowercase letter", "a1", %{}, {1, 0}, 0}
"""
@spec label(t, String.t()) :: t
@spec label(t, t, String.t()) :: t
def label(combinator \\ empty(), to_label, label)
    when is_combinator(combinator) and is_combinator(to_label) and is_binary(label) do
  non_empty!(to_label, "label")

  # The labelled combinator is stored in parse order inside the entry.
  labelled = Enum.reverse(to_label)
  [{:label, labelled, label} | combinator]
end
@doc ~S"""
Defines an integer combinator with exact length or `min` and `max`
length.
If you want an integer of unknown size, use `integer(min: 1)`.
This combinator does not parse the sign and is always on base 10.
## Examples
With exact length:
defmodule MyParser do
import NimbleParsec
defparsec :two_digits_integer, integer(2)
end
MyParser.two_digits_integer("123")
#=> {:ok, [12], "3", %{}, {1, 0}, 2}
MyParser.two_digits_integer("1a3")
#=> {:error, "expected ASCII character in the range '0' to '9', followed by ASCII character in the range '0' to '9'", "1a3", %{}, {1, 0}, 0}
With min and max:
defmodule MyParser do
import NimbleParsec
defparsec :two_digits_integer, integer(min: 2, max: 4)
end
MyParser.two_digits_integer("123")
#=> {:ok, [123], "", %{}, {1, 0}, 3}
MyParser.two_digits_integer("1a3")
#=> {:error, "expected ASCII character in the range '0' to '9', followed by ASCII character in the range '0' to '9'", "1a3", %{}, {1, 0}, 0}
If the size of the integer has a min and max close to each other, such as
from 2 to 4 or from 1 to 2, using choice may emit more efficient code:
choice([integer(4), integer(3), integer(2)])
Note you should start from bigger to smaller.
"""
@spec integer(pos_integer | [min_and_max]) :: t
@spec integer(t, pos_integer | [min_and_max]) :: t
def integer(combinator \\ empty(), count_or_opts)

# Exact length: `count` is checked against a minimum of 1 before building
# the digit parser.
def integer(combinator, count) when is_combinator(combinator) and is_integer(count) do
  validate_min_and_max!(count, 1)
  digit = ascii_char([?0..?9])

  min_max_compile_runtime_chars(
    combinator,
    digit,
    count,
    :__compile_integer__,
    :__runtime_integer__,
    []
  )
end

# Min/max length: the validated minimum and maximum are written back into
# the options to ensure the presence of at least one character.
def integer(combinator, opts) when is_combinator(combinator) and is_list(opts) do
  {min_val, max_val} = validate_min_and_max!(opts, 1)
  bounded_opts = Keyword.put(Keyword.put(opts, :min, min_val), :max, max_val)
  digit = ascii_char([?0..?9])

  min_max_compile_runtime_chars(
    combinator,
    digit,
    bounded_opts,
    :__compile_integer__,
    :__runtime_integer__,
    []
  )
end
@doc ~S"""
Defines an ASCII string combinator with an exact length or `min` and `max`
length.
The `ranges` specify the allowed characters in the ASCII string.
See `ascii_char/2` for more information.
If you want a string of unknown size, use `ascii_string(ranges, min: 1)`.
If you want a literal string, use `string/2`.
## Examples
defmodule MyParser do
import NimbleParsec
defparsec :two_lowercase_letters, ascii_string([?a..?z], 2)
end
MyParser.two_lowercase_letters("abc")
#=> {:ok, ["ab"], "c", %{}, {1, 0}, 2}
"""
@spec ascii_string([range], pos_integer | [min_and_max]) :: t
@spec ascii_string(t, [range], pos_integer | [min_and_max]) :: t
def ascii_string(combinator \\ empty(), range, count_or_opts)
    when is_combinator(combinator) and is_list(range) and
           (is_integer(count_or_opts) or is_list(count_or_opts)) do
  # Each character is an `ascii_char/1` over the given ranges; the quoted
  # `integer` is passed along as the extra argument for the string functions.
  char = ascii_char(range)
  modifier = [quote(do: integer)]

  min_max_compile_runtime_chars(
    combinator,
    char,
    count_or_opts,
    :__compile_string__,
    :__runtime_string__,
    modifier
  )
end
@doc ~S"""
Defines a UTF8 string combinator with an exact length or `min` and `max`
codepoint length.
The `ranges` specify the allowed characters in the UTF8 string.
See `utf8_char/2` for more information.
If you want a string of unknown size, use `utf8_string(ranges, min: 1)`.
If you want a literal string, use `string/2`.
Note that the combinator matches on codepoints, not graphemes. Therefore
results may vary depending on whether the input is in `nfc` or `nfd`
normalized form.
## Examples
defmodule MyParser do
import NimbleParsec
defparsec :two_letters, utf8_string([], 2)
end
MyParser.two_letters("áé")
#=> {:ok, ["áé"], "", %{}, {1, 0}, 3}
"""
@spec utf8_string([range], pos_integer | [min_and_max]) :: t
@spec utf8_string(t, [range], pos_integer | [min_and_max]) :: t
def utf8_string(combinator \\ empty(), range, count_or_opts)
    when is_combinator(combinator) and is_list(range) and
           (is_integer(count_or_opts) or is_list(count_or_opts)) do
  # Each character is a `utf8_char/1` over the given ranges; the quoted
  # `utf8` is passed along as the extra argument for the string functions.
  char = utf8_char(range)
  modifier = [quote(do: utf8)]

  min_max_compile_runtime_chars(
    combinator,
    char,
    count_or_opts,
    :__compile_string__,
    :__runtime_string__,
    modifier
  )
end
@doc ~S"""
Defines an end of string combinator.
The end of string does not produce a token and can be parsed multiple times.
This function is useful to avoid having to check for an empty remainder after
a successful parse.
## Examples
defmodule MyParser do
import NimbleParsec
defparsec :letter_pairs, utf8_string([], 2) |> repeat() |> eos()
end
MyParser.letter_pairs("hi")
#=> {:ok, ["hi"], "", %{}, {1, 0}, 2}
MyParser.letter_pairs("hello")
#=> {:error, "expected end of string", "o", %{}, {1, 0}, 4}
"""
@spec eos :: t
@spec eos(t) :: t
# The `is_combinator/1` guard matches the other combinator functions in this
# module: a non-list argument now fails here with a FunctionClauseError
# instead of silently building an improper list.
def eos(combinator \\ empty()) when is_combinator(combinator) do
  [:eos | combinator]
end
@doc ~S"""
Concatenates two combinators.
## Examples
defmodule MyParser do
import NimbleParsec
defparsec :digit_upper_lower_plus,
concat(
concat(ascii_char([?0..?9]), ascii_char([?A..?Z])),
concat(ascii_char([?a..?z]), ascii_char([?+..?+]))
)
end
MyParser.digit_upper_lower_plus("1Az+")
#=> {:ok, [?1, ?A, ?z, ?+], "", %{}, {1, 0}, 4}
"""
@spec concat(t, t) :: t
# Combinators are built by prepending, so the entries of `right` (parsed
# last) go in front of the entries of `left`.
def concat(left, right) when is_combinator(left) and is_combinator(right),
  do: right ++ left
@doc """
Duplicates the combinator `to_duplicate` `n` times.

With `n` equal to zero, `combinator` is returned unchanged.
"""
@spec duplicate(t, non_neg_integer) :: t
@spec duplicate(t, t, non_neg_integer) :: t
def duplicate(combinator \\ empty(), to_duplicate, n)

def duplicate(combinator, to_duplicate, 0)
    when is_combinator(combinator) and is_combinator(to_duplicate),
    do: combinator

def duplicate(combinator, to_duplicate, n)
    when is_combinator(combinator) and is_combinator(to_duplicate) and is_integer(n) and n >= 1 do
  # Each round puts one more copy of `to_duplicate` in front of the result.
  for _ <- 1..n, reduce: combinator do
    acc -> to_duplicate ++ acc
  end
end
@doc """
Wraps the result of the given combinator in a tuple, with the result
as first element and the `byte_offset` as second element.

`byte_offset` is a non-negative integer.
"""
@spec byte_offset(t) :: t
@spec byte_offset(t, t) :: t
def byte_offset(combinator \\ empty(), to_wrap)
    when is_combinator(combinator) and is_combinator(to_wrap) do
  call = {__MODULE__, :__byte_offset__, []}
  quoted_post_traverse(combinator, to_wrap, call)
end
@doc """
Wraps the result of the given combinator in a tuple, with the result
as first element and the `line` as second element.

`line` is itself a tuple: its first element is the current line and
its second element is the byte offset immediately after the newline.
"""
@spec line(t) :: t
@spec line(t, t) :: t
def line(combinator \\ empty(), to_wrap)
    when is_combinator(combinator) and is_combinator(to_wrap) do
  call = {__MODULE__, :__line__, []}
  quoted_post_traverse(combinator, to_wrap, call)
end
@doc ~S"""
Traverses the combinator results with the remote or local function `call`.
`call` is either a `{module, function, args}` representing
a remote call, a `{function, args}` representing a local call
or an atom `function` representing `{function, []}`.
The function given in `call` will receive 5 additional arguments.
The rest of the parsed binary, the parser results to be post_traversed,
the parser context, the current line and the current offset will
be prepended to the given `args`. The `args` will be injected at
the compile site and therefore must be escapable via `Macro.escape/1`.
The line and offset will represent the location after the combinators.
To retrieve the position before the combinators, use `pre_traverse/3`.
The `call` must return a tuple `{rest, acc, context}` with the rest of
the binary as first element, the list of results to be added to the
accumulator as second element and the context as third element. It may
also return `{:error, reason}` to stop processing. Notice the received
results are in reverse order and must be returned in reverse order too.
The number of elements returned does not need to be
the same as the number of elements given.
This is a low-level function for changing the parsed result.
On top of this function, other functions are built, such as
`map/3` if you want to map over each individual element and
not worry about ordering, `reduce/3` to reduce all elements
into a single one, `replace/3` if you want to replace the
parsed result by a single value and `ignore/2` if you want to
ignore the parsed result.
## Examples
defmodule MyParser do
import NimbleParsec
defparsec :letters_to_chars,
ascii_char([?a..?z])
|> ascii_char([?a..?z])
|> ascii_char([?a..?z])
|> post_traverse({:join_and_wrap, ["-"]})
defp join_and_wrap(rest, args, context, _line, _offset, joiner) do
{rest, args |> Enum.join(joiner) |> List.wrap(), context}
end
end
MyParser.letters_to_chars("abc")
#=> {:ok, ["99-98-97"], "", %{}, {1, 0}, 3}
"""
@spec post_traverse(t, call) :: t
@spec post_traverse(t, t, call) :: t
def post_traverse(combinator \\ empty(), to_post_traverse, call)
    when is_combinator(combinator) and is_combinator(to_post_traverse) do
  # The result is discarded; presumably this only validates `call` up front
  # (confirm against `compile_call!/3`).
  compile_call!([], call, "post_traverse")

  traversal = {__MODULE__, :__post_traverse__, [call]}
  quoted_post_traverse(combinator, to_post_traverse, traversal)
end
@doc """
The same as `post_traverse/3`, except that `call` receives the line and
offset from before the wrapped combinators.

Prefer `post_traverse/3`, as it keeps less stack information. Reach for
`pre_traverse/3` only when you have to access the line and offset from
before the given combinators.
"""
@spec pre_traverse(t, call) :: t
@spec pre_traverse(t, t, call) :: t
def pre_traverse(combinator \\ empty(), to_pre_traverse, call)
    when is_combinator(combinator) and is_combinator(to_pre_traverse) do
  # The result is discarded; presumably this only validates `call` up front
  # (confirm against `compile_call!/3`).
  compile_call!([], call, "pre_traverse")

  traversal = {__MODULE__, :__pre_traverse__, [call]}
  quoted_pre_traverse(combinator, to_pre_traverse, traversal)
end
@doc ~S"""
Checks if a combinator is ahead.
If it succeeds, it continues as usual, otherwise it aborts the
closest `choice/2`, `repeat/2`, etc. If there is no closest
operation to abort, then it errors.
Note a lookahead never changes the accumulated output nor the
context.
## Examples
For example, imagine you want to parse a language that has the
keywords "if" and "while" and identifiers made of any letters or
number, where keywords and identifiers can be separated by a
single white space:
defmodule IfWhileLang do
import NimbleParsec
keyword =
choice([
string("if") |> replace(:if),
string("while") |> replace(:while)
])
identifier =
ascii_string([?a..?z, ?A..?Z, ?0..?9], min: 1)
defparsec :expr, repeat(choice([keyword, identifier]) |> optional(string(" ")))
end
The issue with the implementation above is that the following
will parse:
IfWhileLang.expr("iffy")
{:ok, [:if, "fy"], "", %{}, {1, 0}, 4}
However, "iffy" should be treated as a full identifier. We could
solve this by inverting the order of `keyword` and `identifier`
in `:expr` but that means "if" itself will be considered an identifier
and not a keyword. To solve this, we need lookaheads.
One option is to check that after the keyword we either have a
white space OR the end of the string:
keyword =
choice([
string("if") |> replace(:if),
string("while") |> replace(:while)
])
|> lookahead(choice([string(" "), eos()]))
However, in this case, a negative lookahead may be clearer,
and we can assert that we don't have any identifier character after
the keyword:
keyword =
choice([
string("if") |> replace(:if),
string("while") |> replace(:while)
])
|> lookahead_not(ascii_char([?a..?z, ?A..?Z, ?0..?9]))
Now we get the desired result back:
IfWhileLang.expr("iffy")
#=> {:ok, ["iffy"], "", %{}, {1, 0}, 4}
IfWhileLang.expr("if fy")
#=> {:ok, [:if, " ", "fy"], "", %{}, {1, 0}, 5}
"""
@spec lookahead(t) :: t
@spec lookahead(t, t) :: t
def lookahead(combinator \\ empty(), to_lookahead)
    when is_combinator(combinator) and is_combinator(to_lookahead) do
  # The combinator to look for is stored in parse order, tagged as positive.
  ahead = Enum.reverse(to_lookahead)
  [{:lookahead, ahead, :positive} | combinator]
end
@doc ~S"""
Checks if a combinator is not ahead.

If the given combinator succeeds, the closest `choice/2`, `repeat/2`,
etc. is aborted. Otherwise parsing continues as usual. If there is no
closest operation to abort, then it errors.

Note a lookahead never changes the accumulated output nor the
context.

For an example, see `lookahead/2`.
"""
@spec lookahead_not(t) :: t
@spec lookahead_not(t, t) :: t
def lookahead_not(combinator \\ empty(), to_lookahead)
    when is_combinator(combinator) and is_combinator(to_lookahead) do
  # The combinator to reject is stored in parse order, tagged as negative.
  ahead = Enum.reverse(to_lookahead)
  [{:lookahead, ahead, :negative} | combinator]
end
@doc """
Invokes `call` to emit the AST that post traverses the results of the
`to_post_traverse` combinator.

This is the quoted counterpart of `post_traverse/3`. In `post_traverse/3`,
`call` is invoked to process the combinator results. Here, `call` is invoked
to emit AST that will in turn process the combinator results. The emitted
code must return the same types as `post_traverse/3`.

`call` is a `{module, function, args}` tuple and it receives 5 additional
arguments: the AST representation of the rest of the parsed binary, the
parser results, context, line and offset are prepended to `args`. `call`
is invoked at compile time, which makes it useful in combinators that
avoid injecting runtime dependencies.

The line and offset represent the location after the combinators.
To retrieve the position before the combinators, use `quoted_pre_traverse/3`.

Use this function only when you want to emit code that has no runtime
dependencies on other modules. In most cases `post_traverse/3` is the
better choice, since it works at runtime instead of on ASTs.
"""
@spec quoted_post_traverse(t, mfargs) :: t
@spec quoted_post_traverse(t, t, mfargs) :: t
def quoted_post_traverse(combinator \\ empty(), to_post_traverse, {_, _, _} = call)
    when is_combinator(combinator) and is_combinator(to_post_traverse) do
  quoted_traverse(combinator, to_post_traverse, :post, call)
end
@doc """
Works like `quoted_post_traverse/3`, except that `call` receives the line
and offset from before the wrapped combinators.

Prefer `quoted_post_traverse/3`, as it keeps less stack information.
Reach for `quoted_pre_traverse/3` only when you need the line and offset
from before the given combinators.
"""
@spec quoted_pre_traverse(t, mfargs) :: t
@spec quoted_pre_traverse(t, t, mfargs) :: t
def quoted_pre_traverse(combinator \\ empty(), to_pre_traverse, {_, _, _} = call)
    when is_combinator(combinator) and is_combinator(to_pre_traverse) do
  quoted_traverse(combinator, to_pre_traverse, :pre, call)
end
@doc ~S"""
Maps over the combinator results with the remote or local function in `call`.

`call` is one of:

  * `{module, function, args}` - a remote call
  * `{function, args}` - a local call
  * an atom `function` - shorthand for `{function, []}`

`call` is invoked once per parser result, with that result prepended to
the given `args`. The `args` are injected at the compile site and
therefore must be escapable via `Macro.escape/1`.

See `post_traverse/3` for a low level version of this function.

## Examples

    defmodule MyParser do
      import NimbleParsec

      defparsec :letters_to_string_chars,
                ascii_char([?a..?z])
                |> ascii_char([?a..?z])
                |> ascii_char([?a..?z])
                |> map({Integer, :to_string, []})
    end

    MyParser.letters_to_string_chars("abc")
    #=> {:ok, ["97", "98", "99"], "", %{}, {1, 0}, 3}

"""
@spec map(t, call) :: t
@spec map(t, t, call) :: t
def map(combinator \\ empty(), to_map, call)
    when is_combinator(combinator) and is_combinator(to_map) do
  # The quoted variable stands for one parser result; it is both the first
  # argument of the compiled call and the parameter handed to `__map__`.
  element = Macro.var(:var, __MODULE__)
  mapper = compile_call!([element], call, "map")
  quoted_post_traverse(combinator, to_map, {__MODULE__, :__map__, [element, mapper]})
end
@doc ~S"""
Reduces over the combinator results with the remote or local function in `call`.

`call` is one of:

  * `{module, function, args}` - a remote call
  * `{function, args}` - a local call
  * an atom `function` - shorthand for `{function, []}`

The parser results to be reduced are prepended to the given `args`.
The `args` are injected at the compile site and therefore must be
escapable via `Macro.escape/1`.

See `post_traverse/3` for a low level version of this function.

## Examples

    defmodule MyParser do
      import NimbleParsec

      defparsec :letters_to_reduced_chars,
                ascii_char([?a..?z])
                |> ascii_char([?a..?z])
                |> ascii_char([?a..?z])
                |> reduce({Enum, :join, ["-"]})
    end

    MyParser.letters_to_reduced_chars("abc")
    #=> {:ok, ["97-98-99"], "", %{}, {1, 0}, 3}

"""
@spec reduce(t, call) :: t
@spec reduce(t, t, call) :: t
def reduce(combinator \\ empty(), to_reduce, call)
    when is_combinator(combinator) and is_combinator(to_reduce) do
  # Compiled here only to validate `call` eagerly (raises ArgumentError on an
  # unknown shape); the result is discarded and `__reduce__` compiles it again.
  _ = compile_call!([], call, "reduce")
  quoted_post_traverse(combinator, to_reduce, {__MODULE__, :__reduce__, [call]})
end
@doc """
Wraps the results of the combinator given in `to_wrap` in a list.
"""
@spec wrap(t) :: t
@spec wrap(t, t) :: t
def wrap(combinator \\ empty(), to_wrap)
    when is_combinator(combinator) and is_combinator(to_wrap) do
  wrapper = {__MODULE__, :__wrap__, []}
  quoted_post_traverse(combinator, to_wrap, wrapper)
end
@doc """
Tags the result of the combinator given in `to_tag` in a tuple with
`tag` as first element.

## Examples

    defmodule MyParser do
      import NimbleParsec

      defparsec :integer, integer(min: 1) |> tag(:integer)
    end

    MyParser.integer("1234")
    #=> {:ok, [integer: [1234]], "", %{}, {1, 0}, 4}

Notice, however, that the integer result is wrapped in a list, because
the parser is expected to emit multiple tokens. When you are sure that
only a single token is emitted, you should use `unwrap_and_tag/3`.
"""
@spec tag(t, term) :: t
@spec tag(t, t, term) :: t
def tag(combinator \\ empty(), to_tag, tag)
    when is_combinator(combinator) and is_combinator(to_tag) do
  # The tag is escaped because it is injected into generated code.
  escaped_tag = Macro.escape(tag)
  quoted_post_traverse(combinator, to_tag, {__MODULE__, :__tag__, [escaped_tag]})
end
@doc """
Unwraps and tags the result of the combinator given in `to_tag` in a
tuple with `tag` as first element.

## Examples

    defmodule MyParser do
      import NimbleParsec

      defparsec :integer, integer(min: 1) |> unwrap_and_tag(:integer)
    end

    MyParser.integer("1234")
    #=> {:ok, [integer: 1234], "", %{}, {1, 0}, 4}

If the combinator does not emit exactly one token, an error is raised.
See `tag/3` for more information.
"""
@spec unwrap_and_tag(t, term) :: t
@spec unwrap_and_tag(t, t, term) :: t
def unwrap_and_tag(combinator \\ empty(), to_tag, tag)
    when is_combinator(combinator) and is_combinator(to_tag) do
  # The tag is escaped because it is injected into generated code.
  escaped_tag = Macro.escape(tag)
  quoted_post_traverse(combinator, to_tag, {__MODULE__, :__unwrap_and_tag__, [escaped_tag]})
end
@doc """
Prints debugging information about the parser state for the combinator
given in `to_debug`.

The generated code prints the remaining binary, the accumulated results,
the context, the line and the offset, and leaves the parser state untouched.
"""
@spec debug(t) :: t
@spec debug(t, t) :: t
def debug(combinator \\ empty(), to_debug)
    when is_combinator(combinator) and is_combinator(to_debug) do
  printer = {__MODULE__, :__debug__, []}
  quoted_pre_traverse(combinator, to_debug, printer)
end
@doc ~S"""
Defines a combinator that matches the given binary exactly.

## Examples

    defmodule MyParser do
      import NimbleParsec

      defparsec :string_t, string("T")
    end

    MyParser.string_t("T")
    #=> {:ok, ["T"], "", %{}, {1, 0}, 1}

    MyParser.string_t("not T")
    #=> {:error, "expected a string \"T\"", "not T", %{}, {1, 0}, 0}

"""
@spec string(binary) :: t
@spec string(t, binary) :: t
def string(combinator \\ empty(), binary)
    when is_combinator(combinator) and is_binary(binary) do
  literal = {:string, binary}
  [literal | combinator]
end
@doc """
Ignores the output of the combinator given in `to_ignore`.

If `to_ignore` is the empty combinator there is nothing to discard, so
`combinator` is returned unchanged.

## Examples

    defmodule MyParser do
      import NimbleParsec

      defparsec :ignorable, string("T") |> ignore() |> integer(2)
    end

    MyParser.ignorable("T12")
    #=> {:ok, [12], "", %{}, {1, 0}, 3}

"""
@spec ignore(t) :: t
@spec ignore(t, t) :: t
def ignore(combinator \\ empty(), to_ignore)
    when is_combinator(combinator) and is_combinator(to_ignore) do
  if to_ignore == empty() do
    # Return the combinator built so far. Returning `to_ignore` here (the
    # empty combinator) would silently drop every step already accumulated
    # in `combinator`, e.g. in `string("a") |> ignore(empty())`.
    combinator
  else
    quoted_constant_traverse(combinator, to_ignore, {__MODULE__, :__constant__, [[]]})
  end
end
@doc """
Replaces the output of the combinator given in `to_replace` by a single value.

The `value` is injected at the compile site and therefore must be
escapable via `Macro.escape/1`.

## Examples

    defmodule MyParser do
      import NimbleParsec

      defparsec :replaceable, string("T") |> replace("OTHER") |> integer(2)
    end

    MyParser.replaceable("T12")
    #=> {:ok, ["OTHER", 12], "", %{}, {1, 0}, 3}

"""
@spec replace(t, term) :: t
@spec replace(t, t, term) :: t
def replace(combinator \\ empty(), to_replace, value)
    when is_combinator(combinator) and is_combinator(to_replace) do
  # The constant handed to `__constant__` is the whole replacement output:
  # a one-element list holding the escaped value.
  replacement = [Macro.escape(value)]
  quoted_constant_traverse(combinator, to_replace, {__MODULE__, :__constant__, [replacement]})
end
@doc """
Allows the combinator given in `to_repeat` to appear zero or more times.

Beware! Since `repeat/2` allows zero entries, it cannot be used inside
`choice/2`, because it will always succeed and may lead to unused function
warnings since any further choice won't ever be attempted. For example,
because `repeat/2` always succeeds, the `string/2` combinator below it
won't ever run:

    choice([
      repeat(ascii_char([?a..?z])),
      string("OK")
    ])

Instead of `repeat/2`, you may want to use `times/3` with the flags `:min`
and `:max`.

Also beware! If you attempt to repeat a combinator that can match nothing,
like `optional/2`, `repeat/2` will not terminate. For example, consider
this combinator:

    repeat(optional(utf8_char([?a])))

This combinator will never terminate because `repeat/2` chooses the empty
option of `optional/2` every time. Since the goal of the parser above is
to parse 0 or more `?a` characters, it can be represented by
`repeat(utf8_char([?a]))`, because `repeat/2` allows 0 or more matches.

## Examples

    defmodule MyParser do
      import NimbleParsec

      defparsec :repeat_lower, repeat(ascii_char([?a..?z]))
    end

    MyParser.repeat_lower("abcd")
    #=> {:ok, [?a, ?b, ?c, ?d], "", %{}, {1, 0}, 4}

    MyParser.repeat_lower("1234")
    #=> {:ok, [], "1234", %{}, {1, 0}, 0}

"""
@spec repeat(t) :: t
@spec repeat(t, t) :: t
@spec repeat(t, opts) :: t
@spec repeat(t, t, opts) :: t
def repeat(combinator \\ empty(), to_repeat, opts \\ [])
    when is_combinator(combinator) and is_combinator(to_repeat) and is_list(opts) do
  non_empty!(to_repeat, "repeat")

  # `__cont_context__` always answers `{:cont, context}`, so the loop only
  # stops when `to_repeat` itself stops matching.
  always_continue = {__MODULE__, :__cont_context__, []}
  quoted_repeat_while(combinator, to_repeat, always_continue, opts)
end
@doc """
Marks that the given combinator should appear eventually.

Any other data before the combinator appears is discarded.
If the combinator never appears, then it is an error.

**Note:** this can potentially be a very expensive operation
as it executes the given combinator byte by byte until finding
an eventual match or ultimately failing. For example, if you
are looking for an integer, it is preferable to discard
everything that is not an integer

    ignore(ascii_string([not: ?0..?9]))

rather than eventually look for an integer

    eventually(ascii_string([?0..?9]))

## Examples

    defmodule MyParser do
      import NimbleParsec

      hour = integer(min: 1, max: 2)
      defparsec :extract_hour, eventually(hour)
    end

    MyParser.extract_hour("let's meet at 12?")
    #=> {:ok, [12], "?", %{}, {1, 0}, 16}

"""
@spec eventually(t) :: t
@spec eventually(t, t) :: t
def eventually(combinator \\ empty(), eventually)
    when is_combinator(combinator) and is_combinator(eventually) do
  non_empty!(eventually, "eventually")
  searched = {:eventually, Enum.reverse(eventually)}
  [searched | combinator]
end
@doc ~S"""
Repeats `to_repeat` while the given remote or local function `while`
returns `{:cont, context}`.

If the combinator `to_repeat` stops matching, then the whole repeat
loop stops successfully, hence it is important to assert the terminated
value after repeating.

In case repetition should stop, `while` must return `{:halt, context}`.

`while` is one of:

  * `{module, function, args}` - a remote call
  * `{function, args}` - a local call
  * an atom `function` - shorthand for `{function, []}`

The function given in `while` receives 4 additional arguments.
The `rest` of the binary to be parsed, the parser context, the
current line and the current offset are prepended to the
given `args`. The `args` are injected at the compile site
and therefore must be escapable via `Macro.escape/1`.

## Examples

    defmodule MyParser do
      import NimbleParsec

      defparsec :string_with_quotes,
                ascii_char([?"])
                |> repeat_while(
                  choice([
                    ~S(\") |> string() |> replace(?"),
                    utf8_char([])
                  ]),
                  {:not_quote, []}
                )
                |> ascii_char([?"])
                |> reduce({List, :to_string, []})

      defp not_quote(<<?", _::binary>>, context, _, _), do: {:halt, context}
      defp not_quote(_, context, _, _), do: {:cont, context}
    end

    MyParser.string_with_quotes(~S("string with quotes \" inside"))
    #=> {:ok, ["\"string with quotes \" inside\""], "", %{}, {1, 0}, 30}

Note you can use `lookahead/2` and `lookahead_not/2` with
`repeat/2` (instead of `repeat_while/3`) to write a combinator
that repeats while a combinator matches (or does not match).
For example, the same combinator above could be written as:

    defmodule MyParser do
      import NimbleParsec

      defparsec :string_with_quotes,
                ascii_char([?"])
                |> repeat(
                  lookahead_not(ascii_char([?"]))
                  |> choice([
                    ~S(\") |> string() |> replace(?"),
                    utf8_char([])
                  ])
                )
                |> ascii_char([?"])
                |> reduce({List, :to_string, []})
    end

    MyParser.string_with_quotes(~S("string with quotes \" inside"))
    #=> {:ok, ["\"string with quotes \" inside\""], "", %{}, {1, 0}, 30}

However, `repeat_while` is still useful when the condition to
repeat comes from the context passed around.
"""
@spec repeat_while(t, call) :: t
@spec repeat_while(t, t, call) :: t
@spec repeat_while(t, t, call, opts) :: t
def repeat_while(combinator \\ empty(), to_repeat, while, opts \\ [])
    when is_combinator(combinator) and is_combinator(to_repeat) and is_list(opts) do
  non_empty!(to_repeat, "repeat_while")

  # Compiled here only to validate `while` eagerly (raises ArgumentError on an
  # unknown shape); `__repeat_while__` compiles it again with real arguments.
  _ = compile_call!([], while, "repeat_while")

  condition = {__MODULE__, :__repeat_while__, [while]}
  quoted_repeat_while(combinator, to_repeat, condition, opts)
end
@doc """
Invokes `while` to emit the AST that will repeat `to_repeat`
for as long as that AST code returns `{:cont, context}`.

In case repetition should stop, the code must return `{:halt, context}`.

`while` is a `{module, function, args}` tuple and it receives 4
additional arguments. The AST representations of the binary to be
parsed, context, line and offset are prepended to `args`. `while`
is invoked at compile time, which makes it useful in combinators
that avoid injecting runtime dependencies.
"""
@spec quoted_repeat_while(t, mfargs) :: t
@spec quoted_repeat_while(t, t, mfargs) :: t
@spec quoted_repeat_while(t, t, mfargs, opts) :: t
def quoted_repeat_while(combinator \\ empty(), to_repeat, {_, _, _} = while, opts \\ [])
    when is_combinator(combinator) and is_combinator(to_repeat) and is_list(opts) do
  non_empty!(to_repeat, "quoted_repeat_while")

  # `:gen_times` is the only option read here; it is `nil` when absent.
  gen_times = opts[:gen_times]
  loop = {:repeat, Enum.reverse(to_repeat), while, gen_times}
  [loop | combinator]
end
@doc """
Allows the combinator given in `to_repeat` to appear at least, at most
or exactly a given number of times.

The third argument is either a positive integer (an exact count) or a
keyword list with `:min` and/or `:max`.

## Examples

    defmodule MyParser do
      import NimbleParsec

      defparsec :minimum_lower, times(ascii_char([?a..?z]), min: 2)
    end

    MyParser.minimum_lower("abcd")
    #=> {:ok, [?a, ?b, ?c, ?d], "", %{}, {1, 0}, 4}

    MyParser.minimum_lower("ab12")
    #=> {:ok, [?a, ?b], "12", %{}, {1, 0}, 2}

    MyParser.minimum_lower("a123")
    #=> {:error, ...} (fewer than the two required lowercase letters)

"""
@spec times(t, pos_integer | [min_and_max]) :: t
@spec times(t, t, pos_integer | [min_and_max]) :: t
def times(combinator \\ empty(), to_repeat, count_or_min_max)

def times(combinator, to_repeat, n)
    when is_combinator(combinator) and is_combinator(to_repeat) and is_integer(n) and n >= 1 do
  non_empty!(to_repeat, "times")
  duplicate(combinator, to_repeat, n)
end

def times(combinator, to_repeat, opts)
    when is_combinator(combinator) and is_combinator(to_repeat) and is_list(opts) do
  {min, max} = validate_min_and_max!(opts)
  non_empty!(to_repeat, "times")

  # The mandatory part: `to_repeat` duplicated `min` times.
  required = if min > 0, do: duplicate(combinator, to_repeat, min), else: combinator
  reversed = Enum.reverse(to_repeat)

  # The optional part: bounded when `:max` is given (only the matches beyond
  # `min` remain to be counted), otherwise an unbounded repeat.
  optional_part =
    if max do
      {:times, reversed, max - min}
    else
      {:repeat, reversed, {__MODULE__, :__cont_context__, []}, opts[:gen_times]}
    end

  [optional_part | required]
end
@doc """
Chooses one of the given combinators.

Expects at least two choices.

## Beware! Char combinators

Note both `utf8_char/2` and `ascii_char/2` allow multiple ranges to
be given. Therefore, instead of this:

    choice([
      ascii_char([?a..?z]),
      ascii_char([?A..?Z]),
    ])

One should simply prefer:

    ascii_char([?a..?z, ?A..?Z])

As the latter is compiled more efficiently by `NimbleParsec`.

## Beware! Always successful combinators

If a combinator that always succeeds is given as a choice, that choice
will always succeed which may lead to unused function warnings since
any further choice won't ever be attempted. For example, because `repeat/2`
always succeeds, the `string/2` combinator below it won't ever run:

    choice([
      repeat(ascii_char([?0..?9])),
      string("OK")
    ])

Instead of `repeat/2`, you may want to use `times/3` with the flags `:min`
and `:max`.

## Beware! Overlapping choices

In case choices overlap, there is no guarantee which error will be the one
effectively returned. For example, imagine this choice:

    choice([
      string("<abc>foo</abc>"),
      string("<abc>")
    ])

Since both choices can be activated for an input starting with "<abc>",
NimbleParsec guarantees it will return the error from one of them, but
not which.
"""
@spec choice(nonempty_list(t)) :: t
@spec choice(t, nonempty_list(t)) :: t
@spec choice(t, nonempty_list(t), opts) :: t
def choice(combinator \\ empty(), [_, _ | _] = choices, opts \\ [])
    when is_combinator(combinator) do
  # Each alternative is reversed so its steps are stored in the order in
  # which they were added.
  alternatives = Enum.map(choices, &Enum.reverse/1)
  weights = opts[:gen_weights]

  # When given, `:gen_weights` must pair up one-to-one with the choices.
  if weights && length(weights) != length(alternatives) do
    raise ArgumentError, ":gen_weights must be a list of the same size as choices"
  end

  [{:choice, alternatives, weights} | combinator]
end
@doc """
Marks the given combinator as `optional`.

It is equivalent to a `choice/2` between the given combinator and `empty()`.
"""
@spec optional(t) :: t
@spec optional(t, t) :: t
def optional(combinator \\ empty(), optional) do
  alternatives = [optional, empty()]
  choice(combinator, alternatives)
end
@doc """
Defines a combinator that consumes the next `count` bytes from the input.

## Examples

    defmodule MyParser do
      import NimbleParsec

      defparsec :three_bytes, bytes(3)
    end

    MyParser.three_bytes("abc")
    #=> {:ok, ["abc"], "", %{}, {1, 0}, 3}

    MyParser.three_bytes("ab")
    #=> {:error, "expected 3 bytes", "ab", %{}, {1, 0}, 0}

"""
@spec bytes(pos_integer) :: t
@spec bytes(t, pos_integer) :: t
def bytes(combinator \\ empty(), count)
    when is_combinator(combinator) and is_integer(count) and count > 0 do
  chunk = {:bytes, count}
  [chunk | combinator]
end
## Helpers
# Validates a count or a `:min`/`:max` keyword list and returns `{min, max}`.
#
# A bare integer is treated as `[min: count]`. When `:min` is absent the
# returned minimum falls back to `required_min`; `max` is returned as given
# (`nil` when absent). Raises ArgumentError when neither option is given,
# when a value is invalid, or when `:max` is not strictly greater than `:min`.
defp validate_min_and_max!(count_or_opts, required_min \\ 0)

defp validate_min_and_max!(count, required_min) when is_integer(count) do
  validate_min_and_max!([min: count], required_min)
end

defp validate_min_and_max!(opts, required_min) do
  min = opts[:min]
  max = opts[:max]

  cond do
    min && max ->
      validate_min_or_max!(:min, min, required_min)
      validate_min_or_max!(:max, max, 1)

      if max <= min do
        raise ArgumentError,
              "expected :max to be strictly greater than :min, got: #{min} and #{max}"
      end

    min ->
      validate_min_or_max!(:min, min, required_min)

    max ->
      validate_min_or_max!(:max, max, 1)

    true ->
      raise ArgumentError, "expected :min or :max to be given"
  end

  {min || required_min, max}
end
# Ensures `value` is an integer no smaller than `min`; `kind` (`:min` or
# `:max`) only names the option in the error message. Returns `nil` on success.
defp validate_min_or_max!(_kind, value, min) when is_integer(value) and value >= min do
  nil
end

defp validate_min_or_max!(kind, value, min) do
  raise ArgumentError,
        "expected #{kind} to be an integer greater than or equal to #{min}, " <>
          "got: #{inspect(value)}"
end
# Partitions `ranges` into `{inclusive, exclusive}`: plain entries on the
# left, `{:not, _}` entries on the right. `context` names the calling
# combinator in the error raised for an unknown entry.
defp split_ranges!(ranges, context) do
  Enum.split_with(ranges, fn range -> split_range!(range, context) end)
end
# Classifies a single range entry: `false` for the negated (`{:not, _}`)
# forms, `true` for the plain forms. Only integers and ranges whose step
# is 1 or -1 are accepted; anything else raises ArgumentError.
defp split_range!({:not, codepoint}, _context) when is_integer(codepoint), do: false
defp split_range!({:not, _.._//step}, _context) when abs(step) == 1, do: false
defp split_range!(codepoint, _context) when is_integer(codepoint), do: true
defp split_range!(_.._//step, _context) when abs(step) == 1, do: true

defp split_range!(range, context) do
  raise ArgumentError, "unknown range #{inspect(range)} given to #{context}"
end
# Builds the quoted function call described by `call`.
#
# `extra` is a list of already-quoted arguments that come first; the
# user-supplied `args` are escaped and appended after them. Accepted shapes:
#
#   * `{module, function, args}` - remote call
#   * `{function, args}` - local call
#   * `function` (atom) - local call with no user arguments
#
# Any other shape raises ArgumentError naming `context`.
defp compile_call!(extra, {module, function, args}, _context)
     when is_atom(module) and is_atom(function) and is_list(args) do
  escaped_args = Macro.escape(args)

  quote do
    unquote(module).unquote(function)(
      unquote_splicing(extra),
      unquote_splicing(escaped_args)
    )
  end
end

defp compile_call!(extra, {function, args}, _context)
     when is_atom(function) and is_list(args) do
  escaped_args = Macro.escape(args)

  quote do
    unquote(function)(unquote_splicing(extra), unquote_splicing(escaped_args))
  end
end

defp compile_call!(extra, function, _context) when is_atom(function) do
  quote do
    unquote(function)(unquote_splicing(extra))
  end
end

defp compile_call!(_args, unknown, context) do
  raise ArgumentError, "unknown call given to #{context}, got: #{inspect(unknown)}"
end
# Raises ArgumentError when `combinator` is empty or when any of its entries
# is itself a list (i.e. a list of combinators was passed where a single
# combinator was expected). `action` names the caller in the message.
# Returns `nil` otherwise.
defp non_empty!([], action) do
  raise ArgumentError, "cannot call #{action} on empty combinator"
end

defp non_empty!(combinator, action) do
  nested? = Enum.any?(combinator, fn entry -> is_list(entry) end)

  if nested? do
    raise ArgumentError,
          "invalid combinator given to #{action}, got a list of combinators instead"
  end
end
## Inner combinators
# Prepends a `:constant` traverse node for `to_traverse` onto `combinator`.
#
# When `to_traverse` is itself exactly one constant traverse, the two are
# merged into a single node by putting `call` in front of its call list
# instead of nesting a second traverse around it.
defp quoted_constant_traverse(combinator, [{:traverse, inner, :constant, inner_calls}], call) do
  [{:traverse, inner, :constant, [call | inner_calls]} | combinator]
end

defp quoted_constant_traverse(combinator, to_traverse, call) do
  [{:traverse, Enum.reverse(to_traverse), :constant, [call]} | combinator]
end
# Prepends a traverse node onto `combinator`. `to_traverse` is reversed so
# its steps are stored in the order they were added; `pre_or_pos` is passed
# through as the traversal kind and `call` becomes the node's only callback.
defp quoted_traverse(combinator, to_traverse, pre_or_pos, call) do
  traversal = {:traverse, Enum.reverse(to_traverse), pre_or_pos, [call]}
  [traversal | combinator]
end
# Prepends a `:bin_segment` node carrying the inclusive entries, the
# exclusive entries and the segment modifier onto `combinator`.
defp bin_segment(combinator, inclusive, exclusive, modifier) do
  segment = {:bin_segment, inclusive, exclusive, modifier}
  [segment | combinator]
end
## Traverse callbacks
@doc false
# Emits the quoted call for `call`, with the quoted rest, accumulator,
# context, line and offset placed before the user-supplied arguments.
def __pre_traverse__(rest, acc, context, line, offset, call) do
  leading = [rest, acc, context, line, offset]
  compile_call!(leading, call, "pre_traverse")
end
@doc false
# Emits the quoted call for `call`, with the quoted rest, accumulator,
# context, line and offset placed before the user-supplied arguments.
def __post_traverse__(rest, acc, context, line, offset, call) do
  leading = [rest, acc, context, line, offset]
  compile_call!(leading, call, "post_traverse")
end
@doc false
# Emits the quoted call for `call`. Unlike the traverse callbacks, the
# accumulator is not passed along: only rest, context, line and offset are.
def __lookahead__(rest, _acc, context, line, offset, call) do
  leading = [rest, context, line, offset]
  compile_call!(leading, call, "lookahead")
end
@doc false
# Emits the AST of a `{rest, [results], context}` tuple, where `results` is
# the accumulator put back in parsing order and nested as a single element.
def __wrap__(rest, acc, context, _line, _offset) do
  wrapped = [reverse_now_or_later(acc)]
  {:{}, [], [rest, wrapped, context]}
end
@doc false
# Emits the AST of a `{rest, [{tag, results}], context}` tuple, where
# `results` is the accumulator put back in parsing order.
def __tag__(rest, acc, context, _line, _offset, tag) do
  tagged = {tag, reverse_now_or_later(acc)}
  {:{}, [], [rest, [tagged], context]}
end
@doc false
# Emits the AST of a `{rest, [{tag, token}], context}` tuple.
#
# When the accumulator is a literal list, the single-token check happens
# right here, while the AST is being generated. Otherwise the accumulator
# is an expression, and the check is emitted as code to run later.
def __unwrap_and_tag__(rest, [one], context, _line, _offset, tag) do
  {:{}, [], [rest, [{tag, one}], context]}
end

def __unwrap_and_tag__(_rest, acc, _context, _line, _offset, _tag) when is_list(acc) do
  raise "unwrap_and_tag/3 expected a single token, got: #{inspect(acc)}"
end

def __unwrap_and_tag__(rest, acc, context, _line, _offset, tag) do
  quoted =
    quote do
      case :lists.reverse(unquote(acc)) do
        [one] -> one
        many -> raise "unwrap_and_tag/3 expected a single token, got: #{inspect(many)}"
      end
    end

  {:{}, [], [rest, [{tag, quoted}], context]}
end
@doc false
# Emits code that prints the parser state with `IO.puts/1` and then evaluates
# to `{rest, acc, context}`, i.e. the state is passed through unchanged.
def __debug__(rest, acc, context, line, offset) do
# `bind_quoted` binds each given expression to a same-named variable
# inside the quote, so the body below refers to plain variables.
quote bind_quoted: [rest: rest, acc: acc, context: context, line: line, offset: offset] do
# The accumulator is reversed for display only; the returned `acc` is untouched.
IO.puts("""
== DEBUG ==
Bin: #{inspect(rest)}
Acc: #{inspect(:lists.reverse(acc))}
Ctx: #{inspect(context)}
Lin: #{inspect(line)}
Off: #{inspect(offset)}
""")
{rest, acc, context}
end
end
@doc false
# Emits the AST of a `{rest, constant, context}` tuple: whatever was
# accumulated is discarded and `constant` takes its place.
def __constant__(rest, _acc, context, _line, _offset, constant) do
  elements = [rest, constant, context]
  {:{}, [], elements}
end
@doc false
# Emits the AST of a `{rest, [{results, line}], context}` tuple, pairing the
# accumulator (put back in parsing order) with the given line.
def __line__(rest, acc, context, line, _offset) do
  with_line = {reverse_now_or_later(acc), line}
  {:{}, [], [rest, [with_line], context]}
end
@doc false
# Emits the AST of a `{rest, [{results, offset}], context}` tuple, pairing the
# accumulator (put back in parsing order) with the given offset.
def __byte_offset__(rest, acc, context, _line, offset) do
  with_offset = {reverse_now_or_later(acc), offset}
  {:{}, [], [rest, [with_offset], context]}
end
@doc false
# Emits the AST of a `{rest, mapped, context}` tuple, where `mapped` is code
# that runs `call` over every accumulated result. `var` is the quoted
# variable bound to each result, and `call` is a quoted expression that
# refers to it. The accumulator is mapped as-is, so its order is preserved.
def __map__(rest, acc, context, _line, _offset, var, call) do
  mapped = quote(do: Enum.map(unquote(acc), fn unquote(var) -> unquote(call) end))
  {:{}, [], [rest, mapped, context]}
end
@doc false
# Emits the AST of a `{rest, [reduced], context}` tuple, where `reduced` is
# the quoted `call` receiving the accumulator (put back in parsing order)
# as its first argument.
def __reduce__(rest, acc, context, _line, _offset, call) do
  reduced = compile_call!([reverse_now_or_later(acc)], call, "reduce")
  {:{}, [], [rest, [reduced], context]}
end
## Repeat callbacks
@doc false
# Repeat condition that never halts: always answers `{:cont, context}`,
# regardless of the remaining input, line or offset.
def __cont_context__(_rest, context, _line, _offset), do: {:cont, context}
@doc false
# Emits the quoted call for the user-supplied `call`, with the quoted
# binary, context, line and offset placed before the user arguments.
def __repeat_while__(quoted, context, line, offset, call) do
  leading = [quoted, context, line, offset]
  compile_call!(leading, call, "repeat_while")
end
## Chars callbacks
# Builds a char-consuming combinator with one of two post-traverse callbacks.
#
# With an integer `count`, `to_repeat` is duplicated `count` times and the
# `compile` callback receives `[count | args]`. With a `:min`/`:max` keyword
# list, the mandatory `min` part is built through the integer clause, the
# remainder is bounded (`times/3`) or unbounded (`repeat/2`), and the whole
# is handed to the `runtime` callback with `[min, max | args]`.
defp min_max_compile_runtime_chars(combinator, to_repeat, count, compile, _runtime, args)
     when is_integer(count) and count >= 0 do
  fixed = duplicate(to_repeat, count)
  quoted_post_traverse(combinator, fixed, {__MODULE__, compile, [count | args]})
end

defp min_max_compile_runtime_chars(combinator, to_repeat, opts, compile, runtime, args)
     when is_list(opts) do
  {min, max} = validate_min_and_max!(opts)

  mandatory =
    if min > 0 do
      min_max_compile_runtime_chars(empty(), to_repeat, min, compile, runtime, args)
    else
      empty()
    end

  # Only the matches beyond `min` remain to be consumed at this point.
  chars =
    if max do
      times(mandatory, to_repeat, max: max - min)
    else
      repeat(mandatory, to_repeat)
    end

  quoted_post_traverse(combinator, chars, {__MODULE__, runtime, [min, max | args]})
end
@doc false
# Emits the AST of a `{rest, [string], context}` tuple, where `string` is
# code converting the accumulator (put back in parsing order) with
# `List.to_string/1`.
def __runtime_string__(rest, acc, context, _line, _offset, _min, _max, _type) do
  chars = reverse_now_or_later(acc)
  string = quote(do: List.to_string(unquote(chars)))
  {:{}, [], [rest, [string], context]}
end
@doc false
# Emits the AST of a `{rest, [string], context}` tuple.
#
# When the accumulator is a literal list, the string is emitted as a binary
# constructor with one `entry :: type` segment per accumulated entry, in
# parsing order. Otherwise it falls back to emitting a `List.to_string/1`
# call on the reversed accumulator.
def __compile_string__(rest, acc, context, _line, _offset, _count, type) when is_list(acc) do
  segments =
    acc
    |> :lists.reverse()
    |> Enum.map(fn entry -> {:"::", [], [entry, type]} end)

  {:{}, [], [rest, [{:<<>>, [], segments}], context]}
end

def __compile_string__(rest, acc, context, _line, _offset, _count, _type) do
  chars = reverse_now_or_later(acc)
  string = quote(do: List.to_string(unquote(chars)))
  {:{}, [], [rest, [string], context]}
end
@doc false
# Emits the AST of a `{rest, code, context}` tuple, where `code` folds the
# accumulated entries (in parsing order) into a one-element list holding
# an integer, base 10.
#
# The clauses differ only in the fold's seed: with a positive integer `min`
# the first entry is used as-is, otherwise `?0` is subtracted from it first.
# NOTE(review): presumably the first entry is already an integer when
# `min > 0` because the mandatory digits were converted beforehand - confirm
# against the caller.
def __runtime_integer__(rest, acc, context, _line, _offset, min, _max)
    when is_integer(min) and min > 0 do
  entries = reverse_now_or_later(acc)

  ast =
    quote do
      [head | tail] = unquote(entries)
      [:lists.foldl(fn x, acc -> x - ?0 + acc * 10 end, head, tail)]
    end

  {:{}, [], [rest, ast, context]}
end

def __runtime_integer__(rest, acc, context, _line, _offset, _min, _max) do
  entries = reverse_now_or_later(acc)

  ast =
    quote do
      [head | tail] = unquote(entries)
      [:lists.foldl(fn x, acc -> x - ?0 + acc * 10 end, head - ?0, tail)]
    end

  {:{}, [], [rest, ast, context]}
end
@doc false
# Emits the AST of a `{rest, [sum], context}` tuple, where `sum` is an
# addition chain over the per-digit terms built by
# `quoted_ascii_to_integer/2` from the literal accumulator.
def __compile_integer__(rest, acc, context, _line, _offset, _count) when is_list(acc) do
  terms = quoted_ascii_to_integer(acc, 1)

  # The first term seeds the chain; each further term is added on the right.
  sum = Enum.reduce(terms, fn term, total -> {:+, [], [total, term]} end)

  {:{}, [], [rest, [sum], context]}
end
# Reverses a literal list immediately; for anything else (a quoted
# expression), emits the AST of a `:lists.reverse/1` call so the reversal
# happens when the generated code runs.
defp reverse_now_or_later(list_or_expr) do
  if is_list(list_or_expr) do
    :lists.reverse(list_or_expr)
  else
    quote(do: :lists.reverse(unquote(list_or_expr)))
  end
end
# Turns a list of quoted ASCII digits into a list of quoted terms, one per
# digit: `digit - ?0` scaled by its positional weight. The weight starts at
# `index` and grows tenfold per entry; the multiplication is omitted when
# the weight is exactly 1.
defp quoted_ascii_to_integer([], _index), do: []

defp quoted_ascii_to_integer([digit | digits], 1) do
  units = quote(do: unquote(digit) - ?0)
  [units | quoted_ascii_to_integer(digits, 10)]
end

defp quoted_ascii_to_integer([digit | digits], index) do
  scaled = quote(do: (unquote(digit) - ?0) * unquote(index))
  [scaled | quoted_ascii_to_integer(digits, index * 10)]
end
end