defmodule Grazer.HTML do
  @moduledoc """
  Distills an HTML document into a flat list of simple display items.

  The children of the document's `<body>` are walked in order and reduced to a
  list whose elements are either plain strings or three-element lists of the
  form `[tag, attrs, children]`, where `tag` is `:h1`, `:p` or `:a`:

    * `<h1>`, `<h2>` and `<h3>` all become `[:h1, [], [text]]`
    * `<p>` becomes `[:p, [], children]`, with its children distilled in turn
    * `<br>` becomes an empty paragraph, `<hr>` a paragraph holding a rule
    * every `<li>` of a `<ul>`/`<ol>` becomes a paragraph prefixed with a bullet
    * `<a>` becomes `[:a, [href: href], [text]]`
    * `<strong>`/`<b>` become `"**text**"`, `<em>`/`<i>` become `"__text__"`
    * text nodes are kept as strings, leading whitespace collapsed to one space
    * container elements (`div`, `section`, `span`, ...) are replaced by their
      own distilled children
    * comments, scripts and every other element (including `<h4>`..`<h6>`) are
      dropped
  """

  @typedoc "A distilled item: a bare string or a `[tag, attrs, children]` list."
  @type item :: String.t() | list()

  # Heading tags that are recognised; all of them are emitted as `:h1`.
  @headings ~w(h1 h2 h3)

  # Elements that carry no meaning of their own: their children are spliced
  # into the surrounding output.
  @containers ~w(aside blockquote div footer header main nav noscript section span)

  # Written as escapes so the glyphs cannot be damaged by an encoding mix-up.
  # U+00B7 MIDDLE DOT, padded with spaces, prefixes every list item.
  @bullet " \u00B7 "
  # U+2500 BOX DRAWINGS LIGHT HORIZONTAL, three times, stands in for <hr>.
  # NOTE(review): the literal found here was mis-decoded text whose original
  # glyph could not be recovered with certainty - confirm the intended rule.
  @rule "\u2500\u2500\u2500"

  @doc """
  Parses `raw` as an HTML document and distills the contents of its `<body>`.

  Returns `{:ok, items}` on success. Returns `{:error, reason}` when Floki
  cannot parse the input, `{:error, :no_body}` when the document has no
  `<body>` element and `{:error, :multiple_bodies}` when it has more than one.
  """
  @spec distill(String.t()) :: {:ok, [item()]} | {:error, term()}
  def distill(raw) when is_binary(raw) do
    with {:ok, parsed} <- Floki.parse_document(raw),
         {:ok, children} <- body_children(parsed) do
      {:ok, flatten(children)}
    end
  end

  # Returns the children of the single <body> element, or a tagged error.
  defp body_children(parsed) do
    case Floki.find(parsed, "body") do
      [{_tag, _attrs, children}] -> {:ok, children}
      [] -> {:error, :no_body}
      [_ | _] -> {:error, :multiple_bodies}
    end
  end

  # Distills a list of parsed nodes; every node contributes zero or more items,
  # concatenated in document order.
  defp flatten(nodes), do: Enum.flat_map(nodes, &convert/1)

  # Converts one parsed node into the list of items it contributes.
  # Clause order matters: the specific tags must come before the container
  # clause and the final catch-all.
  defp convert({:comment, _}), do: []
  defp convert({"script", _, _}), do: []

  defp convert({tag, _, children}) when tag in @headings,
    do: [[:h1, [], [text(children)]]]

  defp convert({"br", _, _}), do: [[:p, [], [""]]]
  defp convert({"hr", _, _}), do: [[:p, [], [@rule]]]
  defp convert({tag, _, children}) when tag in ["ul", "ol"], do: list(children)
  defp convert({"p", _, children}), do: [[:p, [], flatten(children)]]

  defp convert({"a", attrs, children}),
    do: [[:a, [href: attr(attrs, "href")], [text(children)]]]

  defp convert({tag, _, children}) when tag in ["strong", "b"],
    do: ["**" <> text(children) <> "**"]

  defp convert({tag, _, children}) when tag in ["em", "i"],
    do: ["__" <> text(children) <> "__"]

  defp convert(str) when is_binary(str), do: [text([str])]
  defp convert({tag, _, children}) when tag in @containers, do: flatten(children)
  # Unknown elements and any other node shape are ignored.
  defp convert(_other), do: []

  # Concatenates the text found in `nodes`, descending into elements.
  defp text(nodes), do: Enum.map_join(nodes, &node_text/1)

  defp node_text({_tag, _attrs, children}) when is_list(children), do: text(children)
  # Leading whitespace of a text node collapses to one space; the rest of the
  # string is kept untouched.
  defp node_text(str) when is_binary(str), do: Regex.replace(~r/\A\s+/m, str, " ")
  # Comments and other non-text nodes contribute nothing.
  defp node_text(_other), do: ""

  # Turns each <li> into a bulleted paragraph. Anything else between the items
  # (whitespace text, comments, stray elements) is skipped by the generator
  # pattern instead of raising.
  defp list(items) do
    for {"li", _attrs, children} <- items do
      [:p, [], [@bullet | flatten(children)]]
    end
  end

  # Looks up attribute `name` in a list of `{name, value}` pairs, returning
  # `default` when it is absent.
  defp attr(attrs, name, default \\ nil) do
    case List.keyfind(attrs, name, 0) do
      {^name, value} -> value
      _ -> default
    end
  end
end