new parser

This commit is contained in:
2022-08-01 15:58:55 +02:00
parent e0fd6051e3
commit 1cb97fa7f7
8 changed files with 378 additions and 3 deletions

View File

@ -0,0 +1,188 @@
defmodule SomethingErlang.AwfulApi do
require Logger
alias SomethingErlang.AwfulApi.Client
alias SomethingErlang.AwfulApi.Thread
@doc """
Extracts the author details shown next to each post of `thread`.

`thread` is a Floki HTML tree. Every `td.userinfo dl.userinfo` element
yields exactly one map, in document order, holding whichever of these keys
could be found inside it:

  * `:name` - text of a `dt` whose first attribute is `class`
  * `:regdate` - text of a `dd` whose first attribute is `class="registered"`
  * `:title` - raw HTML of the children of a `dd` whose first attribute is
    `class="title"`

Each `dl` is parsed on its own, so a post that lacks one of the fields only
misses that key; fields never leak into the map of a neighbouring post and
the number of maps always equals the number of matched `dl` elements.
"""
def parse_thread_userinfo(thread) do
  thread
  |> Floki.find("td.userinfo dl.userinfo")
  |> Enum.map(&userinfo_from_dl/1)
end

# Collects the name/regdate/title fields found inside a single `dl.userinfo`.
defp userinfo_from_dl(dl) do
  {_tree, fields} =
    Floki.traverse_and_update([dl], [], fn
      {"dt", [{"class", _class} | _rest], children}, acc ->
        {nil, [{:name, Floki.text(children)} | acc]}

      {"dd", [{"class", "registered"} | _rest], children}, acc ->
        {nil, [{:regdate, Floki.text(children)} | acc]}

      {"dd", [{"class", "title"} | _rest], children}, acc ->
        {nil, [{:title, Floki.raw_html(children)} | acc]}

      other, acc ->
        {other, acc}
    end)

  # `fields` is in reverse document order; should a key occur twice, the
  # pair that comes last in the list (first in the document) is the one kept.
  Map.new(fields)
end
@doc """
Returns the text content of every `td.postdate` cell in `thread`, in
document order.

`thread` is a Floki HTML tree. Only cells whose attribute list is exactly
`class="postdate"` are consumed; if anything is left over after the walk the
`{[], _}` match fails and a `MatchError` is raised.
"""
def parse_thread_postdate(thread) do
  collect_date = fn
    {"td", [{"class", "postdate"}], children}, dates ->
      {nil, [Floki.text(children) | dates]}

    node, dates ->
      {node, dates}
  end

  {[], newest_first} =
    thread
    |> Floki.find("td.postdate")
    |> Floki.traverse_and_update([], collect_date)

  Enum.reverse(newest_first)
end
@doc """
Returns the HTML of every `td.postbody` cell in `thread`, in document order.

`thread` is a Floki HTML tree. The cells are walked twice: the first walk
rewrites `img` and `a` elements through `transform/3` and drops comments,
the second walk renders the children of each cell with `Floki.raw_html/1`.
Only cells whose attribute list is exactly `class="postbody"` are consumed;
anything left over makes the `{[], _}` match fail with a `MatchError`.
"""
def parse_thread_postbody(thread) do
  rewritten =
    thread
    |> Floki.find("td.postbody")
    |> Floki.traverse_and_update(&rewrite_post_node/1)

  # TODO: use Floki find or smth?
  {[], last_first} = Floki.traverse_and_update(rewritten, [], &collect_postbody/2)

  Enum.reverse(last_first)
end

# First walk: rewrite childless images and all links, remove comments.
defp rewrite_post_node({"img", attrs, []}), do: transform(:img, attrs)
defp rewrite_post_node({"a", attrs, children}), do: transform(:a, attrs, children)
defp rewrite_post_node({:comment, _}), do: nil
defp rewrite_post_node(other), do: other

# Second walk: swap each post body cell for its rendered HTML in the accumulator.
defp collect_postbody({"td", [{"class", "postbody"}], children}, acc) do
  {nil, [Floki.raw_html(children) | acc]}
end

defp collect_postbody(other, acc), do: {other, acc}
# Rewrites one `img` or `a` element taken from a post body and returns the
# replacement node.
#
#   * `:img` - smilies (`class="sa-smilie"`) are returned unchanged. Every
#     other image gets `class="img-responsive"` (replacing an existing class,
#     or added when the tag had none) plus `loading="lazy"`.
#   * `:a` - links whose href starts with "/" and links without a known embed
#     are re-emitted with only their `href` attribute; .mp4, .gifv and YouTube
#     links are replaced by the markup built in `transform_link/2`.
#
# `children` defaults to `[]` so childless elements can be passed without it.
defp transform(elem, attr, children \\ [])

defp transform(:img, attrs, _children) do
  case List.keyfind(attrs, "class", 0) do
    {"class", "sa-smilie"} ->
      {"img", attrs, []}

    _other_or_missing ->
      # keystore (not keyreplace) so that an image without any class
      # attribute still ends up with the responsive class.
      t_attrs = List.keystore(attrs, "class", 0, {"class", "img-responsive"})
      {"img", [{"loading", "lazy"} | t_attrs], []}
  end
end

defp transform(:a, attrs, children) do
  # A missing href is treated as "" and falls through to the default branch.
  {"href", href} = List.keyfind(attrs, "href", 0, {"href", ""})

  cond do
    # skip internal links
    String.starts_with?(href, "/") ->
      {"a", [{"href", href}], children}

    # mp4
    String.ends_with?(href, ".mp4") ->
      transform_link(:mp4, href)

    # gifv
    String.ends_with?(href, ".gifv") ->
      transform_link(:gifv, href)

    # youtube
    String.starts_with?(href, "https://www.youtube.com/watch") ->
      transform_link(:ytlong, href)

    String.starts_with?(href, "https://youtu.be/") ->
      transform_link(:ytshort, href)

    true ->
      Logger.debug("no transform for #{href}")
      {"a", [{"href", href}], children}
  end
end
# Builds the replacement markup for a link that should be shown as an
# embedded player. The first argument names the kind of link, `href` is the
# link target. Every clause returns a `div.responsive-embed` node.
defp transform_link(:mp4, href) do
  video_embed([{href, "video/mp4"}])
end

defp transform_link(:gifv, href) do
  # Only the trailing extension is swapped; a ".gifv" occurring earlier in the
  # URL (host name, directory) must stay as it is.
  video_embed([
    {String.replace_suffix(href, ".gifv", ".webm"), "video/webm"},
    {String.replace_suffix(href, ".gifv", ".mp4"), "video/mp4"}
  ])
end

defp transform_link(:ytlong, href) do
  src =
    # Pick the video id out of the query string wherever it appears, so extra
    # parameters ("&t=42s") or a different parameter order do not end up in
    # the embed path.
    case Regex.run(~r/[?&]v=([^&#]+)/, href, capture: :all_but_first) do
      [video_id] -> "https://www.youtube.com/embed/" <> video_id
      nil -> String.replace(href, "/watch?v=", "/embed/")
    end

  youtube_iframe(src)
end

defp transform_link(:ytshort, href) do
  href
  |> String.replace("youtu.be/", "www.youtube.com/embed/")
  |> youtube_iframe()
end

# Wraps a `video` element with one `source` child per `{src, mime_type}` pair.
defp video_embed(sources) do
  source_nodes =
    for {src, type} <- sources do
      {"source", [{"src", src}, {"type", type}], []}
    end

  {"div", [{"class", "responsive-embed"}],
   [{"video", [{"class", "img-responsive"}, {"controls", ""}], source_nodes}]}
end

# Wraps a lazily loaded, fullscreen-capable iframe pointing at `src`.
defp youtube_iframe(src) do
  iframe_attrs = [
    {"class", "youtube-player"},
    {"loading", "lazy"},
    {"allow", "fullscreen"},
    {"src", src}
  ]

  {"div", [{"class", "responsive-embed"}], [{"iframe", iframe_attrs, []}]}
end
@doc """
Compiles page `page` of the thread `id` for `user`.

This is a thin wrapper: all three arguments are handed to
`Thread.compile/3` and its result is returned unchanged. Judging by the
example below the result exposes `posts` and `page_count` - confirm against
`Thread.compile/3`.

## Examples

The values are illustrative, so the example is deliberately written without
`iex>` prompts and is not picked up as a doctest.

    t = AwfulApi.parsed_thread(3945300, 1, user)
    length(t.posts)
    #=> 42
    t.page_count
    #=> 12
"""
def parsed_thread(id, page, user) do
Thread.compile(id, page, user)
end
@doc """
Inline implementation of thread parsing, kept behind the `:deprecated` tag.

Fetches the page with `Client.thread_doc/3`, parses it with Floki, drops
posts marked `table.post.ignored` and returns a map with:

  * `:id` and `:page` - the arguments as given
  * `:title` - text of the `title` element without the
    " - The Something Awful Forums" part
  * `:page_count` - number read from the last option of the top page
    selector, or `1` when there is none
  * `:posts` - list of `%{userinfo: _, postdate: _, postbody: _}` maps built
    from `parse_thread_userinfo/1`, `parse_thread_postdate/1` and
    `parse_thread_postbody/1`

Raises if the document cannot be parsed or the page count is not an integer.
"""
def parsed_thread(id, page, user, :deprecated) do
  html =
    id
    |> Client.thread_doc(page, user)
    |> Floki.parse_document!()

  thread =
    html
    |> Floki.find("#thread")
    |> Floki.filter_out("table.post.ignored")

  # Diagnostic peek at the first posts. Goes through Logger (lazily) instead
  # of printing to stdout on every call.
  Logger.debug(fn ->
    thread |> Floki.find("table.post") |> Enum.take(2) |> inspect()
  end)

  title =
    html
    |> Floki.find("title")
    |> Floki.text()
    |> String.replace(" - The Something Awful Forums", "")

  last_page_label =
    html
    |> Floki.find("#content .pages.top option:last-of-type")
    |> Floki.text()
    |> String.trim()

  page_count =
    case last_page_label do
      # No page selector on the page: the thread has a single page.
      "" -> 1
      label -> String.to_integer(label)
    end

  posts =
    [
      parse_thread_userinfo(thread),
      parse_thread_postdate(thread),
      parse_thread_postbody(thread)
    ]
    |> Enum.zip()
    |> Enum.map(fn {ui, pd, pb} ->
      %{:userinfo => ui, :postdate => pd, :postbody => pb}
    end)

  %{id: id, title: title, page: page, page_count: page_count, posts: posts}
end
end